acdha · December 14, 2017 13:45 · Jan 3, 2014 · Mar 11, 2013 · Nov 21, 2012 · Sep 27, 2012
diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -1,24 +1,25 @@
 # encoding: utf-8
 """Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
-# NOTE: Requires pysolr 3.0.6+
+# NOTE: You must be running the latest Pysolr master - no PyPI release yet!
 # See https://gist.github.com/3750774 for the current version of this code
 # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
 from __future__ import absolute_import
 
 import logging
 
 from django.db.models.loading import get_model
-
-from haystack.constants import ID, DJANGO_CT, DJANGO_ID
+from haystack.backends import EmptyResults
+from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
+from haystack.constants import DJANGO_CT, DJANGO_ID, ID
 from haystack.models import SearchResult
 from haystack.query import SearchQuerySet
-from haystack.backends import EmptyResults
+
 # Since there's no chance of this being portable (yet!) we'll import explicitly
 # rather than using the generic imports:
-from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
 
 
 class GroupedSearchQuery(SolrSearchQuery):
+
     def __init__(self, *args, **kwargs):
         super(GroupedSearchQuery, self).__init__(*args, **kwargs)
         self.grouping_field = None
@@ -37,10 +38,10 @@ def post_process_facets(self, results):
         # See matches dance in _process_results below:
         total = 0
 
-        if 'matches' in results:
-            total = int(results['matches'])
-        elif 'hits' in results:
+        if 'hits' in results:
             total = int(results['hits'])
+        elif 'matches' in results:
+            total = int(results['matches'])
 
         self._total_document_count = total
 
@@ -63,13 +64,14 @@ def build_params(self, *args, **kwargs):
                         'group.field': self.grouping_field,
                         'group.ngroups': 'true',
                         'group.limit': 2,  # TODO: Don't hard-code this
-                        'group.sort': 'score desc',
+                        'group.sort': 'django_ct desc, score desc',
                         'group.facet': 'true',
                         'result_class': GroupedSearchResult})
         return res
 
 
 class GroupedSearchResult(object):
+
     def __init__(self, field_name, group_data, raw_results={}):
         self.field_name = field_name
         self.key = group_data['groupValue']  # TODO: convert _to_python
@@ -117,6 +119,7 @@ def process_documents(self, doclist, raw_results):
 
 
 class GroupedSearchQuerySet(SearchQuerySet):
+
     def __init__(self, *args, **kwargs):
         super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)
 
@@ -148,12 +151,15 @@ def total_document_count(self):
 
 
 class GroupedSolrSearchBackend(SolrSearchBackend):
+
     def build_search_kwargs(self, *args, **kwargs):
         group_kwargs = [(i, kwargs.pop(i)) for i in kwargs.keys() if i.startswith("group")]
 
         res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)
 
         res.update(group_kwargs)
+        if group_kwargs and 'sort' not in kwargs:
+            res['sort'] = 'score desc, item_id asc'
 
         return res
 

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -1,6 +1,6 @@
 # encoding: utf-8
 """Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
-# NOTE: You must be running the latest Pysolr master - no PyPI release yet!
+# NOTE: Requires pysolr 3.0.6+
 # See https://gist.github.com/3750774 for the current version of this code
 # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
 from __future__ import absolute_import
@@ -22,6 +22,7 @@ class GroupedSearchQuery(SolrSearchQuery):
     def __init__(self, *args, **kwargs):
         super(GroupedSearchQuery, self).__init__(*args, **kwargs)
         self.grouping_field = None
+        self._total_document_count = None
 
     def _clone(self, **kwargs):
         clone = super(GroupedSearchQuery, self)._clone(**kwargs)
@@ -31,6 +32,30 @@ def _clone(self, **kwargs):
     def add_group_by(self, field_name):
         self.grouping_field = field_name
 
+    def post_process_facets(self, results):
+        # FIXME: remove this hack once https://github.com/toastdriven/django-haystack/issues/750 lands
+        # See matches dance in _process_results below:
+        total = 0
+
+        if 'matches' in results:
+            total = int(results['matches'])
+        elif 'hits' in results:
+            total = int(results['hits'])
+
+        self._total_document_count = total
+
+        return super(GroupedSearchQuery, self).post_process_facets(results)
+
+    def get_total_document_count(self):
+        """Return the total number of matching documents rather than document groups
+
+        If the query has not been run, this will execute the query and store the results.
+        """
+        if self._total_document_count is None:
+            self.run()
+
+        return self._total_document_count
+
     def build_params(self, *args, **kwargs):
         res = super(GroupedSearchQuery, self).build_params(*args, **kwargs)
         if self.grouping_field is not None:
@@ -39,6 +64,7 @@ def build_params(self, *args, **kwargs):
                         'group.ngroups': 'true',
                         'group.limit': 2,  # TODO: Don't hard-code this
                         'group.sort': 'score desc',
+                        'group.facet': 'true',
                         'result_class': GroupedSearchResult})
         return res
 
@@ -85,16 +111,6 @@ def process_documents(self, doclist, raw_results):
                 if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                     additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
 
-                # TODO: restore distance_point processing
-                if False:  # distance_point:
-                    additional_fields['_point_of_origin'] = distance_point
-
-                    if raw_result.get('__dist__'):
-                        from haystack.utils.geo import Distance
-                        additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
-                    else:
-                        additional_fields['_distance'] = None
-
                 result = SearchResult(app_label, model_name, raw_result[DJANGO_ID],
                                       raw_result['score'], **additional_fields)
                 yield result
@@ -117,6 +133,19 @@ def post_process_results(self, results):
         # Override the default model-specific processing
         return results
 
+    def total_document_count(self):
+        """Returns the count for the total number of matching documents rather than groups
+
+        A GroupedSearchQuerySet normally returns the number of document groups; this allows
+        you to indicate the total number of matching documents - quite handy for making facet counts match the
+        displayed numbers
+        """
+        if self.query.has_run():
+            return self.query.get_total_document_count()
+        else:
+            clone = self._clone()
+            return clone.query.get_total_document_count()
+
 
 class GroupedSolrSearchBackend(SolrSearchBackend):
     def build_search_kwargs(self, *args, **kwargs):
@@ -145,13 +174,14 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
         if isinstance(raw_results, EmptyResults):
             return res
 
-        assert len(raw_results.grouped) == 1
+        assert len(raw_results.grouped) == 1, "Grouping on more than one field is not supported"
 
         res['results'] = results = []
         for field_name, field_group in raw_results.grouped.items():
             res['hits'] = field_group['ngroups']
+            res['matches'] = field_group['matches']
             for group in field_group['groups']:
-                if group['groupValue'] == None:
+                if group['groupValue'] is None:
                     logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
                     res['hits'] -= 1  # Avoid confusing Haystack with excluded bogon results
                     continue

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -129,16 +129,16 @@ def build_search_kwargs(self, *args, **kwargs):
         return res
 
     def _process_results(self, raw_results, result_class=None, **kwargs):
+        res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
+                                                                     result_class=result_class,
+                                                                     **kwargs)
+
         if result_class and not issubclass(result_class, GroupedSearchResult):
-            raise TypeError("GroupedSolrSearchBackend requires use of the GroupedSearchResult result_class")
+            return res
 
         if len(raw_results.docs):
             raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")
 
-        res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
-                                                                     result_class=result_class,
-                                                                     **kwargs)
-
         assert not res['results']
         assert not res['hits']
 
@@ -153,6 +153,7 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
             for group in field_group['groups']:
                 if group['groupValue'] == None:
                     logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
+                    res['hits'] -= 1  # Avoid confusing Haystack with excluded bogon results
                     continue
                 results.append(result_class(field_name, group, raw_results=raw_results))
 

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -12,6 +12,7 @@
 from haystack.constants import ID, DJANGO_CT, DJANGO_ID
 from haystack.models import SearchResult
 from haystack.query import SearchQuerySet
+from haystack.backends import EmptyResults
 # Since there's no chance of this being portable (yet!) we'll import explicitly
 # rather than using the generic imports:
 from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
@@ -140,6 +141,10 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
 
         assert not res['results']
         assert not res['hits']
+
+        if isinstance(raw_results, EmptyResults):
+            return res
+
         assert len(raw_results.grouped) == 1
 
         res['results'] = results = []

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -8,9 +8,8 @@
 import logging
 
 from django.db.models.loading import get_model
-from django.utils.datastructures import SortedDict
 
-from haystack.constants import DJANGO_CT, DJANGO_ID
+from haystack.constants import ID, DJANGO_CT, DJANGO_ID
 from haystack.models import SearchResult
 from haystack.query import SearchQuerySet
 # Since there's no chance of this being portable (yet!) we'll import explicitly
@@ -38,22 +37,23 @@ def build_params(self, *args, **kwargs):
                         'group.field': self.grouping_field,
                         'group.ngroups': 'true',
                         'group.limit': 2,  # TODO: Don't hard-code this
-                        'group.sort': 'django_ct asc',
+                        'group.sort': 'score desc',
                         'result_class': GroupedSearchResult})
         return res
 
 
 class GroupedSearchResult(object):
-    def __init__(self, field_name, group_data):
+    def __init__(self, field_name, group_data, raw_results={}):
         self.field_name = field_name
         self.key = group_data['groupValue']  # TODO: convert _to_python
         self.hits = group_data['doclist']['numFound']
-        self.documents = list(self.process_documents(group_data['doclist']['docs']))
+        self.documents = list(self.process_documents(group_data['doclist']['docs'],
+                                                     raw_results=raw_results))
 
     def __unicode__(self):
         return 'GroupedSearchResult({0.field_name}={0.group_key}, hits={0.hits})'.format(self)
 
-    def process_documents(self, doclist):
+    def process_documents(self, doclist, raw_results):
         # TODO: tame import spaghetti
         from haystack import connections
         engine = connections["en"]
@@ -81,9 +81,8 @@ def process_documents(self, doclist):
                 del(additional_fields[DJANGO_ID])
                 del(additional_fields['score'])
 
-                # TODO: Add highlighting visibility
-                # if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
-                #     additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
+                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
+                    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
 
                 # TODO: restore distance_point processing
                 if False:  # distance_point:
@@ -150,7 +149,7 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
                 if group['groupValue'] == None:
                     logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
                     continue
-                results.append(result_class(field_name, group))
+                results.append(result_class(field_name, group, raw_results=raw_results))
 
         return res
 

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -36,32 +36,24 @@ def build_params(self, *args, **kwargs):
         if self.grouping_field is not None:
             res.update({'group': 'true',
                         'group.field': self.grouping_field,
-                        'group.limit': 3,  # TODO: This should not be hard-coded
                         'group.ngroups': 'true',
-                        'group.sort': 'django_ct desc',
+                        'group.limit': 2,  # TODO: Don't hard-code this
+                        'group.sort': 'django_ct asc',
                         'result_class': GroupedSearchResult})
         return res
 
 
 class GroupedSearchResult(object):
-    def __init__(self, field_name, group):
+    def __init__(self, field_name, group_data):
         self.field_name = field_name
-        self.count = group['ngroups']
-        self.total_documents = group['matches']
-        self.groups = SortedDict()
-
-        for group in group['groups']:
-            key = group['groupValue']
-            if key is None:
-                logging.warning("Skipping empty key value")
-                continue
-            self.groups[key] = list(self.process_documents(group['doclist']['docs']))
+        self.key = group_data['groupValue']  # TODO: convert _to_python
+        self.hits = group_data['doclist']['numFound']
+        self.documents = list(self.process_documents(group_data['doclist']['docs']))
 
     def __unicode__(self):
-        return 'GroupedSearchResult("{0.field_name}", count={0.count})'.format(self)
+        return 'GroupedSearchResult({0.field_name}={0.group_key}, hits={0.hits})'.format(self)
 
     def process_documents(self, doclist):
-
         # TODO: tame import spaghetti
         from haystack import connections
         engine = connections["en"]
@@ -149,12 +141,16 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
 
         assert not res['results']
         assert not res['hits']
+        assert len(raw_results.grouped) == 1
 
         res['results'] = results = []
-        for field_name, groups in raw_results.grouped.items():
-            results.append(result_class(field_name, groups))
-
-        res['hits'] = len(results)
+        for field_name, field_group in raw_results.grouped.items():
+            res['hits'] = field_group['ngroups']
+            for group in field_group['groups']:
+                if group['groupValue'] == None:
+                    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
+                    continue
+                results.append(result_class(field_name, group))
 
         return res
 

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -1,5 +1,8 @@
 # encoding: utf-8
-# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature information
+"""Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
+# NOTE: You must be running the latest Pysolr master - no PyPI release yet!
+# See https://gist.github.com/3750774 for the current version of this code
+# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
 from __future__ import absolute_import
 
 import logging
@@ -33,7 +36,7 @@ def build_params(self, *args, **kwargs):
         if self.grouping_field is not None:
             res.update({'group': 'true',
                         'group.field': self.grouping_field,
-                        'group.limit': 3,
+                        'group.limit': 3,  # TODO: This should not be hard-coded
                         'group.ngroups': 'true',
                         'group.sort': 'django_ct desc',
                         'result_class': GroupedSearchResult})
@@ -119,6 +122,7 @@ def group_by(self, field_name):
         return clone
 
     def post_process_results(self, results):
+        # Override the default model-specific processing
         return results
 
 

diff --git a/solr_grouping_backend.py b/solr_grouping_backend.py
@@ -0,0 +1,160 @@
+# encoding: utf-8
+# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature information
+from __future__ import absolute_import
+
+import logging
+
+from django.db.models.loading import get_model
+from django.utils.datastructures import SortedDict
+
+from haystack.constants import DJANGO_CT, DJANGO_ID
+from haystack.models import SearchResult
+from haystack.query import SearchQuerySet
+# Since there's no chance of this being portable (yet!) we'll import explicitly
+# rather than using the generic imports:
+from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
+
+
+class GroupedSearchQuery(SolrSearchQuery):
+    def __init__(self, *args, **kwargs):
+        super(GroupedSearchQuery, self).__init__(*args, **kwargs)
+        self.grouping_field = None
+
+    def _clone(self, **kwargs):
+        clone = super(GroupedSearchQuery, self)._clone(**kwargs)
+        clone.grouping_field = self.grouping_field
+        return clone
+
+    def add_group_by(self, field_name):
+        self.grouping_field = field_name
+
+    def build_params(self, *args, **kwargs):
+        res = super(GroupedSearchQuery, self).build_params(*args, **kwargs)
+        if self.grouping_field is not None:
+            res.update({'group': 'true',
+                        'group.field': self.grouping_field,
+                        'group.limit': 3,
+                        'group.ngroups': 'true',
+                        'group.sort': 'django_ct desc',
+                        'result_class': GroupedSearchResult})
+        return res
+
+
+class GroupedSearchResult(object):
+    def __init__(self, field_name, group):
+        self.field_name = field_name
+        self.count = group['ngroups']
+        self.total_documents = group['matches']
+        self.groups = SortedDict()
+
+        for group in group['groups']:
+            key = group['groupValue']
+            if key is None:
+                logging.warning("Skipping empty key value")
+                continue
+            self.groups[key] = list(self.process_documents(group['doclist']['docs']))
+
+    def __unicode__(self):
+        return 'GroupedSearchResult("{0.field_name}", count={0.count})'.format(self)
+
+    def process_documents(self, doclist):
+
+        # TODO: tame import spaghetti
+        from haystack import connections
+        engine = connections["en"]
+        conn = engine.get_backend().conn
+
+        unified_index = engine.get_unified_index()
+        indexed_models = unified_index.get_indexed_models()
+
+        for raw_result in doclist:
+            app_label, model_name = raw_result[DJANGO_CT].split('.')
+            additional_fields = {}
+            model = get_model(app_label, model_name)
+
+            if model and model in indexed_models:
+                for key, value in raw_result.items():
+                    index = unified_index.get_index(model)
+                    string_key = str(key)
+
+                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
+                        additional_fields[string_key] = index.fields[string_key].convert(value)
+                    else:
+                        additional_fields[string_key] = conn._to_python(value)
+
+                del(additional_fields[DJANGO_CT])
+                del(additional_fields[DJANGO_ID])
+                del(additional_fields['score'])
+
+                # TODO: Add highlighting visibility
+                # if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
+                #     additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
+
+                # TODO: restore distance_point processing
+                if False:  # distance_point:
+                    additional_fields['_point_of_origin'] = distance_point
+
+                    if raw_result.get('__dist__'):
+                        from haystack.utils.geo import Distance
+                        additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
+                    else:
+                        additional_fields['_distance'] = None
+
+                result = SearchResult(app_label, model_name, raw_result[DJANGO_ID],
+                                      raw_result['score'], **additional_fields)
+                yield result
+
+
+class GroupedSearchQuerySet(SearchQuerySet):
+    def __init__(self, *args, **kwargs):
+        super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)
+
+        if not isinstance(self.query, GroupedSearchQuery):
+            raise TypeError("GroupedSearchQuerySet must be used with a GroupedSearchQuery query")
+
+    def group_by(self, field_name):
+        """Have Solr group results based on the provided field name"""
+        clone = self._clone()
+        clone.query.add_group_by(field_name)
+        return clone
+
+    def post_process_results(self, results):
+        return results
+
+
+class GroupedSolrSearchBackend(SolrSearchBackend):
+    def build_search_kwargs(self, *args, **kwargs):
+        group_kwargs = [(i, kwargs.pop(i)) for i in kwargs.keys() if i.startswith("group")]
+
+        res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)
+
+        res.update(group_kwargs)
+
+        return res
+
+    def _process_results(self, raw_results, result_class=None, **kwargs):
+        if result_class and not issubclass(result_class, GroupedSearchResult):
+            raise TypeError("GroupedSolrSearchBackend requires use of the GroupedSearchResult result_class")
+
+        if len(raw_results.docs):
+            raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")
+
+        res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
+                                                                     result_class=result_class,
+                                                                     **kwargs)
+
+        assert not res['results']
+        assert not res['hits']
+
+        res['results'] = results = []
+        for field_name, groups in raw_results.grouped.items():
+            results.append(result_class(field_name, groups))
+
+        res['hits'] = len(results)
+
+        return res
+
+
+class GroupedSolrEngine(SolrEngine):
+    backend = GroupedSolrSearchBackend
+    query = GroupedSearchQuery