Skip to content

Instantly share code, notes, and snippets.

@acdha
Last active December 14, 2017 13:45
Show Gist options
  • Save acdha/3750774 to your computer and use it in GitHub Desktop.
Save acdha/3750774 to your computer and use it in GitHub Desktop.
Experimental django-haystack 2 backend which adds support for Solr's grouping / field collapsing
# encoding: utf-8
"""Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
# NOTE: You must be running the latest Pysolr master - no PyPI release yet!
# See https://gist.github.com/3750774 for the current version of this code
# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
from __future__ import absolute_import
import logging
from django.db.models.loading import get_model
from haystack.backends import EmptyResults
from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
from haystack.constants import DJANGO_CT, DJANGO_ID, ID
from haystack.models import SearchResult
from haystack.query import SearchQuerySet
# Since there's no chance of this being portable (yet!) we'll import explicitly
# rather than using the generic imports:
class GroupedSearchQuery(SolrSearchQuery):
def __init__(self, *args, **kwargs):
super(GroupedSearchQuery, self).__init__(*args, **kwargs)
self.grouping_field = None
self._total_document_count = None
def _clone(self, **kwargs):
clone = super(GroupedSearchQuery, self)._clone(**kwargs)
clone.grouping_field = self.grouping_field
return clone
def add_group_by(self, field_name):
self.grouping_field = field_name
def post_process_facets(self, results):
# FIXME: remove this hack once https://github.com/toastdriven/django-haystack/issues/750 lands
# See matches dance in _process_results below:
total = 0
if 'hits' in results:
total = int(results['hits'])
elif 'matches' in results:
total = int(results['matches'])
self._total_document_count = total
return super(GroupedSearchQuery, self).post_process_facets(results)
def get_total_document_count(self):
"""Return the total number of matching documents rather than document groups
If the query has not been run, this will execute the query and store the results.
"""
if self._total_document_count is None:
self.run()
return self._total_document_count
def build_params(self, *args, **kwargs):
res = super(GroupedSearchQuery, self).build_params(*args, **kwargs)
if self.grouping_field is not None:
res.update({'group': 'true',
'group.field': self.grouping_field,
'group.ngroups': 'true',
'group.limit': 2, # TODO: Don't hard-code this
'group.sort': 'django_ct desc, score desc',
'group.facet': 'true',
'result_class': GroupedSearchResult})
return res
class GroupedSearchResult(object):
def __init__(self, field_name, group_data, raw_results={}):
self.field_name = field_name
self.key = group_data['groupValue'] # TODO: convert _to_python
self.hits = group_data['doclist']['numFound']
self.documents = list(self.process_documents(group_data['doclist']['docs'],
raw_results=raw_results))
def __unicode__(self):
return 'GroupedSearchResult({0.field_name}={0.group_key}, hits={0.hits})'.format(self)
def process_documents(self, doclist, raw_results):
# TODO: tame import spaghetti
from haystack import connections
engine = connections["en"]
conn = engine.get_backend().conn
unified_index = engine.get_unified_index()
indexed_models = unified_index.get_indexed_models()
for raw_result in doclist:
app_label, model_name = raw_result[DJANGO_CT].split('.')
additional_fields = {}
model = get_model(app_label, model_name)
if model and model in indexed_models:
for key, value in raw_result.items():
index = unified_index.get_index(model)
string_key = str(key)
if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
additional_fields[string_key] = index.fields[string_key].convert(value)
else:
additional_fields[string_key] = conn._to_python(value)
del(additional_fields[DJANGO_CT])
del(additional_fields[DJANGO_ID])
del(additional_fields['score'])
if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
result = SearchResult(app_label, model_name, raw_result[DJANGO_ID],
raw_result['score'], **additional_fields)
yield result
class GroupedSearchQuerySet(SearchQuerySet):
def __init__(self, *args, **kwargs):
super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)
if not isinstance(self.query, GroupedSearchQuery):
raise TypeError("GroupedSearchQuerySet must be used with a GroupedSearchQuery query")
def group_by(self, field_name):
"""Have Solr group results based on the provided field name"""
clone = self._clone()
clone.query.add_group_by(field_name)
return clone
def post_process_results(self, results):
# Override the default model-specific processing
return results
def total_document_count(self):
"""Returns the count for the total number of matching documents rather than groups
A GroupedSearchQuerySet normally returns the number of document groups; this allows
you to indicate the total number of matching documents - quite handy for making facet counts match the
displayed numbers
"""
if self.query.has_run():
return self.query.get_total_document_count()
else:
clone = self._clone()
return clone.query.get_total_document_count()
class GroupedSolrSearchBackend(SolrSearchBackend):
def build_search_kwargs(self, *args, **kwargs):
group_kwargs = [(i, kwargs.pop(i)) for i in kwargs.keys() if i.startswith("group")]
res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)
res.update(group_kwargs)
if group_kwargs and 'sort' not in kwargs:
res['sort'] = 'score desc, item_id asc'
return res
def _process_results(self, raw_results, result_class=None, **kwargs):
res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
result_class=result_class,
**kwargs)
if result_class and not issubclass(result_class, GroupedSearchResult):
return res
if len(raw_results.docs):
raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")
assert not res['results']
assert not res['hits']
if isinstance(raw_results, EmptyResults):
return res
assert len(raw_results.grouped) == 1, "Grouping on more than one field is not supported"
res['results'] = results = []
for field_name, field_group in raw_results.grouped.items():
res['hits'] = field_group['ngroups']
res['matches'] = field_group['matches']
for group in field_group['groups']:
if group['groupValue'] is None:
logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
res['hits'] -= 1 # Avoid confusing Haystack with excluded bogon results
continue
results.append(result_class(field_name, group, raw_results=raw_results))
return res
class GroupedSolrEngine(SolrEngine):
backend = GroupedSolrSearchBackend
query = GroupedSearchQuery
@mynameistechno
Copy link

Hey @acdha, this looks great, have you been using it? I.e. how stable is this for production use? Thanks

@acdha
Copy link
Author

acdha commented Jul 29, 2014

@mynameistechno I've been using it in production since last November: http://chris.improbable.org/2014/3/17/content-search-on-a-budget/

It's not feature complete with all of the other backends but it handles the subset of functionality which I need.

@TippyTipster
Copy link

Do you have any tutorials or documentation on how to implement this? I'm fairly new to haystack/solr and can't seem to figure out where/how I should be using these.

@Pablo1990
Copy link

Hey!

I also need some tutorial on how implement this, where I should add this code?

Thanks in advance!

@acdha
Copy link
Author

acdha commented Feb 29, 2016

@TippyTipster, @Pablo1990: belated reply since I never saw a notification for your comments but for the record: I have that in a separate module (e.g. my_project/search/grouped.py and my Django settings module has 'ENGINE': 'my_project.search.grouped' in the Haystack connection configuration following the examples at http://django-haystack.readthedocs.org/en/v2.4.1/tutorial.html#modify-your-settings-py.

@acdha
Copy link
Author

acdha commented Sep 12, 2016

Anyone who finds this of interest but wants to use the newer / maybe-faster Collapsing filter and Expand component: https://gist.github.com/acdha/0a66ca23984bc8d607936fecd9c29941

@jmiguelv
Copy link

jmiguelv commented Mar 3, 2017

Hi @acdha, how would you sort search results using this? Applying order_by on the search query set does not seem to work. Thank you.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment