-
-
Save acdha/3750774 to your computer and use it in GitHub Desktop.
# encoding: utf-8 | |
"""Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0""" | |
# NOTE: You must be running the latest Pysolr master - no PyPI release yet! | |
# See https://gist.github.com/3750774 for the current version of this code | |
# See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation | |
from __future__ import absolute_import | |
import logging | |
from django.db.models.loading import get_model | |
from haystack.backends import EmptyResults | |
from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery | |
from haystack.constants import DJANGO_CT, DJANGO_ID, ID | |
from haystack.models import SearchResult | |
from haystack.query import SearchQuerySet | |
# Since there's no chance of this being portable (yet!) we'll import explicitly | |
# rather than using the generic imports: | |
class GroupedSearchQuery(SolrSearchQuery):
    """Solr search query which can ask Solr to group results on a field."""

    def __init__(self, *args, **kwargs):
        super(GroupedSearchQuery, self).__init__(*args, **kwargs)
        # Filled in by post_process_facets after the query has been executed
        self._total_document_count = None
        # Grouping stays disabled until add_group_by() is called
        self.grouping_field = None

    def _clone(self, **kwargs):
        """Return a copy of this query which keeps the grouping field."""
        duplicate = super(GroupedSearchQuery, self)._clone(**kwargs)
        duplicate.grouping_field = self.grouping_field
        return duplicate

    def add_group_by(self, field_name):
        """Record the field name which results should be grouped on."""
        self.grouping_field = field_name

    def post_process_facets(self, results):
        """Capture the total document count, then defer to the parent class."""
        # FIXME: remove this hack once https://github.com/toastdriven/django-haystack/issues/750 lands
        # See matches dance in _process_results below:
        # 'hits' takes precedence over 'matches'; fall back to 0 when neither is present
        self._total_document_count = next(
            (int(results[name]) for name in ('hits', 'matches') if name in results),
            0)

        return super(GroupedSearchQuery, self).post_process_facets(results)

    def get_total_document_count(self):
        """Return the number of matching documents (not document groups)

        The query is executed first when no count has been stored yet.
        """
        if self._total_document_count is not None:
            return self._total_document_count

        self.run()
        return self._total_document_count

    def build_params(self, *args, **kwargs):
        """Return the parent's search params plus the Solr grouping params.

        The grouping parameters are only added when a grouping field was set.
        """
        params = super(GroupedSearchQuery, self).build_params(*args, **kwargs)

        if self.grouping_field is None:
            return params

        params['group'] = 'true'
        params['group.field'] = self.grouping_field
        params['group.ngroups'] = 'true'
        params['group.limit'] = 2  # TODO: Don't hard-code this
        params['group.sort'] = 'django_ct desc, score desc'
        params['group.facet'] = 'true'
        params['result_class'] = GroupedSearchResult

        return params
class GroupedSearchResult(object):
    """A single Solr result group: the group key, its hit count and documents.

    Attributes set by ``__init__``:
        field_name: the name of the field the results were grouped on
        key: the ``groupValue`` reported for this group
        hits: the ``numFound`` value of the group's ``doclist``
        documents: list of ``SearchResult`` objects built from the group's docs
    """

    def __init__(self, field_name, group_data, raw_results=None):
        """Build the group from one entry of a grouped Solr response.

        ``group_data`` must provide ``groupValue`` and a ``doclist`` with
        ``numFound`` and ``docs``. ``raw_results`` is optional and is only
        consulted for a ``highlighting`` attribute.
        """
        self.field_name = field_name
        self.key = group_data['groupValue']  # TODO: convert _to_python
        self.hits = group_data['doclist']['numFound']
        self.documents = list(self.process_documents(group_data['doclist']['docs'],
                                                     raw_results=raw_results))

    def __unicode__(self):
        # The group value is stored as ``key``; there is no ``group_key`` attribute
        return 'GroupedSearchResult({0.field_name}={0.key}, hits={0.hits})'.format(self)

    def process_documents(self, doclist, raw_results):
        """Yield a ``SearchResult`` for every document of an indexed model.

        Documents whose model cannot be resolved, or whose model is not part
        of the unified index, are skipped.
        """
        # TODO: tame import spaghetti
        from haystack import connections

        # NOTE(review): the connection alias "en" is hard-coded - confirm it
        # matches the Haystack connection this backend is configured under
        engine = connections["en"]
        conn = engine.get_backend().conn

        unified_index = engine.get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        # ``raw_results`` may be None or lack highlighting data entirely
        highlighting = getattr(raw_results, 'highlighting', {})

        for raw_result in doclist:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                # The index depends only on the model, so look it up once per document
                index = unified_index.get_index(model)

                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = conn._to_python(value)

                # These are passed to SearchResult positionally below
                del(additional_fields[DJANGO_CT])
                del(additional_fields[DJANGO_ID])
                del(additional_fields['score'])

                if raw_result[ID] in highlighting:
                    additional_fields['highlighted'] = highlighting[raw_result[ID]]

                result = SearchResult(app_label, model_name, raw_result[DJANGO_ID],
                                      raw_result['score'], **additional_fields)
                yield result
class GroupedSearchQuerySet(SearchQuerySet):
    """SearchQuerySet which exposes Solr result grouping via group_by()."""

    def __init__(self, *args, **kwargs):
        super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)

        if isinstance(self.query, GroupedSearchQuery):
            return

        raise TypeError("GroupedSearchQuerySet must be used with a GroupedSearchQuery query")

    def group_by(self, field_name):
        """Return a clone whose query groups results on the given field"""
        grouped = self._clone()
        grouped.query.add_group_by(field_name)
        return grouped

    def post_process_results(self, results):
        """Return the results untouched, bypassing the model-specific processing"""
        return results

    def total_document_count(self):
        """Return how many documents matched, as opposed to how many groups

        Counting a GroupedSearchQuerySet normally reports the number of
        document groups; this reports the number of matching documents, which
        is handy for making facet counts match the displayed numbers.
        """
        # Use a clone when this query has not been run yet
        query = self.query if self.query.has_run() else self._clone().query
        return query.get_total_document_count()
class GroupedSolrSearchBackend(SolrSearchBackend):
    """Solr backend which passes grouping parameters through and parses grouped responses."""

    def build_search_kwargs(self, *args, **kwargs):
        """Build the search kwargs, passing any ``group*`` parameters straight through.

        Keyword arguments whose name starts with "group" are removed before
        calling the parent implementation and merged back into its result.
        When grouping is requested without an explicit ``sort``, a default
        sort is applied.
        """
        # Iterate over a snapshot of the keys: popping while iterating a live
        # key view raises RuntimeError on Python 3
        group_kwargs = [(i, kwargs.pop(i)) for i in list(kwargs) if i.startswith("group")]

        res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)

        res.update(group_kwargs)

        if group_kwargs and 'sort' not in kwargs:
            res['sort'] = 'score desc, item_id asc'

        return res

    def _process_results(self, raw_results, result_class=None, **kwargs):
        """Convert a grouped Solr response into Haystack's result dictionary.

        The parent's result is returned as-is when ``result_class`` is given
        but is not a ``GroupedSearchResult`` subclass, or when ``raw_results``
        is an ``EmptyResults``. Otherwise ``results`` holds one
        ``result_class`` instance per group, ``hits`` the number of groups and
        ``matches`` the value Solr reported as ``matches``.

        Raises RuntimeError if the response contains plain docs.
        """
        res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
                                                                     result_class=result_class,
                                                                     **kwargs)

        if result_class and not issubclass(result_class, GroupedSearchResult):
            return res

        if len(raw_results.docs):
            raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")

        assert not res['results']
        assert not res['hits']

        if isinstance(raw_results, EmptyResults):
            return res

        assert len(raw_results.grouped) == 1, "Grouping on more than one field is not supported"

        res['results'] = results = []
        for field_name, field_group in raw_results.grouped.items():
            res['hits'] = field_group['ngroups']
            res['matches'] = field_group['matches']
            for group in field_group['groups']:
                if group['groupValue'] is None:
                    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
                    res['hits'] -= 1  # Avoid confusing Haystack with excluded bogon results
                    continue
                results.append(result_class(field_name, group, raw_results=raw_results))

        return res
class GroupedSolrEngine(SolrEngine):
    """Haystack engine pairing the grouped search backend with the grouped query."""
    query = GroupedSearchQuery
    backend = GroupedSolrSearchBackend
@mynameistechno I've been using it in production since last November: http://chris.improbable.org/2014/3/17/content-search-on-a-budget/
It's not feature complete with all of the other backends but it handles the subset of functionality which I need.
Do you have any tutorials or documentation on how to implement this? I'm fairly new to haystack/solr and can't seem to figure out where/how I should be using these.
Hey!
I also need a tutorial on how to implement this. Where should I add this code?
Thanks in advance!
@TippyTipster, @Pablo1990: belated reply since I never saw a notification for your comments but for the record: I have that in a separate module (e.g. my_project/search/grouped.py)
and my Django settings module has 'ENGINE': 'my_project.search.grouped'
in the Haystack connection configuration following the examples at http://django-haystack.readthedocs.org/en/v2.4.1/tutorial.html#modify-your-settings-py.
Anyone who finds this of interest but wants to use the newer / maybe-faster Collapsing filter and Expand component: https://gist.github.com/acdha/0a66ca23984bc8d607936fecd9c29941
Hi @acdha, how would you sort search results using this? Applying order_by on the search query set does not seem to work. Thank you.
Hey @acdha, this looks great, have you been using it? I.e. how stable is this for production use? Thanks