Skip to content

Instantly share code, notes, and snippets.

@acdha
Last active December 14, 2017 13:45
Show Gist options
  • Save acdha/3750774 to your computer and use it in GitHub Desktop.
Save acdha/3750774 to your computer and use it in GitHub Desktop.

Revisions

  1. acdha revised this gist Jan 3, 2014. 1 changed file with 15 additions and 9 deletions.
    24 changes: 15 additions & 9 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -1,24 +1,25 @@
    # encoding: utf-8
    """Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
    # NOTE: Requires pysolr 3.0.6+
    # NOTE: You must be running the latest Pysolr master - no PyPI release yet!
    # See https://gist.github.com/3750774 for the current version of this code
    # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
    from __future__ import absolute_import

    import logging

    from django.db.models.loading import get_model

    from haystack.constants import ID, DJANGO_CT, DJANGO_ID
    from haystack.backends import EmptyResults
    from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
    from haystack.constants import DJANGO_CT, DJANGO_ID, ID
    from haystack.models import SearchResult
    from haystack.query import SearchQuerySet
    from haystack.backends import EmptyResults

    # Since there's no chance of this being portable (yet!) we'll import explicitly
    # rather than using the generic imports:
    from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery


    class GroupedSearchQuery(SolrSearchQuery):

    def __init__(self, *args, **kwargs):
    super(GroupedSearchQuery, self).__init__(*args, **kwargs)
    self.grouping_field = None
    @@ -37,10 +38,10 @@ def post_process_facets(self, results):
    # See matches dance in _process_results below:
    total = 0

    if 'matches' in results:
    total = int(results['matches'])
    elif 'hits' in results:
    if 'hits' in results:
    total = int(results['hits'])
    elif 'matches' in results:
    total = int(results['matches'])

    self._total_document_count = total

    @@ -63,13 +64,14 @@ def build_params(self, *args, **kwargs):
    'group.field': self.grouping_field,
    'group.ngroups': 'true',
    'group.limit': 2, # TODO: Don't hard-code this
    'group.sort': 'score desc',
    'group.sort': 'django_ct desc, score desc',
    'group.facet': 'true',
    'result_class': GroupedSearchResult})
    return res


    class GroupedSearchResult(object):

    def __init__(self, field_name, group_data, raw_results={}):
    self.field_name = field_name
    self.key = group_data['groupValue'] # TODO: convert _to_python
    @@ -117,6 +119,7 @@ def process_documents(self, doclist, raw_results):


    class GroupedSearchQuerySet(SearchQuerySet):

    def __init__(self, *args, **kwargs):
    super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)

    @@ -148,12 +151,15 @@ def total_document_count(self):


    class GroupedSolrSearchBackend(SolrSearchBackend):

    def build_search_kwargs(self, *args, **kwargs):
        """Build the search kwargs, passing ``group*`` parameters straight through.

        Keyword arguments whose names start with ``"group"`` are removed from
        ``kwargs`` before the parent implementation is called and are merged
        into the parent's result afterwards.

        When at least one group parameter was supplied and no ``sort`` keyword
        was given, ``sort`` is set to ``'score desc, item_id asc'``.

        Returns the dictionary produced by the parent, updated as described.
        """
        # Iterate over a snapshot of the keys: popping from the dict while
        # iterating its live key view raises RuntimeError on Python 3.
        group_kwargs = [(key, kwargs.pop(key)) for key in list(kwargs) if key.startswith("group")]

        res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)

        res.update(group_kwargs)
        if group_kwargs and 'sort' not in kwargs:
            res['sort'] = 'score desc, item_id asc'

        return res

  2. acdha revised this gist Mar 11, 2013. 1 changed file with 43 additions and 13 deletions.
    56 changes: 43 additions & 13 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    # encoding: utf-8
    """Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
    # NOTE: You must be running the latest Pysolr master - no PyPI release yet!
    # NOTE: Requires pysolr 3.0.6+
    # See https://gist.github.com/3750774 for the current version of this code
    # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
    from __future__ import absolute_import
    @@ -22,6 +22,7 @@ class GroupedSearchQuery(SolrSearchQuery):
    def __init__(self, *args, **kwargs):
        """Create the query with no grouping field and no cached document total."""
        super(GroupedSearchQuery, self).__init__(*args, **kwargs)
        # Filled in by post_process_facets() once results are available
        self._total_document_count = None
        # Set through add_group_by()
        self.grouping_field = None

    def _clone(self, **kwargs):
    clone = super(GroupedSearchQuery, self)._clone(**kwargs)
    @@ -31,6 +32,30 @@ def _clone(self, **kwargs):
    def add_group_by(self, field_name):
    """Store ``field_name`` as the field to group results on."""
    self.grouping_field = field_name

    def post_process_facets(self, results):
        """Cache the total document count, then defer to the parent implementation.

        The count is read from ``results['matches']`` when that key is present,
        otherwise from ``results['hits']``; it is 0 when neither key exists.
        """
        # FIXME: remove this hack once https://github.com/toastdriven/django-haystack/issues/750 lands
        # See matches dance in _process_results below:
        for count_key in ('matches', 'hits'):
            if count_key in results:
                self._total_document_count = int(results[count_key])
                break
        else:
            self._total_document_count = 0

        return super(GroupedSearchQuery, self).post_process_facets(results)

    def get_total_document_count(self):
        """Return how many documents matched, as opposed to how many groups.

        The query is executed first (via ``run()``) when no total has been
        stored yet.
        """
        total_is_unknown = self._total_document_count is None
        if total_is_unknown:
            self.run()

        return self._total_document_count

    def build_params(self, *args, **kwargs):
    res = super(GroupedSearchQuery, self).build_params(*args, **kwargs)
    if self.grouping_field is not None:
    @@ -39,6 +64,7 @@ def build_params(self, *args, **kwargs):
    'group.ngroups': 'true',
    'group.limit': 2, # TODO: Don't hard-code this
    'group.sort': 'score desc',
    'group.facet': 'true',
    'result_class': GroupedSearchResult})
    return res

    @@ -85,16 +111,6 @@ def process_documents(self, doclist, raw_results):
    if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]

    # TODO: restore distance_point processing
    if False: # distance_point:
    additional_fields['_point_of_origin'] = distance_point

    if raw_result.get('__dist__'):
    from haystack.utils.geo import Distance
    additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
    else:
    additional_fields['_distance'] = None

    result = SearchResult(app_label, model_name, raw_result[DJANGO_ID],
    raw_result['score'], **additional_fields)
    yield result
    @@ -117,6 +133,19 @@ def post_process_results(self, results):
    # Override the default model-specific processing
    return results

    def total_document_count(self):
        """Return the number of matching documents instead of the number of groups.

        A GroupedSearchQuerySet normally reports how many document groups were
        found. This exposes the overall document total, which is handy when
        facet counts need to agree with the numbers being displayed.

        If this queryset's query has not run yet, the count is obtained from a
        clone rather than from this queryset.
        """
        source = self if self.query.has_run() else self._clone()
        return source.query.get_total_document_count()


    class GroupedSolrSearchBackend(SolrSearchBackend):
    def build_search_kwargs(self, *args, **kwargs):
    @@ -145,13 +174,14 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
    if isinstance(raw_results, EmptyResults):
    return res

    assert len(raw_results.grouped) == 1
    assert len(raw_results.grouped) == 1, "Grouping on more than one field is not supported"

    res['results'] = results = []
    for field_name, field_group in raw_results.grouped.items():
    res['hits'] = field_group['ngroups']
    res['matches'] = field_group['matches']
    for group in field_group['groups']:
    if group['groupValue'] == None:
    if group['groupValue'] is None:
    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
    res['hits'] -= 1 # Avoid confusing Haystack with excluded bogon results
    continue
  3. acdha revised this gist Nov 21, 2012. 1 changed file with 6 additions and 5 deletions.
    11 changes: 6 additions & 5 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -129,16 +129,16 @@ def build_search_kwargs(self, *args, **kwargs):
    return res

    def _process_results(self, raw_results, result_class=None, **kwargs):
    res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
    result_class=result_class,
    **kwargs)

    if result_class and not issubclass(result_class, GroupedSearchResult):
    raise TypeError("GroupedSolrSearchBackend requires use of the GroupedSearchResult result_class")
    return res

    if len(raw_results.docs):
    raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")

    res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
    result_class=result_class,
    **kwargs)

    assert not res['results']
    assert not res['hits']

    @@ -153,6 +153,7 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
    for group in field_group['groups']:
    if group['groupValue'] == None:
    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
    res['hits'] -= 1 # Avoid confusing Haystack with excluded bogon results
    continue
    results.append(result_class(field_name, group, raw_results=raw_results))

  4. acdha revised this gist Sep 27, 2012. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,7 @@
    from haystack.constants import ID, DJANGO_CT, DJANGO_ID
    from haystack.models import SearchResult
    from haystack.query import SearchQuerySet
    from haystack.backends import EmptyResults
    # Since there's no chance of this being portable (yet!) we'll import explicitly
    # rather than using the generic imports:
    from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery
    @@ -140,6 +141,10 @@ def _process_results(self, raw_results, result_class=None, **kwargs):

    assert not res['results']
    assert not res['hits']

    if isinstance(raw_results, EmptyResults):
    return res

    assert len(raw_results.grouped) == 1

    res['results'] = results = []
  5. acdha revised this gist Sep 21, 2012. 1 changed file with 9 additions and 10 deletions.
    19 changes: 9 additions & 10 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -8,9 +8,8 @@
    import logging

    from django.db.models.loading import get_model
    from django.utils.datastructures import SortedDict

    from haystack.constants import DJANGO_CT, DJANGO_ID
    from haystack.constants import ID, DJANGO_CT, DJANGO_ID
    from haystack.models import SearchResult
    from haystack.query import SearchQuerySet
    # Since there's no chance of this being portable (yet!) we'll import explicitly
    @@ -38,22 +37,23 @@ def build_params(self, *args, **kwargs):
    'group.field': self.grouping_field,
    'group.ngroups': 'true',
    'group.limit': 2, # TODO: Don't hard-code this
    'group.sort': 'django_ct asc',
    'group.sort': 'score desc',
    'result_class': GroupedSearchResult})
    return res


    class GroupedSearchResult(object):
    def __init__(self, field_name, group_data):
    def __init__(self, field_name, group_data, raw_results={}):
    self.field_name = field_name
    self.key = group_data['groupValue'] # TODO: convert _to_python
    self.hits = group_data['doclist']['numFound']
    self.documents = list(self.process_documents(group_data['doclist']['docs']))
    self.documents = list(self.process_documents(group_data['doclist']['docs'],
    raw_results=raw_results))

    def __unicode__(self):
    return 'GroupedSearchResult({0.field_name}={0.group_key}, hits={0.hits})'.format(self)

    def process_documents(self, doclist):
    def process_documents(self, doclist, raw_results):
    # TODO: tame import spaghetti
    from haystack import connections
    engine = connections["en"]
    @@ -81,9 +81,8 @@ def process_documents(self, doclist):
    del(additional_fields[DJANGO_ID])
    del(additional_fields['score'])

    # TODO: Add highlighting visibility
    # if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
    # additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
    if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]

    # TODO: restore distance_point processing
    if False: # distance_point:
    @@ -150,7 +149,7 @@ def _process_results(self, raw_results, result_class=None, **kwargs):
    if group['groupValue'] == None:
    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
    continue
    results.append(result_class(field_name, group))
    results.append(result_class(field_name, group, raw_results=raw_results))

    return res

  6. acdha revised this gist Sep 21, 2012. 1 changed file with 15 additions and 19 deletions.
    34 changes: 15 additions & 19 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -36,32 +36,24 @@ def build_params(self, *args, **kwargs):
    if self.grouping_field is not None:
    res.update({'group': 'true',
    'group.field': self.grouping_field,
    'group.limit': 3, # TODO: This should not be hard-coded
    'group.ngroups': 'true',
    'group.sort': 'django_ct desc',
    'group.limit': 2, # TODO: Don't hard-code this
    'group.sort': 'django_ct asc',
    'result_class': GroupedSearchResult})
    return res


    class GroupedSearchResult(object):
    def __init__(self, field_name, group):
    def __init__(self, field_name, group_data):
    self.field_name = field_name
    self.count = group['ngroups']
    self.total_documents = group['matches']
    self.groups = SortedDict()

    for group in group['groups']:
    key = group['groupValue']
    if key is None:
    logging.warning("Skipping empty key value")
    continue
    self.groups[key] = list(self.process_documents(group['doclist']['docs']))
    self.key = group_data['groupValue'] # TODO: convert _to_python
    self.hits = group_data['doclist']['numFound']
    self.documents = list(self.process_documents(group_data['doclist']['docs']))

    def __unicode__(self):
    return 'GroupedSearchResult("{0.field_name}", count={0.count})'.format(self)
    return 'GroupedSearchResult({0.field_name}={0.group_key}, hits={0.hits})'.format(self)

    def process_documents(self, doclist):

    # TODO: tame import spaghetti
    from haystack import connections
    engine = connections["en"]
    @@ -149,12 +141,16 @@ def _process_results(self, raw_results, result_class=None, **kwargs):

    assert not res['results']
    assert not res['hits']
    assert len(raw_results.grouped) == 1

    res['results'] = results = []
    for field_name, groups in raw_results.grouped.items():
    results.append(result_class(field_name, groups))

    res['hits'] = len(results)
    for field_name, field_group in raw_results.grouped.items():
    res['hits'] = field_group['ngroups']
    for group in field_group['groups']:
    if group['groupValue'] == None:
    logging.warning("Unexpected NULL grouping", extra={'data': raw_results})
    continue
    results.append(result_class(field_name, group))

    return res

  7. acdha revised this gist Sep 19, 2012. 1 changed file with 6 additions and 2 deletions.
    8 changes: 6 additions & 2 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,8 @@
    # encoding: utf-8
    # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature information
    """Experimental Solr Grouping / Field Collapsing backend for Haystack 2.0"""
    # NOTE: You must be running the latest Pysolr master - no PyPI release yet!
    # See https://gist.github.com/3750774 for the current version of this code
    # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature documentation
    from __future__ import absolute_import

    import logging
    @@ -33,7 +36,7 @@ def build_params(self, *args, **kwargs):
    if self.grouping_field is not None:
    res.update({'group': 'true',
    'group.field': self.grouping_field,
    'group.limit': 3,
    'group.limit': 3, # TODO: This should not be hard-coded
    'group.ngroups': 'true',
    'group.sort': 'django_ct desc',
    'result_class': GroupedSearchResult})
    @@ -119,6 +122,7 @@ def group_by(self, field_name):
    return clone

    def post_process_results(self, results):
    """Return ``results`` exactly as received."""
    # Override the default model-specific processing
    return results


  8. acdha created this gist Sep 19, 2012.
    160 changes: 160 additions & 0 deletions solr_grouping_backend.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,160 @@
    # encoding: utf-8
    # See http://wiki.apache.org/solr/FieldCollapsing for the Solr feature information
    from __future__ import absolute_import

    import logging

    from django.db.models.loading import get_model
    from django.utils.datastructures import SortedDict

    from haystack.constants import DJANGO_CT, DJANGO_ID
    from haystack.models import SearchResult
    from haystack.query import SearchQuerySet
    # Since there's no chance of this being portable (yet!) we'll import explicitly
    # rather than using the generic imports:
    from haystack.backends.solr_backend import SolrEngine, SolrSearchBackend, SolrSearchQuery


    class GroupedSearchQuery(SolrSearchQuery):
        """Solr search query that can request grouped (field-collapsed) results."""

        def __init__(self, *args, **kwargs):
            super(GroupedSearchQuery, self).__init__(*args, **kwargs)
            # Grouping stays off until add_group_by() supplies a field
            self.grouping_field = None

        def _clone(self, **kwargs):
            """Copy the query and carry the grouping field over to the copy."""
            duplicate = super(GroupedSearchQuery, self)._clone(**kwargs)
            duplicate.grouping_field = self.grouping_field
            return duplicate

        def add_group_by(self, field_name):
            """Store ``field_name`` as the field to group results on."""
            self.grouping_field = field_name

        def build_params(self, *args, **kwargs):
            """Return the parent's parameters, plus grouping options when a field is set."""
            params = super(GroupedSearchQuery, self).build_params(*args, **kwargs)

            if self.grouping_field is None:
                return params

            grouping_options = {
                'group': 'true',
                'group.field': self.grouping_field,
                'group.limit': 3,
                'group.ngroups': 'true',
                'group.sort': 'django_ct desc',
                'result_class': GroupedSearchResult,
            }
            params.update(grouping_options)
            return params


    class GroupedSearchResult(object):
        """Holds the groups returned by Solr for one grouped field.

        Attributes set by ``__init__``:
            field_name: the value passed as ``field_name``.
            count: ``group['ngroups']``.
            total_documents: ``group['matches']``.
            groups: SortedDict mapping each non-None ``groupValue`` to the list
                of SearchResult objects built from that group's documents.
        """

        def __init__(self, field_name, group):
            self.field_name = field_name
            self.count = group['ngroups']
            self.total_documents = group['matches']
            self.groups = SortedDict()

            # A distinct loop name keeps the ``group`` argument from being rebound
            for group_data in group['groups']:
                key = group_data['groupValue']
                if key is None:
                    logging.warning("Skipping empty key value")
                    continue
                self.groups[key] = list(self.process_documents(group_data['doclist']['docs']))

        def __unicode__(self):
            return 'GroupedSearchResult("{0.field_name}", count={0.count})'.format(self)

        def process_documents(self, doclist):
            """Yield a SearchResult for each raw document whose model is indexed.

            Every field of the raw document is converted with the matching index
            field's ``convert`` method when one exists, otherwise with the
            backend connection's ``_to_python``. The content-type, id and score
            entries are passed to SearchResult positionally rather than as extra
            fields. Documents whose model cannot be resolved or is not indexed
            are skipped.
            """
            # TODO: tame import spaghetti
            from haystack import connections
            engine = connections["en"]
            conn = engine.get_backend().conn

            unified_index = engine.get_unified_index()
            indexed_models = unified_index.get_indexed_models()

            for raw_result in doclist:
                app_label, model_name = raw_result[DJANGO_CT].split('.')
                model = get_model(app_label, model_name)

                if not model or model not in indexed_models:
                    continue

                # The index depends only on the model, so fetch it once per
                # document instead of once per field.
                index = unified_index.get_index(model)
                additional_fields = {}

                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = conn._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields['score']

                # TODO: Add highlighting visibility (needs access to the raw
                # Solr response, which is not passed to this method).

                # TODO: restore distance_point processing (``_point_of_origin``
                # and a ``_distance`` built from ``__dist__``). The previous
                # placeholder was disabled with ``if False`` and referenced an
                # undefined ``distance_point`` name, so it has been removed.

                yield SearchResult(app_label, model_name, raw_result[DJANGO_ID],
                                   raw_result['score'], **additional_fields)


    class GroupedSearchQuerySet(SearchQuerySet):
        """SearchQuerySet that must be backed by a GroupedSearchQuery."""

        def __init__(self, *args, **kwargs):
            super(GroupedSearchQuerySet, self).__init__(*args, **kwargs)

            query_supports_grouping = isinstance(self.query, GroupedSearchQuery)
            if not query_supports_grouping:
                raise TypeError("GroupedSearchQuerySet must be used with a GroupedSearchQuery query")

        def group_by(self, field_name):
            """Return a clone whose query asks Solr to group on ``field_name``."""
            grouped = self._clone()
            grouped.query.add_group_by(field_name)
            return grouped

        def post_process_results(self, results):
            """Return ``results`` exactly as received."""
            return results


    class GroupedSolrSearchBackend(SolrSearchBackend):
        """Solr backend that forwards ``group*`` parameters and wraps grouped responses."""

        def build_search_kwargs(self, *args, **kwargs):
            """Build the search kwargs, passing ``group*`` parameters straight through.

            Keyword arguments whose names start with ``"group"`` are removed
            from ``kwargs`` before the parent implementation is called and are
            merged into the parent's result afterwards.
            """
            # Iterate over a snapshot of the keys: popping from the dict while
            # iterating its live key view raises RuntimeError on Python 3.
            group_kwargs = [(key, kwargs.pop(key)) for key in list(kwargs) if key.startswith("group")]

            res = super(GroupedSolrSearchBackend, self).build_search_kwargs(*args, **kwargs)

            res.update(group_kwargs)

            return res

        def _process_results(self, raw_results, result_class=None, **kwargs):
            """Turn a grouped Solr response into a Haystack results dictionary.

            ``result_class`` falls back to GroupedSearchResult when not given.
            One ``result_class`` instance is created per entry in
            ``raw_results.grouped`` and ``hits`` is set to the number of
            instances created.

            Raises:
                TypeError: ``result_class`` is not a GroupedSearchResult subclass.
                RuntimeError: the response contains plain ``docs``.
            """
            if not result_class:
                # The declared default would otherwise end in calling None below
                result_class = GroupedSearchResult
            elif not issubclass(result_class, GroupedSearchResult):
                raise TypeError("GroupedSolrSearchBackend requires use of the GroupedSearchResult result_class")

            if len(raw_results.docs):
                raise RuntimeError("Grouped Solr searches should return grouped elements, not docs!")

            res = super(GroupedSolrSearchBackend, self)._process_results(raw_results,
                                                                         result_class=result_class,
                                                                         **kwargs)

            # Sanity checks: with no docs the parent should have produced nothing
            assert not res['results']
            assert not res['hits']

            res['results'] = results = [result_class(field_name, groups)
                                        for field_name, groups in raw_results.grouped.items()]

            res['hits'] = len(results)

            return res


    class GroupedSolrEngine(SolrEngine):
        """Engine pairing GroupedSearchQuery with GroupedSolrSearchBackend."""

        query = GroupedSearchQuery
        backend = GroupedSolrSearchBackend