Merge branch 'main' of https://github.com/docarray/docarray into refactor-hnswlib-performance
docarray · JoanFM · Jul 31, 2023 · Jul 26, 2023 · Jul 26, 2023 · Jul 27, 2023
commit 02849c46ed6b4c941b71ea20530007364955a782
diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py
@@ -2,7 +2,7 @@
 import hashlib
 import os
 import sqlite3
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import (
@@ -34,7 +34,6 @@
 )
 from docarray.index.backends.helper import (
     _collect_query_args,
-    _execute_find_and_filter_query,
 )
 from docarray.proto import DocProto, NdArrayProto, NodeProto
 from docarray.typing.tensor.abstract_tensor import AbstractTensor
@@ -309,16 +308,9 @@ def _find_batched(
         limit: int,
         search_field: str = '',
     ) -> _FindResultBatched:
-
-        index = self._hnsw_indices[search_field]
-        labels, distances = index.knn_query(queries, k=int(limit))
-        result_das = [
-            self._get_docs_sqlite_hashed_id(
-                ids_per_query.tolist(),
-            )
-            for ids_per_query in labels
-        ]
-        return _FindResultBatched(documents=result_das, scores=distances)
+        return self._search_and_filter(
+            queries=queries, limit=limit, search_field=search_field
+        )
 
     def _find(
         self, query: np.ndarray, limit: int, search_field: str = ''
@@ -633,43 +625,37 @@ def _search_and_filter(
             documents and their corresponding scores.
         """
         # If there are no documents or hashed_ids is an empty set, return an empty _FindResultBatched
-        if self.num_docs() == 0 or (hashed_ids is not None and len(hashed_ids) == 0):
+        if hashed_ids is not None and len(hashed_ids) == 0:
             return _FindResultBatched(documents=[], scores=[])  # type: ignore
 
         # Set limit as the minimum of the provided limit and the total number of documents
-        limit = min(limit, self.num_docs())
+        limit = limit
 
         # Ensure the search field is in the HNSW indices
         if search_field not in self._hnsw_indices:
             raise ValueError(
                 f'Search field {search_field} is not present in the HNSW indices'
             )
 
-        index = self._hnsw_indices[search_field]
-
-        def accept_all(id):
-            """Accepts all IDs."""
-            return True
-
         def accept_hashed_ids(id):
             """Accepts IDs that are in hashed_ids."""
             return id in hashed_ids  # type: ignore[operator]
 
         # Choose the appropriate filter function based on whether hashed_ids was provided
-        filter_function = accept_hashed_ids if hashed_ids else accept_all
+        extra_kwargs = {}
+        if hashed_ids:
+            extra_kwargs['filter'] = accept_hashed_ids
 
         # If hashed_ids is provided, k is the minimum of limit and the length of hashed_ids; else it is limit
         k = min(limit, len(hashed_ids)) if hashed_ids else limit
-
-        labels, distances = index.knn_query(queries, k=k, filter=filter_function)
-
+        index = self._hnsw_indices[search_field]
+        labels, distances = index.knn_query(queries, k=int(limit), **extra_kwargs)
         result_das = [
             self._get_docs_sqlite_hashed_id(
                 ids_per_query.tolist(),
             )
             for ids_per_query in labels
         ]
-
         return _FindResultBatched(documents=result_das, scores=distances)
 
     @classmethod