Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge branch 'main' of https://github.com/docarray/docarray into refactor-hnswlib-performance
  • Loading branch information
Joan Fontanals Martinez committed Jul 26, 2023
commit 02849c46ed6b4c941b71ea20530007364955a782
36 changes: 11 additions & 25 deletions docarray/index/backends/hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import hashlib
import os
import sqlite3
from collections import OrderedDict
from collections import OrderedDict, defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import (
Expand Down Expand Up @@ -34,7 +34,6 @@
)
from docarray.index.backends.helper import (
_collect_query_args,
_execute_find_and_filter_query,
)
from docarray.proto import DocProto, NdArrayProto, NodeProto
from docarray.typing.tensor.abstract_tensor import AbstractTensor
Expand Down Expand Up @@ -309,16 +308,9 @@ def _find_batched(
limit: int,
search_field: str = '',
) -> _FindResultBatched:

index = self._hnsw_indices[search_field]
labels, distances = index.knn_query(queries, k=int(limit))
result_das = [
self._get_docs_sqlite_hashed_id(
ids_per_query.tolist(),
)
for ids_per_query in labels
]
return _FindResultBatched(documents=result_das, scores=distances)
return self._search_and_filter(
queries=queries, limit=limit, search_field=search_field
)

def _find(
self, query: np.ndarray, limit: int, search_field: str = ''
Expand Down Expand Up @@ -633,43 +625,37 @@ def _search_and_filter(
documents and their corresponding scores.
"""
# If there are no documents or hashed_ids is an empty set, return an empty _FindResultBatched
if self.num_docs() == 0 or (hashed_ids is not None and len(hashed_ids) == 0):
if hashed_ids is not None and len(hashed_ids) == 0:
return _FindResultBatched(documents=[], scores=[]) # type: ignore

# Set limit as the minimum of the provided limit and the total number of documents
limit = min(limit, self.num_docs())
limit = limit

# Ensure the search field is in the HNSW indices
if search_field not in self._hnsw_indices:
raise ValueError(
f'Search field {search_field} is not present in the HNSW indices'
)

index = self._hnsw_indices[search_field]

def accept_all(id):
    """Accept every candidate ID unconditionally.

    Chosen as the filter function when `hashed_ids` is empty or not
    provided, so the nearest-neighbour query is effectively unfiltered.

    :param id: the candidate ID handed to the filter (ignored).
    :return: always True.
    """
    return True

def accept_hashed_ids(id):
    """Accept only candidate IDs contained in `hashed_ids`.

    `hashed_ids` is read from the enclosing scope; this predicate is passed
    as the `filter` argument of the index's `knn_query` call when
    `hashed_ids` is non-empty.

    :param id: the candidate ID handed to the filter.
    :return: True if `id` is a member of `hashed_ids`, False otherwise.
    """
    return id in hashed_ids  # type: ignore[operator]

# Choose the appropriate filter function based on whether hashed_ids was provided
filter_function = accept_hashed_ids if hashed_ids else accept_all
extra_kwargs = {}
if hashed_ids:
extra_kwargs['filter'] = accept_hashed_ids

# If hashed_ids is provided, k is the minimum of limit and the length of hashed_ids; else it is limit
k = min(limit, len(hashed_ids)) if hashed_ids else limit

labels, distances = index.knn_query(queries, k=k, filter=filter_function)

index = self._hnsw_indices[search_field]
labels, distances = index.knn_query(queries, k=int(limit), **extra_kwargs)
result_das = [
self._get_docs_sqlite_hashed_id(
ids_per_query.tolist(),
)
for ids_per_query in labels
]

return _FindResultBatched(documents=result_das, scores=distances)

@classmethod
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.