Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
chore: apply optim
Signed-off-by: Joan Fontanals Martinez <[email protected]>
  • Loading branch information
Joan Fontanals Martinez committed Jul 28, 2023
commit 8c0bf6cf536c99b0e7dee665822cc210018ffd19
31 changes: 21 additions & 10 deletions docarray/index/backends/hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,11 @@ def __init__(self, db_config=None, **kwargs):
if col.config
}
self._hnsw_indices = {}
sub_docs_exist = False
cosine_metric_index_exist = False
for col_name, col in self._column_infos.items():
if '__' in col_name:
sub_docs_exist = True
if safe_issubclass(col.docarray_type, AnyDocArray):
continue
if not col.config:
Expand All @@ -128,7 +132,12 @@ def __init__(self, db_config=None, **kwargs):
else:
self._hnsw_indices[col_name] = self._create_index(col_name, col)
self._logger.info(f'Created a new index for column `{col_name}`')
if self._hnsw_indices[col_name].space == 'cosine':
cosine_metric_index_exist = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we care about this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because for cosine, HNSWLib normalizes the vectors, and then if we retrieve them, they have changed, so no consistent API can be provided


self._apply_optim_no_embedding_in_sqlite = (
not sub_docs_exist and not cosine_metric_index_exist
) # optimization consisting in not serializing embeddings to SQLite because they are expensive to send and they can be reconstructed from the HNSW index itself.
# SQLite setup
self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db')
self._logger.debug(f'DB path set to {self._sqlite_db_path}')
Expand Down Expand Up @@ -565,9 +574,10 @@ def _get_num_docs_sqlite(self) -> int:
# serialization helpers
def _doc_to_bytes(self, doc: BaseDoc) -> bytes:
    """Serialize ``doc`` to bytes for storage in the SQLite backend.

    :param doc: the document to serialize.
    :return: the protobuf-serialized bytes of the document.

    When the no-embedding optimization is active
    (``self._apply_optim_no_embedding_in_sqlite``), every column that is
    indexed in HNSW is cleared from the protobuf before serialization:
    embeddings are expensive to store in SQLite and can be reconstructed
    from the HNSW index itself.
    """
    pb = doc.to_protobuf()
    if self._apply_optim_no_embedding_in_sqlite:
        for col_name in self._hnsw_indices.keys():
            # Drop the embedding payload; it is recoverable from the
            # HNSW index, so there is no need to persist it twice.
            # NOTE(review): the original diff called Clear() twice on the
            # same field — a no-op duplicate, removed here.
            pb.data[col_name].Clear()
    return pb.SerializeToString()

def _doc_from_bytes(
Expand All @@ -578,13 +588,14 @@ def _doc_from_bytes(
pb = DocProto.FromString(
data
) # I cannot reconstruct directly the DA object because it may fail at validation because embedding may not be Optional
for k, v in reconstruct_embeddings.items():
node_proto = (
self.out_schema.__fields__[k]
.type_._docarray_from_ndarray(np.array(v))
._to_node_protobuf()
)
pb.data[k].MergeFrom(node_proto)
if self._apply_optim_no_embedding_in_sqlite:
for k, v in reconstruct_embeddings.items():
node_proto = (
schema_cls._get_field_type(k)
._docarray_from_ndarray(np.array(v))
._to_node_protobuf()
)
pb.data[k].MergeFrom(node_proto)

doc = schema_cls.from_protobuf(pb)
return doc
Expand Down
2 changes: 1 addition & 1 deletion tests/index/hnswlib/test_index_get_del.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,5 +410,5 @@ class TextSimpleDoc(SimpleDoc):
for doc in res.documents:
if doc.id == docs[0].id:
found = True
assert (doc.tens == new_tensor).all()
assert np.allclose(doc.tens, new_tensor)
assert found