Skip to content

Commit fdfcb56

Browse files
author
Joan Fontanals Martinez
committed
chore: apply optim
Signed-off-by: Joan Fontanals Martinez <[email protected]>
1 parent 1c8337e commit fdfcb56

File tree

1 file changed

+22
-10
lines changed

1 file changed

+22
-10
lines changed

docarray/index/backends/hnswlib.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,12 @@ def __init__(self, db_config=None, **kwargs):
108108
if col.config
109109
}
110110
self._hnsw_indices = {}
111+
self._apply_optim_no_embedding_in_sqlite = True
112+
sub_docs_exist = False
113+
cosine_metric_index_exist = False
111114
for col_name, col in self._column_infos.items():
115+
if '__' in col_name:
116+
sub_docs_exist = True
112117
if safe_issubclass(col.docarray_type, AnyDocArray):
113118
continue
114119
if not col.config:
@@ -128,7 +133,12 @@ def __init__(self, db_config=None, **kwargs):
128133
else:
129134
self._hnsw_indices[col_name] = self._create_index(col_name, col)
130135
self._logger.info(f'Created a new index for column `{col_name}`')
136+
if self._hnsw_indices[col_name].space == 'cosine':
137+
cosine_metric_index_exist = True
131138

139+
self._apply_optim_no_embedding_in_sqlite = (
140+
not sub_docs_exist and not cosine_metric_index_exist
141+
) # Optimization: do not serialize embeddings to SQLite, because they are expensive to send and can be reconstructed from the HNSW index itself.
132142
# SQLite setup
133143
self._sqlite_db_path = os.path.join(self._work_dir, 'docs_sqlite.db')
134144
self._logger.debug(f'DB path set to {self._sqlite_db_path}')
@@ -565,9 +575,10 @@ def _get_num_docs_sqlite(self) -> int:
565575
# serialization helpers
566576
def _doc_to_bytes(self, doc: BaseDoc) -> bytes:
567577
pb = doc.to_protobuf()
568-
for col_name in self._hnsw_indices.keys():
569-
pb.data[col_name].Clear()
570-
pb.data[col_name].Clear()
578+
if self._apply_optim_no_embedding_in_sqlite:
579+
for col_name in self._hnsw_indices.keys():
580+
pb.data[col_name].Clear()
581+
pb.data[col_name].Clear()
571582
return pb.SerializeToString()
572583

573584
def _doc_from_bytes(
@@ -578,13 +589,14 @@ def _doc_from_bytes(
578589
pb = DocProto.FromString(
579590
data
580591
) # The DA object cannot be reconstructed directly here: validation may fail because the embedding field may not be Optional
581-
for k, v in reconstruct_embeddings.items():
582-
node_proto = (
583-
self.out_schema.__fields__[k]
584-
.type_._docarray_from_ndarray(np.array(v))
585-
._to_node_protobuf()
586-
)
587-
pb.data[k].MergeFrom(node_proto)
592+
if self._apply_optim_no_embedding_in_sqlite:
593+
for k, v in reconstruct_embeddings.items():
594+
node_proto = (
595+
schema_cls._get_field_type(k)
596+
._docarray_from_ndarray(np.array(v))
597+
._to_node_protobuf()
598+
)
599+
pb.data[k].MergeFrom(node_proto)
588600

589601
doc = schema_cls.from_protobuf(pb)
590602
return doc

0 commit comments

Comments
 (0)