Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 40 additions & 11 deletions docarray/index/backends/in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(
)()

self._embedding_map: Dict[str, Tuple[AnyTensor, Optional[List[int]]]] = {}
self._ids_to_positions: Dict[str, int] = {}

def python_type_to_db_type(self, python_type: Type) -> Any:
"""Map python type to database type.
Expand Down Expand Up @@ -163,7 +164,13 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
"""
# implementing the public option because conversion to column dict is not needed
docs = self._validate_docs(docs)
self._docs.extend(docs)
ids_to_positions = self._get_ids_to_positions()
for doc in docs:
if doc.id in ids_to_positions:
self._docs[ids_to_positions[doc.id]] = doc
else:
self._docs.append(doc)
self._ids_to_positions[str(doc.id)] = len(self._ids_to_positions)

# Add parent_id to all sub-index documents and store sub-index documents
data_by_columns = self._get_col_value_dict(docs)
Expand Down Expand Up @@ -216,6 +223,7 @@ def _del_items(self, doc_ids: Sequence[str]):
indices.append(i)

del self._docs[indices]
self._update_ids_to_positions()
self._rebuild_embedding()

def _ori_items(self, doc: BaseDoc) -> BaseDoc:
Expand Down Expand Up @@ -259,15 +267,18 @@ def _get_items(
"""

out_docs = []
for i, doc in enumerate(self._docs):
if doc.id in doc_ids:
if raw:
out_docs.append(doc)
else:
ori_doc = self._ori_items(doc)
schema_cls = cast(Type[BaseDoc], self.out_schema)
new_doc = schema_cls(**ori_doc.__dict__)
out_docs.append(new_doc)
ids_to_positions = self._get_ids_to_positions()
for doc_id in doc_ids:
if doc_id not in ids_to_positions:
continue
doc = self._docs[ids_to_positions[doc_id]]
if raw:
out_docs.append(doc)
else:
ori_doc = self._ori_items(doc)
schema_cls = cast(Type[BaseDoc], self.out_schema)
new_doc = schema_cls(**ori_doc.__dict__)
out_docs.append(new_doc)

return out_docs

Expand Down Expand Up @@ -461,7 +472,7 @@ def _text_search_batched(
raise NotImplementedError(f'{type(self)} does not support text search.')

def _doc_exists(self, doc_id: str) -> bool:
    """Return True if a document with ``doc_id`` is present in the index.

    Uses the id -> position mapping for an O(1) lookup instead of scanning
    ``self._docs`` linearly.

    :param doc_id: the id of the document to look up.
    :return: True if the document is indexed, False otherwise.
    """
    # NOTE: the diff residue kept the old linear scan
    # (`any(doc.id == doc_id for doc in self._docs)`) on the line above the
    # new lookup; only the mapping-based lookup belongs in the final code.
    return doc_id in self._get_ids_to_positions()

def persist(self, file: Optional[str] = None) -> None:
"""Persist InMemoryExactNNIndex into a binary file."""
Expand Down Expand Up @@ -500,3 +511,21 @@ def _get_root_doc_id(self, id: str, root: str, sub: str) -> str:
id, fields[0], '__'.join(fields[1:])
)
return self._get_root_doc_id(cur_root_id, root, '')

def _get_ids_to_positions(self) -> Dict[str, int]:
    """Return the mapping from document id to its position in the DocList.

    The mapping is built lazily: if it has not been populated yet it is
    rebuilt from ``self._docs`` before being returned.

    :return: a dictionary mapping each document id to its position.
    """
    # Guard clause: an already-populated cache is returned as-is.
    if self._ids_to_positions:
        return self._ids_to_positions
    self._update_ids_to_positions()
    return self._ids_to_positions

def _update_ids_to_positions(self) -> None:
    """Rebuild the document-id -> position mapping from the current DocList.

    Replaces ``self._ids_to_positions`` with a fresh dictionary reflecting
    the present order of ``self._docs``.
    """
    positions: Dict[str, int] = {}
    for position, document in enumerate(self._docs):
        positions[document.id] = position
    self._ids_to_positions = positions
55 changes: 55 additions & 0 deletions tests/index/in_memory/test_index_get_del.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import numpy as np

from docarray import BaseDoc, DocList
from docarray.index import InMemoryExactNNIndex
from docarray.typing import NdArray


class SimpleDoc(BaseDoc):
    # Fixture schema for the in-memory index tests below.
    # 128-dimensional vector; used as the `search_field` in `index.find(...)`.
    embedding: NdArray[128]
    # Free-text payload; the tests mutate it to verify upsert-on-reindex.
    text: str


def test_update_payload():
    """Re-indexing docs with existing ids must upsert payloads, not duplicate."""
    docs = DocList[SimpleDoc](
        [SimpleDoc(embedding=np.random.rand(128), text=f'hey {i}') for i in range(100)]
    )
    index = InMemoryExactNNIndex[SimpleDoc]()
    index.index(docs)

    assert index.num_docs() == 100

    # Mutate every payload, then index the same docs again: the count must
    # stay at 100 (update-in-place, no duplicates).
    for doc in docs:
        doc.text = doc.text + '_changed'

    index.index(docs)
    assert index.num_docs() == 100

    res = index.find(query=docs[0], search_field='embedding', limit=100)
    assert len(res.documents) == 100
    assert all('_changed' in found.text for found in res.documents)


def test_update_embedding():
    """Re-indexing a single doc must replace its embedding in place."""
    docs = DocList[SimpleDoc](
        [SimpleDoc(embedding=np.random.rand(128), text=f'hey {i}') for i in range(100)]
    )
    index = InMemoryExactNNIndex[SimpleDoc]()
    index.index(docs)
    assert index.num_docs() == 100

    # Swap out one embedding and re-index only that doc.
    new_tensor = np.random.rand(128)
    docs[0].embedding = new_tensor

    index.index(docs[0])
    assert index.num_docs() == 100

    res = index.find(query=docs[0], search_field='embedding', limit=100)
    assert len(res.documents) == 100
    # The updated doc must be retrievable and carry the new embedding.
    matches = [found for found in res.documents if found.id == docs[0].id]
    assert matches
    for found in matches:
        assert (found.embedding == new_tensor).all()