Skip to content

Commit 9646bed

Browse files
authored
fix: fix edge case when two elements are strings (#37)
* fix: fix pr#29 * fix: fix pr#29
1 parent e9ce570 commit 9646bed

File tree

4 files changed

+116
-42
lines changed

4 files changed

+116
-42
lines changed

docarray/array/document.py

Lines changed: 64 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,18 @@ def __getitem__(
146146
and len(index) == 2
147147
and isinstance(index[0], (slice, Sequence))
148148
):
149-
_docs = self[index[0]]
150-
_attrs = index[1]
151-
if isinstance(_attrs, str):
152-
_attrs = (index[1],)
153-
154-
return _docs._get_attributes(*_attrs)
149+
if isinstance(index[0], str) and isinstance(index[1], str):
150+
# ambiguity only comes from the second string
151+
if index[1] in self._id2offset:
152+
return DocumentArray([self[index[0]], self[index[1]]])
153+
else:
154+
return getattr(self[index[0]], index[1])
155+
elif isinstance(index[0], (slice, Sequence)):
156+
_docs = self[index[0]]
157+
_attrs = index[1]
158+
if isinstance(_attrs, str):
159+
_attrs = (index[1],)
160+
return _docs._get_attributes(*_attrs)
155161
elif isinstance(index[0], bool):
156162
return DocumentArray(itertools.compress(self._data, index))
157163
elif isinstance(index[0], int):
@@ -231,31 +237,45 @@ def __setitem__(
231237
and len(index) == 2
232238
and isinstance(index[0], (slice, Sequence))
233239
):
234-
_docs = self[index[0]]
235-
_attrs = index[1]
236-
237-
if isinstance(_attrs, str):
238-
# a -> [a]
239-
# [a, a] -> [a, a]
240-
_attrs = (index[1],)
241-
if isinstance(value, (list, tuple)) and not any(
242-
isinstance(el, (tuple, list)) for el in value
243-
):
244-
# [x] -> [[x]]
245-
# [[x], [y]] -> [[x], [y]]
246-
value = (value,)
247-
if not isinstance(value, (list, tuple)):
248-
# x -> [x]
249-
value = (value,)
250-
251-
for _a, _v in zip(_attrs, value):
252-
if _a == 'blob':
253-
_docs.blobs = _v
254-
elif _a == 'embedding':
255-
_docs.embeddings = _v
240+
if isinstance(index[0], str) and isinstance(index[1], str):
241+
# ambiguity only comes from the second string
242+
if index[1] in self._id2offset:
243+
for _d, _v in zip((self[index[0]], self[index[1]]), value):
244+
_d._data = _v._data
245+
self._rebuild_id2offset()
246+
elif hasattr(self[index[0]], index[1]):
247+
setattr(self[index[0]], index[1], value)
256248
else:
257-
for _d, _vv in zip(_docs, _v):
258-
setattr(_d, _a, _vv)
249+
# to avoid accidentally add new unsupport attribute
250+
raise ValueError(
251+
f'`{index[1]}` is neither a valid id nor attribute name'
252+
)
253+
elif isinstance(index[0], (slice, Sequence)):
254+
_docs = self[index[0]]
255+
_attrs = index[1]
256+
257+
if isinstance(_attrs, str):
258+
# a -> [a]
259+
# [a, a] -> [a, a]
260+
_attrs = (index[1],)
261+
if isinstance(value, (list, tuple)) and not any(
262+
isinstance(el, (tuple, list)) for el in value
263+
):
264+
# [x] -> [[x]]
265+
# [[x], [y]] -> [[x], [y]]
266+
value = (value,)
267+
if not isinstance(value, (list, tuple)):
268+
# x -> [x]
269+
value = (value,)
270+
271+
for _a, _v in zip(_attrs, value):
272+
if _a == 'blob':
273+
_docs.blobs = _v
274+
elif _a == 'embedding':
275+
_docs.embeddings = _v
276+
else:
277+
for _d, _vv in zip(_docs, _v):
278+
setattr(_d, _a, _vv)
259279
elif isinstance(index[0], bool):
260280
if len(index) != len(self._data):
261281
raise IndexError(
@@ -313,12 +333,20 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
313333
and len(index) == 2
314334
and isinstance(index[0], (slice, Sequence))
315335
):
316-
_docs = self[index[0]]
317-
_attrs = index[1]
318-
if isinstance(_attrs, str):
319-
_attrs = (index[1],)
320-
for _d in _docs:
321-
_d.pop(*_attrs)
336+
if isinstance(index[0], str) and isinstance(index[1], str):
337+
# ambiguity only comes from the second string
338+
if index[1] in self._id2offset:
339+
del self[index[0]]
340+
del self[index[1]]
341+
else:
342+
self[index[0]].pop(index[1])
343+
elif isinstance(index[0], (slice, Sequence)):
344+
_docs = self[index[0]]
345+
_attrs = index[1]
346+
if isinstance(_attrs, str):
347+
_attrs = (index[1],)
348+
for _d in _docs:
349+
_d.pop(*_attrs)
322350
elif isinstance(index[0], bool):
323351
self._data = list(
324352
itertools.compress(self._data, (not _i for _i in index))

docs/fundamentals/document/construct.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,19 @@ d = Document()
1515
<Document ('id',) at 5dd542406d3f11eca3241e008a366d49>
1616
```
1717

18-
Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`. You can override this `id` or assign your own `id` during construction, as demonstrated below.
18+
Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`.
19+
20+
````{tip}
21+
The random `id` is the hex value of [UUID1](https://docs.python.org/3/library/uuid.html#uuid.uuid1). To convert it into the string of UUID:
22+
23+
```python
24+
import uuid
25+
str(uuid.UUID(d.id))
26+
```
27+
````
28+
29+
Though possible, it is not recommended modifying `.id` of a Document frequently, as this will lead to unexpected behavior.
30+
1931

2032
## Construct with attributes
2133

@@ -25,7 +37,6 @@ This is the most common usage of the constructor: initializing a Document object
2537
from docarray import Document
2638
import numpy
2739

28-
d0 = Document(id='my_id')
2940
d1 = Document(text='hello')
3041
d2 = Document(buffer=b'\f1')
3142
d3 = Document(blob=numpy.array([1, 2, 3]))
@@ -43,7 +54,6 @@ Don't forget to leverage autocomplete in your IDE.
4354
```
4455

4556
```text
46-
<Document ('id',) at my_id>
4757
<Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
4858
<Document ('id', 'buffer') at a14f00986d3e11ec8bde1e008a366d49>
4959
<Document ('id', 'blob') at a14f01a66d3e11ec8bde1e008a366d49>

docs/get-started/what-is.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ As one can see, you can convert a DocumentArray into AwkwardArray via `.to_list(
9191

9292
[Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. I know Zarr quite long time ago, to me it is the package when a `numpy.ndarray` is so big to fit into memory. Zarr provides a comprehensive set of functions that allows one to chunk, compress, stream large NdArray. Hence, from that perspective, Zarr like `numpy.ndarray` focuses on numerical representation and computation.
9393

94-
In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, it is often an intermediate representation of Document for computing, then being thrown away. Therefore, having a consistent data structure that can live *long enough* to cover creating, storing, computing, transferring, returning and rendering is a motivation behind DocArray.
94+
In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, `ndarray` is often an intermediate representation of Document for computing, then throw away. Therefore, having a consistent data structure that lives *long enough* to cover creating, storing, computing, transferring, returning and rendering is one of the major motivations of DocArray.
9595

9696
## To Jina Users
9797

tests/unit/array/test_advance_indexing.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,6 @@ def test_advance_selector_mixed():
222222

223223

224224
def test_single_boolean_and_padding():
225-
from docarray import DocumentArray
226-
227225
da = DocumentArray.empty(3)
228226

229227
with pytest.raises(IndexError):
@@ -237,3 +235,41 @@ def test_single_boolean_and_padding():
237235

238236
assert len(da[True, False]) == 1
239237
assert len(da[False, False]) == 0
238+
239+
240+
def test_edge_case_two_strings():
241+
# getitem
242+
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
243+
assert da['1', 'id'] == '1'
244+
assert len(da['1', '2']) == 2
245+
assert isinstance(da['1', '2'], DocumentArray)
246+
with pytest.raises(KeyError):
247+
da['hello', '2']
248+
with pytest.raises(AttributeError):
249+
da['1', 'hello']
250+
assert len(da['1', '2', '3']) == 3
251+
assert isinstance(da['1', '2', '3'], DocumentArray)
252+
253+
# delitem
254+
del da['1', '2']
255+
assert len(da) == 1
256+
257+
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
258+
del da['1', 'id']
259+
assert len(da) == 3
260+
assert not da[0].id
261+
262+
del da['2', 'hello']
263+
264+
# setitem
265+
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
266+
da['1', '2'] = DocumentArray.empty(2)
267+
assert da[0].id != '1'
268+
assert da[1].id != '2'
269+
270+
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
271+
da['1', 'text'] = 'hello'
272+
assert da['1'].text == 'hello'
273+
274+
with pytest.raises(ValueError):
275+
da['1', 'hellohello'] = 'hello'

0 commit comments

Comments
 (0)