fix: fix edge case when two elements are strings (#37)

hanxiao · web-flow · commit 9646bed94769 · 2022-01-12T11:24:54.000+01:00
* fix: fix pr#29

* fix: fix pr#29
diff --git a/docarray/array/document.py b/docarray/array/document.py
@@ -146,12 +146,18 @@ def __getitem__(
                 and len(index) == 2
                 and isinstance(index[0], (slice, Sequence))
             ):
-                _docs = self[index[0]]
-                _attrs = index[1]
-                if isinstance(_attrs, str):
-                    _attrs = (index[1],)
-
-                return _docs._get_attributes(*_attrs)
+                if isinstance(index[0], str) and isinstance(index[1], str):
+                    # ambiguity only comes from the second string
+                    if index[1] in self._id2offset:
+                        return DocumentArray([self[index[0]], self[index[1]]])
+                    else:
+                        return getattr(self[index[0]], index[1])
+                elif isinstance(index[0], (slice, Sequence)):
+                    _docs = self[index[0]]
+                    _attrs = index[1]
+                    if isinstance(_attrs, str):
+                        _attrs = (index[1],)
+                    return _docs._get_attributes(*_attrs)
             elif isinstance(index[0], bool):
                 return DocumentArray(itertools.compress(self._data, index))
             elif isinstance(index[0], int):
@@ -231,31 +237,45 @@ def __setitem__(
                 and len(index) == 2
                 and isinstance(index[0], (slice, Sequence))
             ):
-                _docs = self[index[0]]
-                _attrs = index[1]
-
-                if isinstance(_attrs, str):
-                    # a -> [a]
-                    # [a, a] -> [a, a]
-                    _attrs = (index[1],)
-                if isinstance(value, (list, tuple)) and not any(
-                    isinstance(el, (tuple, list)) for el in value
-                ):
-                    # [x] -> [[x]]
-                    # [[x], [y]] -> [[x], [y]]
-                    value = (value,)
-                if not isinstance(value, (list, tuple)):
-                    # x -> [x]
-                    value = (value,)
-
-                for _a, _v in zip(_attrs, value):
-                    if _a == 'blob':
-                        _docs.blobs = _v
-                    elif _a == 'embedding':
-                        _docs.embeddings = _v
+                if isinstance(index[0], str) and isinstance(index[1], str):
+                    # ambiguity only comes from the second string
+                    if index[1] in self._id2offset:
+                        for _d, _v in zip((self[index[0]], self[index[1]]), value):
+                            _d._data = _v._data
+                        self._rebuild_id2offset()
+                    elif hasattr(self[index[0]], index[1]):
+                        setattr(self[index[0]], index[1], value)
                     else:
-                        for _d, _vv in zip(_docs, _v):
-                            setattr(_d, _a, _vv)
+                        # to avoid accidentally add new unsupport attribute
+                        raise ValueError(
+                            f'`{index[1]}` is neither a valid id nor attribute name'
+                        )
+                elif isinstance(index[0], (slice, Sequence)):
+                    _docs = self[index[0]]
+                    _attrs = index[1]
+
+                    if isinstance(_attrs, str):
+                        # a -> [a]
+                        # [a, a] -> [a, a]
+                        _attrs = (index[1],)
+                    if isinstance(value, (list, tuple)) and not any(
+                        isinstance(el, (tuple, list)) for el in value
+                    ):
+                        # [x] -> [[x]]
+                        # [[x], [y]] -> [[x], [y]]
+                        value = (value,)
+                    if not isinstance(value, (list, tuple)):
+                        # x -> [x]
+                        value = (value,)
+
+                    for _a, _v in zip(_attrs, value):
+                        if _a == 'blob':
+                            _docs.blobs = _v
+                        elif _a == 'embedding':
+                            _docs.embeddings = _v
+                        else:
+                            for _d, _vv in zip(_docs, _v):
+                                setattr(_d, _a, _vv)
             elif isinstance(index[0], bool):
                 if len(index) != len(self._data):
                     raise IndexError(
@@ -313,12 +333,20 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
                 and len(index) == 2
                 and isinstance(index[0], (slice, Sequence))
             ):
-                _docs = self[index[0]]
-                _attrs = index[1]
-                if isinstance(_attrs, str):
-                    _attrs = (index[1],)
-                for _d in _docs:
-                    _d.pop(*_attrs)
+                if isinstance(index[0], str) and isinstance(index[1], str):
+                    # ambiguity only comes from the second string
+                    if index[1] in self._id2offset:
+                        del self[index[0]]
+                        del self[index[1]]
+                    else:
+                        self[index[0]].pop(index[1])
+                elif isinstance(index[0], (slice, Sequence)):
+                    _docs = self[index[0]]
+                    _attrs = index[1]
+                    if isinstance(_attrs, str):
+                        _attrs = (index[1],)
+                    for _d in _docs:
+                        _d.pop(*_attrs)
             elif isinstance(index[0], bool):
                 self._data = list(
                     itertools.compress(self._data, (not _i for _i in index))
diff --git a/docs/fundamentals/document/construct.md b/docs/fundamentals/document/construct.md
@@ -15,7 +15,19 @@ d = Document()
 <Document ('id',) at 5dd542406d3f11eca3241e008a366d49>
 ```
 
-Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`. You can override this `id` or assign your own `id` during construction, as demonstrated below.
+Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`.
+
+````{tip}
+The random `id` is the hex value of [UUID1](https://docs.python.org/3/library/uuid.html#uuid.uuid1). To convert it into the string of UUID:
+
+```python
+import uuid
+str(uuid.UUID(d.id))
+```
+````
+
+Though possible, it is not recommended modifying `.id` of a Document frequently, as this will lead to unexpected behavior.
+
 
 ## Construct with attributes
 
@@ -25,7 +37,6 @@ This is the most common usage of the constructor: initializing a Document object
 from docarray import Document
 import numpy
 
-d0 = Document(id='my_id')
 d1 = Document(text='hello')
 d2 = Document(buffer=b'\f1')
 d3 = Document(blob=numpy.array([1, 2, 3]))
@@ -43,7 +54,6 @@ Don't forget to leverage autocomplete in your IDE.
 ```
 
 ```text
-<Document ('id',) at my_id>
 <Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
 <Document ('id', 'buffer') at a14f00986d3e11ec8bde1e008a366d49> 
 <Document ('id', 'blob') at a14f01a66d3e11ec8bde1e008a366d49> 
diff --git a/docs/get-started/what-is.md b/docs/get-started/what-is.md
@@ -91,7 +91,7 @@ As one can see, you can convert a DocumentArray into AwkwardArray via `.to_list(
 
 [Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. I know Zarr quite long time ago, to me it is the package when a `numpy.ndarray` is so big to fit into memory. Zarr provides a comprehensive set of functions that allows one to chunk, compress, stream large NdArray. Hence, from that perspective, Zarr like `numpy.ndarray` focuses on numerical representation and computation.
 
-In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, it is often an intermediate representation of Document for computing, then being thrown away. Therefore, having a consistent data structure that can live *long enough* to cover creating, storing, computing, transferring, returning and rendering is a motivation behind DocArray.
+In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, `ndarray` is often an intermediate representation of Document for computing, then throw away. Therefore, having a consistent data structure that lives *long enough* to cover creating, storing, computing, transferring, returning and rendering is one of the major motivations of DocArray.
 
 ## To Jina Users
 
diff --git a/tests/unit/array/test_advance_indexing.py b/tests/unit/array/test_advance_indexing.py
@@ -222,8 +222,6 @@ def test_advance_selector_mixed():
 
 
 def test_single_boolean_and_padding():
-    from docarray import DocumentArray
-
     da = DocumentArray.empty(3)
 
     with pytest.raises(IndexError):
@@ -237,3 +235,41 @@ def test_single_boolean_and_padding():
 
     assert len(da[True, False]) == 1
     assert len(da[False, False]) == 0
+
+
+def test_edge_case_two_strings():
+    # getitem
+    da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
+    assert da['1', 'id'] == '1'
+    assert len(da['1', '2']) == 2
+    assert isinstance(da['1', '2'], DocumentArray)
+    with pytest.raises(KeyError):
+        da['hello', '2']
+    with pytest.raises(AttributeError):
+        da['1', 'hello']
+    assert len(da['1', '2', '3']) == 3
+    assert isinstance(da['1', '2', '3'], DocumentArray)
+
+    # delitem
+    del da['1', '2']
+    assert len(da) == 1
+
+    da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
+    del da['1', 'id']
+    assert len(da) == 3
+    assert not da[0].id
+
+    del da['2', 'hello']
+
+    # setitem
+    da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
+    da['1', '2'] = DocumentArray.empty(2)
+    assert da[0].id != '1'
+    assert da[1].id != '2'
+
+    da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
+    da['1', 'text'] = 'hello'
+    assert da['1'].text == 'hello'
+
+    with pytest.raises(ValueError):
+        da['1', 'hellohello'] = 'hello'