Merged

26 commits
d43057b
feat: json and dict for docvec
JohannesMessner May 22, 2023
c45bfca
test: add tests
JohannesMessner May 22, 2023
564d144
test: add docvec to dict test
JohannesMessner May 22, 2023
76f9c8e
feat: to from dataframe for docvec
JohannesMessner May 22, 2023
73a1ac7
test: dataframe docvec tests
JohannesMessner May 22, 2023
f83fb4f
feat: to from csv for docvec
JohannesMessner May 22, 2023
ca8dc12
test: test csv with docvec
JohannesMessner May 22, 2023
2b52b1e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 14, 2023
b115637
feat: pickle serialization for docvec
JohannesMessner Jun 14, 2023
bd86985
feat: protbuf array serialization for docvec
JohannesMessner Jun 14, 2023
c280ff2
test: test base64 deser for docvec
JohannesMessner Jun 14, 2023
ad881cf
test: test save and load for docvec
JohannesMessner Jun 14, 2023
4b1b533
feat: docvec json column wise
JohannesMessner Jun 19, 2023
60e651e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 19, 2023
f9c97ec
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 20, 2023
0603fc5
test: add test for docvec json
JohannesMessner Jun 20, 2023
c6ace8e
test: add tensor type arg
JohannesMessner Jun 20, 2023
51719b2
fix: mypy stuff
JohannesMessner Jun 26, 2023
ad5f5bd
fix: raising of error when needed
JohannesMessner Jun 26, 2023
200dbac
fix: more exception raising
JohannesMessner Jun 26, 2023
8d1f446
fix: mypy
JohannesMessner Jun 26, 2023
6815720
refactor: don't expose to/from csv for docvec
JohannesMessner Jun 26, 2023
6b5ddc7
test: adjust tests
JohannesMessner Jun 26, 2023
587c20a
docs: add documentation for docvec io
JohannesMessner Jun 27, 2023
663f17d
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 27, 2023
7d035fb
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 28, 2023
6 changes: 6 additions & 0 deletions docarray/array/doc_list/doc_list.py
@@ -306,6 +306,12 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
"""
return super().from_protobuf(pb_msg)

@classmethod
def _get_proto_class(cls: Type[T]):
from docarray.proto import DocListProto

return DocListProto

@overload
def __getitem__(self, item: SupportsIndex) -> T_doc:
...
24 changes: 14 additions & 10 deletions docarray/array/doc_list/io.py
@@ -377,13 +377,12 @@ def from_csv(
@classmethod
def _from_csv_file(
cls, file: Union[StringIO, TextIOWrapper], dialect: Union[str, csv.Dialect]
) -> 'DocList':
from docarray import DocList
) -> 'T':

rows = csv.DictReader(file, dialect=dialect)

doc_type = cls.doc_type
docs = DocList.__class_getitem__(doc_type)()
docs = []

field_names: List[str] = (
[] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
@@ -405,7 +404,7 @@ def _from_csv_file(
doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

return docs
return cls(docs)

def to_csv(
self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel'
@@ -443,7 +442,7 @@ def to_csv(
writer.writerow(doc_dict)

@classmethod
def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList':
def from_dataframe(cls, df: 'pd.DataFrame') -> 'T':
"""
Load a `DocList` from a `pandas.DataFrame` following the schema
defined in the [`.doc_type`][docarray.DocList] attribute.
@@ -515,6 +514,8 @@ class Person(BaseDoc):
doc_dict = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

if not isinstance(docs, cls):
return cls(docs)
return docs

def to_dataframe(self) -> 'pd.DataFrame':
@@ -563,6 +564,11 @@ def _stream_header(self) -> bytes:
num_docs_as_bytes = len(self).to_bytes(8, 'big', signed=False)
return version_byte + num_docs_as_bytes

@classmethod
@abstractmethod
def _get_proto_class(cls: Type[T]):
...

@classmethod
def _load_binary_all(
cls: Type[T],
@@ -593,12 +599,10 @@ def _load_binary_all(
compress = None

if protocol is not None and protocol == 'protobuf-array':
from docarray.proto import DocListProto

dap = DocListProto()
dap.ParseFromString(d)
proto = cls._get_proto_class()()
proto.ParseFromString(d)

return cls.from_protobuf(dap)
return cls.from_protobuf(proto)
elif protocol is not None and protocol == 'pickle-array':
return pickle.loads(d)

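Note: the io.py change above is a template-method refactor. `_load_binary_all` used to hard-code `DocListProto`; it now asks the concrete class for its message type through the abstract `_get_proto_class` hook, which `DocList` (and, below, `DocVec`) each implement. A minimal self-contained sketch of the pattern; the `*Stub` classes are illustrative stand-ins for the generated protobuf messages, not docarray's real API:

```python
from abc import ABC, abstractmethod
from typing import Type


class _ProtoStub:
    """Illustrative stand-in for a generated protobuf message class."""

    def ParseFromString(self, data: bytes) -> None:
        self.raw = data


class DocListProtoStub(_ProtoStub):
    ...


class DocVecProtoStub(_ProtoStub):
    ...


class IOMixin(ABC):
    @classmethod
    @abstractmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        ...

    @classmethod
    def _load_binary_all(cls, data: bytes) -> _ProtoStub:
        # shared loader: no concrete proto import needed here,
        # each subclass supplies its own message type
        proto = cls._get_proto_class()()
        proto.ParseFromString(data)
        return proto  # the real code hands this to cls.from_protobuf


class DocListLike(IOMixin):
    @classmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        return DocListProtoStub


class DocVecLike(IOMixin):
    @classmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        return DocVecProtoStub


assert isinstance(DocVecLike._load_binary_all(b'\x00'), DocVecProtoStub)
```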
5 changes: 5 additions & 0 deletions docarray/array/doc_vec/column_storage.py
@@ -123,6 +123,11 @@ def __getitem__(self, name: str) -> Any:
return None
return col[self.index]

def __reduce__(self):
# implementing __reduce__ to solve a pickle issue when subclassing dict
# see here: https://stackoverflow.com/questions/21144845/how-can-i-unpickle-a-subclass-of-dict-that-validates-with-setitem-in-pytho
return (ColumnStorageView, (self.index, self.storage))

def __setitem__(self, name, value) -> None:
if self.storage.columns[name] is None:
raise ValueError(
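Note: the `__reduce__` override above sidesteps a known pickle pitfall with `dict` subclasses (see the linked Stack Overflow question): pickle recreates the instance and replays its items through `__setitem__` before restoring instance attributes, so a `__setitem__` that relies on attributes like `self.storage` fails during unpickling. A minimal reproduction and fix; the `View` classes are simplified, hypothetical analogues of `ColumnStorageView`:

```python
import pickle


class BrokenView(dict):
    def __init__(self, index, storage):
        super().__init__()
        self.index = index      # __setitem__ depends on these attributes
        self.storage = storage

    def __setitem__(self, key, value):
        self.storage[key] = value  # fails on a half-built instance
        super().__setitem__(key, value)


class FixedView(BrokenView):
    def __reduce__(self):
        # recreate the view through __init__ instead of letting pickle
        # replay items on an attribute-less instance; the view's own
        # items are re-derivable from storage, so they need no replaying
        return (FixedView, (self.index, self.storage))


broken = BrokenView(0, {})
broken['tensor'] = 42
try:
    pickle.loads(pickle.dumps(broken))
except AttributeError:
    pass  # __setitem__ ran before self.storage was restored

fixed = FixedView(0, {})
fixed['tensor'] = 42
restored = pickle.loads(pickle.dumps(fixed))
assert restored.storage == {'tensor': 42}
```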
14 changes: 12 additions & 2 deletions docarray/array/doc_vec/doc_vec.py
@@ -23,6 +23,7 @@

from docarray.array.any_array import AnyDocArray
from docarray.array.doc_list.doc_list import DocList
from docarray.array.doc_list.io import IOMixinArray
from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView
from docarray.array.list_advance_indexing import ListAdvancedIndexing
from docarray.base_doc import AnyDoc, BaseDoc
@@ -111,7 +112,7 @@ def _is_none_list_of_docvec_proto(proto: 'ListOfDocVecProto') -> bool:
return isinstance(proto, ListOfDocVecProto) and len(proto.data) == 0


class DocVec(AnyDocArray[T_doc]):
class DocVec(IOMixinArray, AnyDocArray[T_doc]):
"""
DocVec is a container of Documents appropriates to perform
computation that require batches of data (ex: matrix multiplication, distance
@@ -156,7 +157,7 @@ class DocVec(AnyDocArray[T_doc]):
AnyTensor or Union of NdArray and TorchTensor
"""

doc_type: Type[T_doc]
doc_type: Type[T_doc] = AnyDoc # TODO(johannes): should this be BaseDoc?

def __init__(
self: T,
@@ -588,6 +589,15 @@ def __len__(self):
# IO related #
####################

@classmethod
def _get_proto_class(cls: Type[T]):
from docarray.proto import DocVecProto

return DocVecProto

def _docarray_to_json_compatible(self) -> List[Dict]:
return [doc._docarray_to_json_compatible() for doc in self]

@classmethod
def from_protobuf(
cls: Type[T], pb_msg: 'DocVecProto', tensor_type: Type[AbstractTensor] = NdArray
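Note: with `IOMixinArray` in the MRO and the two hooks above (`_get_proto_class` and `_docarray_to_json_compatible`), `DocVec` picks up the same serialization surface that `DocList` already had. A usage sketch of the round-trips this PR enables, with API names as exercised by the tests below; treat the exact protocol defaults as assumptions:

```python
from docarray import BaseDoc, DocList, DocVec
from docarray.typing import NdArray


class MyDoc(BaseDoc):
    text: str
    embedding: NdArray


docs = DocVec[MyDoc](
    [MyDoc(text='hello', embedding=[1, 2, 3, 4, 5]) for _ in range(2)]
)

# binary round-trip: 'protobuf-array' is routed through _load_binary_all,
# which now resolves DocVecProto via _get_proto_class
data = docs.to_bytes(protocol='protobuf-array')
docs2 = DocVec[MyDoc].from_bytes(data, protocol='protobuf-array')
assert len(docs2) == 2

# JSON: _docarray_to_json_compatible turns the columns back into
# one dict per document, which DocList can load again
json_str = docs.to_json()
docs3 = DocList[MyDoc].from_json(json_str)
```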
26 changes: 14 additions & 12 deletions docarray/base_doc/doc.py
@@ -228,7 +228,7 @@ def json(
`encoder` is an optional function to supply as `default` to json.dumps(),
other arguments as per `json.dumps()`.
"""
exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
exclude=exclude
)

@@ -315,7 +315,7 @@ def dict(

"""

exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
exclude, original_exclude, docarray_exclude_fields = self._exclude_docarray(
exclude=exclude
)

@@ -329,7 +329,7 @@
exclude_none=exclude_none,
)

for field in doclist_exclude_fields:
for field in docarray_exclude_fields:
# we need to do this because pydantic will not recognize DocList correctly
original_exclude = original_exclude or {}
if field not in original_exclude:
@@ -338,30 +338,32 @@

return data

def _exclude_doclist(
def _exclude_docarray(
self, exclude: ExcludeType
) -> Tuple[ExcludeType, ExcludeType, List[str]]:
doclist_exclude_fields = []
docarray_exclude_fields = []
for field in self.__fields__.keys():
from docarray import DocList
from docarray import DocList, DocVec

type_ = self._get_field_type(field)
if isinstance(type_, type) and issubclass(type_, DocList):
doclist_exclude_fields.append(field)
if isinstance(type_, type) and (
issubclass(type_, DocList) or issubclass(type_, DocVec)
):
docarray_exclude_fields.append(field)

original_exclude = exclude
if exclude is None:
exclude = set(doclist_exclude_fields)
exclude = set(docarray_exclude_fields)
elif isinstance(exclude, AbstractSet):
exclude = set([*exclude, *doclist_exclude_fields])
exclude = set([*exclude, *docarray_exclude_fields])
elif isinstance(exclude, Mapping):
exclude = dict(**exclude)
exclude.update({field: ... for field in doclist_exclude_fields})
exclude.update({field: ... for field in docarray_exclude_fields})

return (
exclude,
original_exclude,
doclist_exclude_fields,
docarray_exclude_fields,
)

to_json = json
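Note: the rename from `_exclude_doclist` to `_exclude_docarray` widens an existing workaround to `DocVec`: pydantic cannot serialize DocList/DocVec fields itself, so `.dict()` hides them from the pydantic pass and re-inserts them afterwards. A condensed, standalone sketch of that flow; the `_serialize_array_field` helper is hypothetical, the real code defers to the field's own serialization:

```python
from typing import Any, Dict, List, Optional, Set


def _serialize_array_field(value: Any) -> List[Dict[str, Any]]:
    # hypothetical helper: one plain dict per document in the array field
    return [doc.dict() for doc in value]


def dict_with_array_fields(
    doc: Any,
    array_fields: List[str],
    exclude: Optional[Set[str]] = None,
) -> Dict[str, Any]:
    """Condensed sketch of the exclude-then-reinsert flow in BaseDoc.dict()."""
    original_exclude = exclude
    # 1. hide DocList/DocVec fields from the pydantic pass
    exclude = set(array_fields) | (exclude or set())
    data = doc.dict(exclude=exclude)
    # 2. re-insert them manually, unless the caller excluded them
    for field in array_fields:
        if original_exclude is None or field not in original_exclude:
            data[field] = _serialize_array_field(getattr(doc, field))
    return data
```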
3 changes: 2 additions & 1 deletion docarray/base_doc/mixins/io.py
@@ -8,6 +8,7 @@
Dict,
Iterable,
List,
Literal,
Optional,
Tuple,
Type,
@@ -170,7 +171,7 @@ def to_bytes(
def from_bytes(
cls: Type[T],
data: bytes,
protocol: str = 'protobuf',
protocol: Literal['protobuf', 'pickle'] = 'protobuf',
compress: Optional[str] = None,
) -> T:
"""Build Document object from binary bytes
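Note: narrowing `protocol` from `str` to `Literal['protobuf', 'pickle']` moves typo detection from runtime to type-check time. A tiny illustration with a stub function, not the real method:

```python
from typing import Literal


def from_bytes(
    data: bytes, protocol: Literal['protobuf', 'pickle'] = 'protobuf'
) -> None:
    ...  # stub: the real method dispatches on `protocol`


from_bytes(b'', protocol='pickle')     # accepted
from_bytes(b'', protocol='protobuff')  # flagged by mypy: invalid Literal value
```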
16 changes: 9 additions & 7 deletions tests/units/array/test_array_from_to_bytes.py
@@ -1,6 +1,6 @@
import pytest

from docarray import BaseDoc, DocList
from docarray import BaseDoc, DocList, DocVec
from docarray.documents import ImageDoc
from docarray.typing import NdArray

@@ -16,8 +16,9 @@ class MyDoc(BaseDoc):
)
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
@pytest.mark.parametrize('show_progress', [False, True])
def test_from_to_bytes(protocol, compress, show_progress):
da = DocList[MyDoc](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_to_bytes(protocol, compress, show_progress, array_cls):
da = array_cls[MyDoc](
[
MyDoc(
embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')
@@ -28,7 +29,7 @@ def test_from_to_bytes(protocol, compress, show_progress):
bytes_da = da.to_bytes(
protocol=protocol, compress=compress, show_progress=show_progress
)
da2 = DocList[MyDoc].from_bytes(
da2 = array_cls[MyDoc].from_bytes(
bytes_da, protocol=protocol, compress=compress, show_progress=show_progress
)
assert len(da2) == 2
@@ -46,8 +47,9 @@
)
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
@pytest.mark.parametrize('show_progress', [False, True])
def test_from_to_base64(protocol, compress, show_progress):
da = DocList[MyDoc](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_to_base64(protocol, compress, show_progress, array_cls):
da = array_cls[MyDoc](
[
MyDoc(
embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')
@@ -58,7 +60,7 @@ def test_from_to_base64(protocol, compress, show_progress):
bytes_da = da.to_base64(
protocol=protocol, compress=compress, show_progress=show_progress
)
da2 = DocList[MyDoc].from_base64(
da2 = array_cls[MyDoc].from_base64(
bytes_da, protocol=protocol, compress=compress, show_progress=show_progress
)
assert len(da2) == 2
35 changes: 23 additions & 12 deletions tests/units/array/test_array_from_to_csv.py
@@ -3,7 +3,7 @@

import pytest

from docarray import BaseDoc, DocList
from docarray import BaseDoc, DocList, DocVec
from docarray.documents import ImageDoc
from tests import TOYDATA_DIR

@@ -21,8 +21,9 @@ class MyDocNested(MyDoc):
return MyDocNested


def test_to_from_csv(tmpdir, nested_doc_cls):
da = DocList[nested_doc_cls](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_to_from_csv(tmpdir, nested_doc_cls, array_cls):
da = array_cls[nested_doc_cls](
[
nested_doc_cls(
count=0,
@@ -37,15 +38,18 @@ def test_to_from_csv(tmpdir, nested_doc_cls):
da.to_csv(tmp_file)
assert os.path.isfile(tmp_file)

da_from = DocList[nested_doc_cls].from_csv(tmp_file)
da_from = array_cls[nested_doc_cls].from_csv(tmp_file)
assert isinstance(da_from, array_cls)
for doc1, doc2 in zip(da, da_from):
assert doc1 == doc2


def test_from_csv_nested(nested_doc_cls):
da = DocList[nested_doc_cls].from_csv(
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_nested(nested_doc_cls, array_cls):
da = array_cls[nested_doc_cls].from_csv(
file_path=str(TOYDATA_DIR / 'docs_nested.csv')
)
assert isinstance(da, array_cls)
assert len(da) == 3

for i, doc in enumerate(da):
@@ -89,25 +93,31 @@ class Outer(BaseDoc):
return doc


def test_from_csv_without_schema_raise_exception():
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_without_schema_raise_exception(array_cls):
with pytest.raises(TypeError, match='no document schema defined'):
DocList.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv'))
array_cls.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv'))


def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_with_wrong_schema_raise_exception(nested_doc, array_cls):
with pytest.raises(ValueError, match='Column names do not match the schema'):
DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv'))
array_cls[nested_doc.__class__].from_csv(
file_path=str(TOYDATA_DIR / 'docs.csv')
)


def test_from_remote_csv_file():
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_remote_csv_file(array_cls):
remote_url = 'https://github.com/docarray/docarray/blob/main/tests/toydata/books.csv?raw=true'

class Book(BaseDoc):
title: str
author: str
year: int

books = DocList[Book].from_csv(file_path=remote_url)
books = array_cls[Book].from_csv(file_path=remote_url)
assert isinstance(books, array_cls)

assert len(books) == 3

@@ -116,6 +126,7 @@ def test_doc_list_error(tmpdir):
class Book(BaseDoc):
title: str

# not testing DocVec bc it already fails here (as it should!)
docs = DocList([Book(title='hello'), Book(title='world')])
tmp_file = str(tmpdir / 'tmp.csv')
with pytest.raises(TypeError):
7 changes: 6 additions & 1 deletion tests/units/array/test_array_from_to_json.py
@@ -1,3 +1,5 @@
import pytest

from docarray import BaseDoc, DocList
from docarray.documents import ImageDoc
from docarray.typing import NdArray
@@ -9,7 +11,8 @@ class MyDoc(BaseDoc):
image: ImageDoc


def test_from_to_json():
@pytest.mark.parametrize('doc_vec', [True])
def test_from_to_json(doc_vec):
da = DocList[MyDoc](
[
MyDoc(
@@ -18,6 +21,8 @@ MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()),
MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()),
]
)
if doc_vec:
da = da.to_doc_vec()
json_da = da.to_json()
da2 = DocList[MyDoc].from_json(json_da)
assert len(da2) == 2
Expand Down