Merged

26 commits
d43057b
feat: json and dict for docvec
JohannesMessner May 22, 2023
c45bfca
test: add tests
JohannesMessner May 22, 2023
564d144
test: add docvec to dict test
JohannesMessner May 22, 2023
76f9c8e
feat: to from dataframe for docvec
JohannesMessner May 22, 2023
73a1ac7
test: dataframe docvec tests
JohannesMessner May 22, 2023
f83fb4f
feat: to from csv for docvec
JohannesMessner May 22, 2023
ca8dc12
test: test csv with docvec
JohannesMessner May 22, 2023
2b52b1e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 14, 2023
b115637
feat: pickle serialization for docvec
JohannesMessner Jun 14, 2023
bd86985
feat: protbuf array serialization for docvec
JohannesMessner Jun 14, 2023
c280ff2
test: test base64 deser for docvec
JohannesMessner Jun 14, 2023
ad881cf
test: test save and load for docvec
JohannesMessner Jun 14, 2023
4b1b533
feat: docvec json column wise
JohannesMessner Jun 19, 2023
60e651e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 19, 2023
f9c97ec
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 20, 2023
0603fc5
test: add test for docvec json
JohannesMessner Jun 20, 2023
c6ace8e
test: add tensor type arg
JohannesMessner Jun 20, 2023
51719b2
fix: mypy stuff
JohannesMessner Jun 26, 2023
ad5f5bd
fix: raising of error when needed
JohannesMessner Jun 26, 2023
200dbac
fix: more exception raising
JohannesMessner Jun 26, 2023
8d1f446
fix: mypy
JohannesMessner Jun 26, 2023
6815720
refactor: don't expose to/from csv for docvec
JohannesMessner Jun 26, 2023
6b5ddc7
test: adjust tests
JohannesMessner Jun 26, 2023
587c20a
docs: add documentation for docvec io
JohannesMessner Jun 27, 2023
663f17d
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 27, 2023
7d035fb
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 28, 2023
6 changes: 6 additions & 0 deletions docarray/array/doc_list/doc_list.py
@@ -306,6 +306,12 @@ def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
"""
return super().from_protobuf(pb_msg)

@classmethod
def _get_proto_class(cls: Type[T]):
from docarray.proto import DocListProto

return DocListProto

@overload
def __getitem__(self, item: SupportsIndex) -> T_doc:
...
24 changes: 14 additions & 10 deletions docarray/array/doc_list/io.py
@@ -377,13 +377,12 @@ def from_csv(
@classmethod
def _from_csv_file(
cls, file: Union[StringIO, TextIOWrapper], dialect: Union[str, csv.Dialect]
) -> 'DocList':
from docarray import DocList
) -> 'T':

rows = csv.DictReader(file, dialect=dialect)

doc_type = cls.doc_type
docs = DocList.__class_getitem__(doc_type)()
docs = []

field_names: List[str] = (
[] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
@@ -405,7 +404,7 @@ def _from_csv_file(
doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

return docs
return cls(docs)

def to_csv(
self, file_path: str, dialect: Union[str, csv.Dialect] = 'excel'
@@ -443,7 +442,7 @@ def to_csv(
writer.writerow(doc_dict)

@classmethod
def from_dataframe(cls, df: 'pd.DataFrame') -> 'DocList':
def from_dataframe(cls, df: 'pd.DataFrame') -> 'T':
"""
Load a `DocList` from a `pandas.DataFrame` following the schema
defined in the [`.doc_type`][docarray.DocList] attribute.
@@ -515,6 +514,8 @@ class Person(BaseDoc):
doc_dict = _access_path_dict_to_nested_dict(access_path2val)
docs.append(doc_type.parse_obj(doc_dict))

if not isinstance(docs, cls):
return cls(docs)
return docs

def to_dataframe(self) -> 'pd.DataFrame':
@@ -563,6 +564,11 @@ def _stream_header(self) -> bytes:
num_docs_as_bytes = len(self).to_bytes(8, 'big', signed=False)
return version_byte + num_docs_as_bytes

@classmethod
@abstractmethod
def _get_proto_class(cls: Type[T]):
...

@classmethod
def _load_binary_all(
cls: Type[T],
@@ -593,12 +599,10 @@ def _load_binary_all(
compress = None

if protocol is not None and protocol == 'protobuf-array':
from docarray.proto import DocListProto

dap = DocListProto()
dap.ParseFromString(d)
proto = cls._get_proto_class()()
proto.ParseFromString(d)

return cls.from_protobuf(dap)
return cls.from_protobuf(proto)
elif protocol is not None and protocol == 'pickle-array':
return pickle.loads(d)

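Note: the io.py change above is a template-method refactor. `_load_binary_all` used to hard-code `DocListProto`; it now asks the concrete class for its message type through the abstract `_get_proto_class` hook, which `DocList` (and, below, `DocVec`) each implement. A minimal self-contained sketch of the pattern; the `*Stub` classes are illustrative stand-ins for the generated protobuf messages, not docarray's real API:

```python
from abc import ABC, abstractmethod
from typing import Type


class _ProtoStub:
    """Illustrative stand-in for a generated protobuf message class."""

    def ParseFromString(self, data: bytes) -> None:
        self.raw = data


class DocListProtoStub(_ProtoStub):
    ...


class DocVecProtoStub(_ProtoStub):
    ...


class IOMixin(ABC):
    @classmethod
    @abstractmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        ...

    @classmethod
    def _load_binary_all(cls, data: bytes) -> _ProtoStub:
        # shared loader: no concrete proto import needed here,
        # each subclass supplies its own message type
        proto = cls._get_proto_class()()
        proto.ParseFromString(data)
        return proto  # the real code hands this to cls.from_protobuf


class DocListLike(IOMixin):
    @classmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        return DocListProtoStub


class DocVecLike(IOMixin):
    @classmethod
    def _get_proto_class(cls) -> Type[_ProtoStub]:
        return DocVecProtoStub


assert isinstance(DocVecLike._load_binary_all(b'\x00'), DocVecProtoStub)
```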
5 changes: 5 additions & 0 deletions docarray/array/doc_vec/column_storage.py
@@ -123,6 +123,11 @@ def __getitem__(self, name: str) -> Any:
return None
return col[self.index]

def __reduce__(self):
# implementing __reduce__ to solve a pickle issue when subclassing dict
# see here: https://stackoverflow.com/questions/21144845/how-can-i-unpickle-a-subclass-of-dict-that-validates-with-setitem-in-pytho
return (ColumnStorageView, (self.index, self.storage))

def __setitem__(self, name, value) -> None:
if self.storage.columns[name] is None:
raise ValueError(
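Note: the `__reduce__` override above sidesteps a known pickle pitfall with `dict` subclasses (see the linked Stack Overflow question): pickle recreates the instance and replays its items through `__setitem__` before restoring instance attributes, so a `__setitem__` that relies on attributes like `self.storage` fails during unpickling. A minimal reproduction and fix; the `View` classes are simplified, hypothetical analogues of `ColumnStorageView`:

```python
import pickle


class BrokenView(dict):
    def __init__(self, index, storage):
        super().__init__()
        self.index = index      # __setitem__ depends on these attributes
        self.storage = storage

    def __setitem__(self, key, value):
        self.storage[key] = value  # fails on a half-built instance
        super().__setitem__(key, value)


class FixedView(BrokenView):
    def __reduce__(self):
        # recreate the view through __init__ instead of letting pickle
        # replay items on an attribute-less instance; the view's own
        # items are re-derivable from storage, so they need no replaying
        return (FixedView, (self.index, self.storage))


broken = BrokenView(0, {})
broken['tensor'] = 42
try:
    pickle.loads(pickle.dumps(broken))
except AttributeError:
    pass  # __setitem__ ran before self.storage was restored

fixed = FixedView(0, {})
fixed['tensor'] = 42
restored = pickle.loads(pickle.dumps(fixed))
assert restored.storage == {'tensor': 42}
```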
14 changes: 12 additions & 2 deletions docarray/array/doc_vec/doc_vec.py
@@ -23,6 +23,7 @@

from docarray.array.any_array import AnyDocArray
from docarray.array.doc_list.doc_list import DocList
from docarray.array.doc_list.io import IOMixinArray
from docarray.array.doc_vec.column_storage import ColumnStorage, ColumnStorageView
from docarray.array.list_advance_indexing import ListAdvancedIndexing
from docarray.base_doc import AnyDoc, BaseDoc
@@ -111,7 +112,7 @@ def _is_none_list_of_docvec_proto(proto: 'ListOfDocVecProto') -> bool:
return isinstance(proto, ListOfDocVecProto) and len(proto.data) == 0


class DocVec(AnyDocArray[T_doc]):
class DocVec(IOMixinArray, AnyDocArray[T_doc]):
"""
DocVec is a container of Documents appropriates to perform
computation that require batches of data (ex: matrix multiplication, distance
@@ -156,7 +157,7 @@ class DocVec(AnyDocArray[T_doc]):
AnyTensor or Union of NdArray and TorchTensor
"""

doc_type: Type[T_doc]
doc_type: Type[T_doc] = AnyDoc # TODO(johannes): should this be BaseDoc?

def __init__(
self: T,
@@ -588,6 +589,15 @@ def __len__(self):
# IO related #
####################

@classmethod
def _get_proto_class(cls: Type[T]):
from docarray.proto import DocVecProto

return DocVecProto

def _docarray_to_json_compatible(self) -> List[Dict]:
return [doc._docarray_to_json_compatible() for doc in self]

@classmethod
def from_protobuf(
cls: Type[T], pb_msg: 'DocVecProto', tensor_type: Type[AbstractTensor] = NdArray
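Note: with `IOMixinArray` in the MRO and the two hooks above (`_get_proto_class` and `_docarray_to_json_compatible`), `DocVec` picks up the same serialization surface that `DocList` already had. A usage sketch of the round-trips this PR enables, with API names as exercised by the tests below; treat the exact protocol defaults as assumptions:

```python
from docarray import BaseDoc, DocList, DocVec
from docarray.typing import NdArray


class MyDoc(BaseDoc):
    text: str
    embedding: NdArray


docs = DocVec[MyDoc](
    [MyDoc(text='hello', embedding=[1, 2, 3, 4, 5]) for _ in range(2)]
)

# binary round-trip: 'protobuf-array' is routed through _load_binary_all,
# which now resolves DocVecProto via _get_proto_class
data = docs.to_bytes(protocol='protobuf-array')
docs2 = DocVec[MyDoc].from_bytes(data, protocol='protobuf-array')
assert len(docs2) == 2

# JSON: _docarray_to_json_compatible turns the columns back into
# one dict per document, which DocList can load again
json_str = docs.to_json()
docs3 = DocList[MyDoc].from_json(json_str)
```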
26 changes: 14 additions & 12 deletions docarray/base_doc/doc.py
@@ -228,7 +228,7 @@ def json(
`encoder` is an optional function to supply as `default` to json.dumps(),
other arguments as per `json.dumps()`.
"""
exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
exclude=exclude
)

@@ -315,7 +315,7 @@ def dict(

"""

exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
exclude, original_exclude, docarray_exclude_fields = self._exclude_docarray(
exclude=exclude
)

@@ -329,7 +329,7 @@
exclude_none=exclude_none,
)

for field in doclist_exclude_fields:
for field in docarray_exclude_fields:
# we need to do this because pydantic will not recognize DocList correctly
original_exclude = original_exclude or {}
if field not in original_exclude:
@@ -338,30 +338,32 @@

return data

def _exclude_doclist(
def _exclude_docarray(
self, exclude: ExcludeType
) -> Tuple[ExcludeType, ExcludeType, List[str]]:
doclist_exclude_fields = []
docarray_exclude_fields = []
for field in self.__fields__.keys():
from docarray import DocList
from docarray import DocList, DocVec

type_ = self._get_field_type(field)
if isinstance(type_, type) and issubclass(type_, DocList):
doclist_exclude_fields.append(field)
if isinstance(type_, type) and (
issubclass(type_, DocList) or issubclass(type_, DocVec)
):
docarray_exclude_fields.append(field)

original_exclude = exclude
if exclude is None:
exclude = set(doclist_exclude_fields)
exclude = set(docarray_exclude_fields)
elif isinstance(exclude, AbstractSet):
exclude = set([*exclude, *doclist_exclude_fields])
exclude = set([*exclude, *docarray_exclude_fields])
elif isinstance(exclude, Mapping):
exclude = dict(**exclude)
exclude.update({field: ... for field in doclist_exclude_fields})
exclude.update({field: ... for field in docarray_exclude_fields})

return (
exclude,
original_exclude,
doclist_exclude_fields,
docarray_exclude_fields,
)

to_json = json
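Note: the rename from `_exclude_doclist` to `_exclude_docarray` widens an existing workaround to `DocVec`: pydantic cannot serialize DocList/DocVec fields itself, so `.dict()` hides them from the pydantic pass and re-inserts them afterwards. A condensed, standalone sketch of that flow; the `_serialize_array_field` helper is hypothetical, the real code defers to the field's own serialization:

```python
from typing import Any, Dict, List, Optional, Set


def _serialize_array_field(value: Any) -> List[Dict[str, Any]]:
    # hypothetical helper: one plain dict per document in the array field
    return [doc.dict() for doc in value]


def dict_with_array_fields(
    doc: Any,
    array_fields: List[str],
    exclude: Optional[Set[str]] = None,
) -> Dict[str, Any]:
    """Condensed sketch of the exclude-then-reinsert flow in BaseDoc.dict()."""
    original_exclude = exclude
    # 1. hide DocList/DocVec fields from the pydantic pass
    exclude = set(array_fields) | (exclude or set())
    data = doc.dict(exclude=exclude)
    # 2. re-insert them manually, unless the caller excluded them
    for field in array_fields:
        if original_exclude is None or field not in original_exclude:
            data[field] = _serialize_array_field(getattr(doc, field))
    return data
```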
3 changes: 2 additions & 1 deletion docarray/base_doc/mixins/io.py
@@ -8,6 +8,7 @@
Dict,
Iterable,
List,
Literal,
Optional,
Tuple,
Type,
@@ -170,7 +171,7 @@ def to_bytes(
def from_bytes(
cls: Type[T],
data: bytes,
protocol: str = 'protobuf',
protocol: Literal['protobuf', 'pickle'] = 'protobuf',
compress: Optional[str] = None,
) -> T:
"""Build Document object from binary bytes
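Note: narrowing `protocol` from `str` to `Literal['protobuf', 'pickle']` moves typo detection from runtime to type-check time. A tiny illustration with a stub function, not the real method:

```python
from typing import Literal


def from_bytes(
    data: bytes, protocol: Literal['protobuf', 'pickle'] = 'protobuf'
) -> None:
    ...  # stub: the real method dispatches on `protocol`


from_bytes(b'', protocol='pickle')     # accepted
from_bytes(b'', protocol='protobuff')  # flagged by mypy: invalid Literal value
```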
16 changes: 9 additions & 7 deletions tests/units/array/test_array_from_to_bytes.py
@@ -1,6 +1,6 @@
import pytest

from docarray import BaseDoc, DocList
from docarray import BaseDoc, DocList, DocVec
from docarray.documents import ImageDoc
from docarray.typing import NdArray

@@ -16,8 +16,9 @@ class MyDoc(BaseDoc):
)
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
@pytest.mark.parametrize('show_progress', [False, True])
def test_from_to_bytes(protocol, compress, show_progress):
da = DocList[MyDoc](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_to_bytes(protocol, compress, show_progress, array_cls):
da = array_cls[MyDoc](
[
MyDoc(
embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')
@@ -28,7 +29,7 @@ def test_from_to_bytes(protocol, compress, show_progress):
bytes_da = da.to_bytes(
protocol=protocol, compress=compress, show_progress=show_progress
)
da2 = DocList[MyDoc].from_bytes(
da2 = array_cls[MyDoc].from_bytes(
bytes_da, protocol=protocol, compress=compress, show_progress=show_progress
)
assert len(da2) == 2
@@ -46,8 +47,9 @@
)
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
@pytest.mark.parametrize('show_progress', [False, True])
def test_from_to_base64(protocol, compress, show_progress):
da = DocList[MyDoc](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_to_base64(protocol, compress, show_progress, array_cls):
da = array_cls[MyDoc](
[
MyDoc(
embedding=[1, 2, 3, 4, 5], text='hello', image=ImageDoc(url='aux.png')
@@ -58,7 +60,7 @@ def test_from_to_base64(protocol, compress, show_progress):
bytes_da = da.to_base64(
protocol=protocol, compress=compress, show_progress=show_progress
)
da2 = DocList[MyDoc].from_base64(
da2 = array_cls[MyDoc].from_base64(
bytes_da, protocol=protocol, compress=compress, show_progress=show_progress
)
assert len(da2) == 2
35 changes: 23 additions & 12 deletions tests/units/array/test_array_from_to_csv.py
@@ -3,7 +3,7 @@

import pytest

from docarray import BaseDoc, DocList
from docarray import BaseDoc, DocList, DocVec
from docarray.documents import ImageDoc
from tests import TOYDATA_DIR

@@ -21,8 +21,9 @@ class MyDocNested(MyDoc):
return MyDocNested


def test_to_from_csv(tmpdir, nested_doc_cls):
da = DocList[nested_doc_cls](
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_to_from_csv(tmpdir, nested_doc_cls, array_cls):
da = array_cls[nested_doc_cls](
[
nested_doc_cls(
count=0,
@@ -37,15 +38,18 @@ def test_to_from_csv(tmpdir, nested_doc_cls):
da.to_csv(tmp_file)
assert os.path.isfile(tmp_file)

da_from = DocList[nested_doc_cls].from_csv(tmp_file)
da_from = array_cls[nested_doc_cls].from_csv(tmp_file)
assert isinstance(da_from, array_cls)
for doc1, doc2 in zip(da, da_from):
assert doc1 == doc2


def test_from_csv_nested(nested_doc_cls):
da = DocList[nested_doc_cls].from_csv(
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_nested(nested_doc_cls, array_cls):
da = array_cls[nested_doc_cls].from_csv(
file_path=str(TOYDATA_DIR / 'docs_nested.csv')
)
assert isinstance(da, array_cls)
assert len(da) == 3

for i, doc in enumerate(da):
@@ -89,25 +93,31 @@ class Outer(BaseDoc):
return doc


def test_from_csv_without_schema_raise_exception():
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_without_schema_raise_exception(array_cls):
with pytest.raises(TypeError, match='no document schema defined'):
DocList.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv'))
array_cls.from_csv(file_path=str(TOYDATA_DIR / 'docs_nested.csv'))


def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_csv_with_wrong_schema_raise_exception(nested_doc, array_cls):
with pytest.raises(ValueError, match='Column names do not match the schema'):
DocList[nested_doc.__class__].from_csv(file_path=str(TOYDATA_DIR / 'docs.csv'))
array_cls[nested_doc.__class__].from_csv(
file_path=str(TOYDATA_DIR / 'docs.csv')
)


def test_from_remote_csv_file():
@pytest.mark.parametrize('array_cls', [DocList, DocVec])
def test_from_remote_csv_file(array_cls):
remote_url = 'https://github.com/docarray/docarray/blob/main/tests/toydata/books.csv?raw=true'

class Book(BaseDoc):
title: str
author: str
year: int

books = DocList[Book].from_csv(file_path=remote_url)
books = array_cls[Book].from_csv(file_path=remote_url)
assert isinstance(books, array_cls)

assert len(books) == 3

@@ -116,6 +126,7 @@ def test_doc_list_error(tmpdir):
class Book(BaseDoc):
title: str

# not testing DocVec bc it already fails here (as it should!)
docs = DocList([Book(title='hello'), Book(title='world')])
tmp_file = str(tmpdir / 'tmp.csv')
with pytest.raises(TypeError):
7 changes: 6 additions & 1 deletion tests/units/array/test_array_from_to_json.py
@@ -1,3 +1,5 @@
import pytest

from docarray import BaseDoc, DocList
from docarray.documents import ImageDoc
from docarray.typing import NdArray
@@ -9,7 +11,8 @@ class MyDoc(BaseDoc):
image: ImageDoc


def test_from_to_json():
@pytest.mark.parametrize('doc_vec', [True])
def test_from_to_json(doc_vec):
da = DocList[MyDoc](
[
MyDoc(
@@ -18,6 +21,8 @@ MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()),
MyDoc(embedding=[5, 4, 3, 2, 1], text='hello world', image=ImageDoc()),
]
)
if doc_vec:
da = da.to_doc_vec()
json_da = da.to_json()
da2 = DocList[MyDoc].from_json(json_da)
assert len(da2) == 2
Expand Down