Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d43057b
feat: json and dict for docvec
JohannesMessner May 22, 2023
c45bfca
test: add tests
JohannesMessner May 22, 2023
564d144
test: add docvec to dict test
JohannesMessner May 22, 2023
76f9c8e
feat: to from dataframe for docvec
JohannesMessner May 22, 2023
73a1ac7
test: dataframe docvec tests
JohannesMessner May 22, 2023
f83fb4f
feat: to from csv for docvec
JohannesMessner May 22, 2023
ca8dc12
test: test csv with docvec
JohannesMessner May 22, 2023
2b52b1e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 14, 2023
b115637
feat: pickle serialization for docvec
JohannesMessner Jun 14, 2023
bd86985
feat: protbuf array serialization for docvec
JohannesMessner Jun 14, 2023
c280ff2
test: test base64 deser for docvec
JohannesMessner Jun 14, 2023
ad881cf
test: test save and load for docvec
JohannesMessner Jun 14, 2023
4b1b533
feat: docvec json column wise
JohannesMessner Jun 19, 2023
60e651e
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 19, 2023
f9c97ec
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 20, 2023
0603fc5
test: add test for docvec json
JohannesMessner Jun 20, 2023
c6ace8e
test: add tensor type arg
JohannesMessner Jun 20, 2023
51719b2
fix: mypy stuff
JohannesMessner Jun 26, 2023
ad5f5bd
fix: raising of error when needed
JohannesMessner Jun 26, 2023
200dbac
fix: more exception raising
JohannesMessner Jun 26, 2023
8d1f446
fix: mypy
JohannesMessner Jun 26, 2023
6815720
refactor: don't expose to/from csv for docvec
JohannesMessner Jun 26, 2023
6b5ddc7
test: adjust tests
JohannesMessner Jun 26, 2023
587c20a
docs: add documentation for docvec io
JohannesMessner Jun 27, 2023
663f17d
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 27, 2023
7d035fb
Merge branch 'main' into feat-docvec-io
JohannesMessner Jun 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: docvec json column wise
Signed-off-by: Johannes Messner <[email protected]>
  • Loading branch information
JohannesMessner committed Jun 19, 2023
commit 4b1b533ccf299eca6e37a3571ea71ed161e5a053
27 changes: 27 additions & 0 deletions docarray/array/doc_vec/column_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ItemsView,
Iterable,
MutableMapping,
NamedTuple,
Optional,
Type,
TypeVar,
Expand All @@ -26,6 +27,13 @@
T = TypeVar('T', bound='ColumnStorage')


class ColumnsJsonCompatible(NamedTuple):
    """JSON-compatible view of the four column groups of a column storage.

    Produced by `columns_json_compatible`; every field maps a column name to
    that column's JSON-compatible representation.
    """

    # Per column: the result of the column's `_docarray_to_json_compatible()`,
    # or None when the column itself is None.
    tensor_columns: Dict[str, Any]
    # Per column: the result of the nested column's
    # `_docarray_to_json_compatible()`, or None when the column is None.
    doc_columns: Dict[str, Any]
    # Per column: a list holding one `_docarray_to_json_compatible()` result
    # per element of the column, or None when the column is None.
    docs_vec_columns: Dict[str, Any]
    # The storage's `any_columns` mapping, passed through unchanged.
    any_columns: Dict[str, Any]


class ColumnStorage:
"""
ColumnStorage is a container to store the columns of the
Expand Down Expand Up @@ -91,6 +99,25 @@ def __getitem__(self: T, item: IndexIterType) -> T:
self.tensor_type,
)

def columns_json_compatible(self) -> ColumnsJsonCompatible:
    """Return the stored columns in a JSON-compatible form.

    Tensor and document columns are converted through their own
    `_docarray_to_json_compatible()`; each docs-vec column becomes a list
    with one converted entry per element. Columns that are `None` stay
    `None`. `any_columns` is handed over as is.

    :return: a `ColumnsJsonCompatible` tuple holding the four column groups
    """

    def _convert(column):
        # A missing column is represented by None and is kept that way.
        if column is None:
            return None
        return column._docarray_to_json_compatible()

    tensors = {name: _convert(col) for name, col in self.tensor_columns.items()}
    docs = {name: _convert(col) for name, col in self.doc_columns.items()}

    docs_vecs = {}
    for name, col in self.docs_vec_columns.items():
        if col is None:
            docs_vecs[name] = None
        else:
            # Elements are converted directly: only the column as a whole
            # is allowed to be None here.
            docs_vecs[name] = [vec._docarray_to_json_compatible() for vec in col]

    return ColumnsJsonCompatible(
        tensor_columns=tensors,
        doc_columns=docs,
        docs_vec_columns=docs_vecs,
        any_columns=self.any_columns,
    )


class ColumnStorageView(dict, MutableMapping[str, Any]):
index: int
Expand Down
80 changes: 78 additions & 2 deletions docarray/array/doc_vec/doc_vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)

import numpy as np
from orjson import orjson
from pydantic import BaseConfig, parse_obj_as
from typing_inspect import typingGenericAlias

Expand Down Expand Up @@ -595,8 +596,83 @@ def _get_proto_class(cls: Type[T]):

return DocVecProto

def _docarray_to_json_compatible(self) -> List[Dict]:
    """Return one JSON-compatible entry per document, in iteration order."""
    per_doc = []
    for entry in self:
        per_doc.append(entry._docarray_to_json_compatible())
    return per_doc
def _docarray_to_json_compatible(self) -> Dict[str, Dict[str, Any]]:
    """Return the column-wise JSON-compatible form of this object.

    The result maps each field name of the tuple returned by the storage's
    `columns_json_compatible()` to its value.
    """
    return self._storage.columns_json_compatible()._asdict()

@classmethod
def from_json(
    cls: Type[T],
    file: Union[str, bytes, bytearray],
    tensor_type: Type[AbstractTensor] = NdArray,
) -> T:
    """Deserialize a column-wise JSON string or bytes into a `DocVec`.

    The payload is parsed with `orjson` and the resulting dictionary is
    handed to `_from_json_col_dict`, which expects the top-level keys
    `'tensor_columns'`, `'doc_columns'`, `'docs_vec_columns'` and
    `'any_columns'`.

    :param file: JSON string or bytes from which to deserialize a `DocVec`
    :param tensor_type: the tensor type to use for the tensor columns.
        Could be NdArray, TorchTensor, or TensorFlowTensor. Defaults to NdArray.
        All tensors of the output DocVec will be of this type.
    :return: the deserialized `DocVec`
    """
    json_columns = orjson.loads(file)
    return cls._from_json_col_dict(json_columns, tensor_type=tensor_type)

@classmethod
def _from_json_col_dict(
    cls, json_columns: Dict[str, Any], tensor_type: Type[AbstractTensor] = NdArray
) -> T:
    """Rebuild an instance of `cls` from a column-wise JSON dictionary.

    `json_columns` must provide the keys `'tensor_columns'`,
    `'doc_columns'`, `'docs_vec_columns'` and `'any_columns'`. The four
    nested dictionaries are converted in place (their JSON values are
    replaced by the parsed columns) and then passed to `ColumnStorage`.
    Columns stored as `None` are left as `None`.

    :param json_columns: the column-wise dictionary to convert
    :param tensor_type: the type tensor columns are parsed into; it is also
        forwarded to nested columns and to the resulting `ColumnStorage`
    :return: the result of `cls.from_columns_storage` on the rebuilt storage
    """
    tensors = json_columns['tensor_columns']
    docs = json_columns['doc_columns']
    docs_vecs = json_columns['docs_vec_columns']
    anys = json_columns['any_columns']

    # Only existing keys are reassigned while iterating, so the dictionaries
    # never change size during the loops below.
    for name, raw in tensors.items():
        if raw is None:
            continue
        tensors[name] = parse_obj_as(tensor_type, raw)

    for name, raw in docs.items():
        if raw is None:
            continue
        # A nested document column is itself stored column-wise.
        nested_doc_type = cls.doc_type._get_field_type(name)
        nested_cls = DocVec.__class_getitem__(nested_doc_type)
        docs[name] = nested_cls._from_json_col_dict(raw, tensor_type=tensor_type)

    for name, raw in docs_vecs.items():
        if raw is None:
            continue
        # The field type wraps the element document type in `.doc_type`.
        element_doc_type = cls.doc_type._get_field_type(name).doc_type
        docs_vecs[name] = ListAdvancedIndexing(
            DocVec.__class_getitem__(element_doc_type)._from_json_col_dict(
                vec, tensor_type=tensor_type
            )
            for vec in raw
        )

    for name, raw in anys.items():
        if raw is None:
            continue
        field_type = cls.doc_type._get_field_type(name)
        # Non-required fields may hold None entries, so parse as Optional.
        if not cls.doc_type.__fields__[name].required:
            field_type = Optional[field_type]
        anys[name] = ListAdvancedIndexing(
            parse_obj_as(field_type, val) for val in raw
        )

    storage = ColumnStorage(
        tensors, docs, docs_vecs, anys, tensor_type=tensor_type
    )
    return cls.from_columns_storage(storage)

@classmethod
def from_protobuf(
Expand Down