Skip to content

Commit 74b9405

Browse files
authored
feat: add serialization to base64 (#33)
1 parent 5c1fb55 commit 74b9405

File tree

8 files changed

+160
-6
lines changed

8 files changed

+160
-6
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ DocArray is a library for nested, unstructured data such as text, image, audio,
2020

2121
🧑‍🔬 **Data science powerhouse**: greatly accelerate data scientists work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle on CPU/GPU.
2222

23-
🚡 **Portable**: ready-to-wire at anytime with efficient and compact serialization from/to Protobuf, bytes, JSON, CSV, dataframe.
23+
🚡 **Portable**: ready-to-wire at any time with efficient and compact serialization from/to Protobuf, bytes, base64, JSON, CSV, dataframe.
2424

2525
<!-- end elevator-pitch -->
2626

docarray/array/mixins/io/binary.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import io
22
import os.path
3+
import base64
34
import pickle
45
from contextlib import nullcontext
56
from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional
@@ -12,7 +13,7 @@
1213

1314

1415
class BinaryIOMixin:
15-
"""Save/load an array to a binary file. """
16+
"""Save/load an array to a binary file."""
1617

1718
@classmethod
1819
def load_binary(
@@ -175,3 +176,26 @@ def from_protobuf(cls: Type['T'], pb_msg: 'DocumentArrayProto') -> 'T':
175176

176177
def __bytes__(self):
177178
return self.to_bytes()
179+
180+
@classmethod
181+
def from_base64(
182+
cls: Type['T'],
183+
data: str,
184+
protocol: str = 'pickle-array',
185+
compress: Optional[str] = None,
186+
_show_progress: bool = False,
187+
) -> 'T':
188+
return cls.load_binary(
189+
base64.b64decode(data),
190+
protocol=protocol,
191+
compress=compress,
192+
_show_progress=_show_progress,
193+
)
194+
195+
def to_base64(
196+
self,
197+
protocol: str = 'pickle-array',
198+
compress: Optional[str] = None,
199+
_show_progress: bool = False,
200+
) -> str:
201+
return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

docarray/document/mixins/porting.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import dataclasses
22
import pickle
33
from typing import Optional, TYPE_CHECKING, Type, Dict, Any
4+
import base64
45

56
from ...helper import compress_bytes, decompress_bytes
67

@@ -86,3 +87,30 @@ def to_json(self) -> str:
8687
return MessageToJson(
8788
self.to_protobuf(), preserving_proto_field_name=True, sort_keys=True
8889
)
90+
91+
def to_base64(
92+
self, protocol: str = 'pickle', compress: Optional[str] = None
93+
) -> str:
94+
"""Serialize a Document object into as base64 string
95+
96+
:param protocol: protocol to use
97+
:param compress: compress method to use
98+
:return: a base64 encoded string
99+
"""
100+
return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
101+
102+
@classmethod
103+
def from_base64(
104+
cls: Type['T'],
105+
data: str,
106+
protocol: str = 'pickle',
107+
compress: Optional[str] = None,
108+
) -> 'T':
109+
"""Build Document object from binary bytes
110+
111+
:param data: a base64 encoded string
112+
:param protocol: protocol to use
113+
:param compress: compress method to use
114+
:return: a Document object
115+
"""
116+
return cls.from_bytes(base64.b64decode(data), protocol, compress)

docarray/proto/io/ndarray.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def read_ndarray(pb_msg: 'NdArrayProto') -> 'ArrayType':
3939

4040
return sparse_coo_tensor(idx, val, shape)
4141
else:
42-
if framework in {'numpy', 'torch', 'paddle', 'tensorflow'}:
42+
if framework in {'numpy', 'torch', 'paddle', 'tensorflow', 'list'}:
4343
x = _get_dense_array(pb_msg.dense)
4444
return _to_framework_array(x, framework)
4545

@@ -68,7 +68,7 @@ def flush_ndarray(pb_msg: 'NdArrayProto', value: 'ArrayType'):
6868
pb_msg.cls_name = 'numpy'
6969
_set_dense_array(pb_msg.dense, value)
7070
if framework == 'python':
71-
pb_msg.cls_name = 'numpy'
71+
pb_msg.cls_name = 'list'
7272
_set_dense_array(pb_msg.dense, np.array(value))
7373
if framework == 'tensorflow':
7474
pb_msg.cls_name = 'tensorflow'
@@ -144,3 +144,5 @@ def _to_framework_array(x, framework):
144144
from paddle import to_tensor
145145

146146
return to_tensor(x)
147+
elif framework == 'list':
148+
return x.tolist()

docs/fundamentals/document/serialization.md

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,43 @@ If you go with default `protcol` and `compress` settings, you can simply use `by
101101
```
102102

103103

104+
## From/to base64
105+
106+
```{important}
107+
Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install them.
108+
```
109+
110+
In some cases, such as in a REST API, you are only allowed to send/receive strings, not bytes. You can serialize a Document into a base64 string via {meth}`~docarray.document.mixins.porting.PortingMixin.to_base64` and load it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_base64`.
111+
112+
```python
113+
from docarray import Document
114+
d = Document(text='hello', embedding=[1, 2, 3])
115+
116+
print(d.to_base64())
117+
```
118+
119+
```text
120+
gANjZG9jYXJyYXkuZG9jdW1lbnQKRG9jdW1lbnQKcQApgXEBfXECWAUAAABfZGF0YXEDY2RvY2FycmF5LmRvY3VtZW50LmRhdGEKRG9jdW1lbnREYXRhCnEEKYFxBX1xBihYDgAAAF9yZWZlcmVuY2VfZG9jcQdoAVgCAAAAaWRxCFggAAAAZmZjNTY3ODg3MzAyMTFlY2E4NjMxZTAwOGEzNjZkNDlxCVgJAAAAcGFyZW50X2lkcQpOWAsAAABncmFudWxhcml0eXELTlgJAAAAYWRqYWNlbmN5cQxOWAYAAABidWZmZXJxDU5YBAAAAGJsb2JxDk5YCQAAAG1pbWVfdHlwZXEPWAoAAAB0ZXh0L3BsYWlucRBYBAAAAHRleHRxEVgFAAAAaGVsbG9xElgHAAAAY29udGVudHETTlgGAAAAd2VpZ2h0cRROWAMAAAB1cmlxFU5YBAAAAHRhZ3NxFk5YBgAAAG9mZnNldHEXTlgIAAAAbG9jYXRpb25xGE5YCQAAAGVtYmVkZGluZ3EZXXEaKEsBSwJLA2VYCAAAAG1vZGFsaXR5cRtOWAsAAABldmFsdWF0aW9uc3EcTlgGAAAAc2NvcmVzcR1OWAYAAABjaHVua3NxHk5YBwAAAG1hdGNoZXNxH051YnNiLg==
121+
```
122+
123+
You can set `protocol` and `compress` to get a more compact string:
124+
125+
```python
126+
from docarray import Document
127+
d = Document(text='hello', embedding=[1, 2, 3])
128+
129+
print(len(d.to_base64()))
130+
print(len(d.to_base64(protocol='protobuf', compress='lz4')))
131+
```
132+
133+
```text
134+
664
135+
156
136+
```
137+
138+
Note that the same `protocol` and `compress` values must be used when calling `.from_base64`.
139+
140+
104141
## From/to dict
105142

106143
```{important}
@@ -165,4 +202,4 @@ One can refer to the [Protobuf specification of `Document`](../../proto/index.md
165202

166203
## What's next?
167204

168-
Serializing single Document can be useful but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing one by one is inefficient. In DocumentArray, we will introduce the similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allows one to serialize multiple Documents much faster and more compact.
205+
Serializing a single Document can be useful, but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing them one by one is inefficient. In DocumentArray, we will introduce the similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allow one to [serialize multiple Documents much faster and more compactly](../documentarray/serialization.md).

docs/fundamentals/documentarray/serialization.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
DocArray is designed to be "ready-to-wire" at any time. Serialization is important. DocumentArray provides multiple serialization methods that allow one to transfer a DocumentArray object over the network and across different microservices.
55

66
- JSON string: `.from_json()`/`.to_json()`
7-
- Bytes (compressed): `.from_bytes()`/`.to_bytes()`
7+
- Bytes (compressed): `.from_bytes()`/`.to_bytes()`
8+
- Base64 (compressed): `.from_base64()`/`.to_base64()`
89
- Protobuf Message: `.from_protobuf()`/`.to_protobuf()`
910
- Python List: `.from_list()`/`.to_list()`
1011
- Pandas Dataframe: `.from_dataframe()`/`.to_dataframe()`
@@ -141,6 +142,47 @@ When set `protocol=pickle` or `protobuf`, the result binary string looks like th
141142

142143
Here `Delimiter` is a 16-byte separator such as `b'g\x81\xcc\x1c\x0f\x93L\xed\xa2\xb0s)\x9c\xf9\xf6\xf2'` used for setting the boundary of each Document's serialization. Given a `to_bytes(protocol='pickle/protobuf')` binary string, once we know the first 16 bytes, the boundary is clear. Consequently, one can leverage this format to stream Documents, drop, skip, or early-stop, etc.
143144

145+
## From/to base64
146+
147+
```{important}
148+
Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install them.
149+
```
150+
151+
Serializing into base64 can be useful when binary strings are not allowed, e.g. in a REST API. This can be easily done via {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_base64` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_base64`. Like in binary serialization, one can specify `protocol` and `compress`:
152+
153+
```python
154+
from docarray import DocumentArray
155+
da = DocumentArray.empty(10)
156+
157+
d_str = da.to_base64(protocol='protobuf', compress='lz4')
158+
print(len(d_str), d_str)
159+
```
160+
161+
```text
162+
176 BCJNGEBAwHUAAAD/Iw+uQdpL9UDNsfvomZb8m7sKIGRkNTIyOTQyNzMwMzExZWNiM2I1MWUwMDhhMzY2ZDQ5MgAEP2FiNDIAHD9iMTgyAB0vNWUyAB0fYTIAHh9myAAdP2MzYZYAHD9jODAyAB0fZDIAHT9kMTZkAABQNjZkNDkAAAAA
163+
```
164+
165+
To deserialize, remember to set the correct `protocol` and `compress`:
166+
167+
```python
168+
from docarray import DocumentArray
169+
170+
da = DocumentArray.from_base64(d_str, protocol='protobuf', compress='lz4')
171+
da.summary()
172+
```
173+
174+
```text
175+
Length 10
176+
Homogenous Documents True
177+
Common Attributes ('id',)
178+
179+
Attributes Summary
180+
181+
Attribute Data type #Unique values Has empty value
182+
──────────────────────────────────────────────────────────
183+
id ('str',) 10 False
184+
```
185+
144186
## From/to Protobuf
145187

146188
Serializing to Protobuf Message is less frequently used, unless you are using Python Protobuf API. Nonetheless, you can use {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_protobuf` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_protobuf` to get a Protobuf Message object in Python.

tests/unit/array/mixins/test_io.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,15 @@ def test_push_pull_io(da_cls, show_progress):
105105

106106
assert len(da1) == len(da2) == 10
107107
assert da1.texts == da2.texts == random_texts
108+
109+
110+
@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
111+
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
112+
def test_from_to_base64(protocol, compress):
113+
da = DocumentArray.empty(10)
114+
da.embeddings = [[1, 2, 3]] * len(da)
115+
da_r = DocumentArray.from_base64(
116+
da.to_base64(protocol, compress), protocol, compress
117+
)
118+
assert da_r == da
119+
assert da_r[0].embedding == [1, 2, 3]

tests/unit/document/test_porting.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,12 @@ def test_dict_json(target):
2020
d1 = Document.from_dict(d.to_dict())
2121
d2 = Document.from_json(d.to_json())
2222
assert d1 == d2
23+
24+
25+
@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
26+
@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
27+
def test_to_from_base64(protocol, compress):
28+
d = Document(text='hello', embedding=[1, 2, 3])
29+
d_r = Document.from_base64(d.to_base64(protocol, compress), protocol, compress)
30+
assert d_r == d
31+
assert d_r.embedding == [1, 2, 3]

0 commit comments

Comments
 (0)