feat: add serialization to base64 (#33)

hanxiao · web-flow · commit 74b9405a7670 · 2022-01-11T18:56:24.000+01:00
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ DocArray is a library for nested, unstructured data such as text, image, audio,
 
 &#129489;&zwj;&#128300; **Data science powerhouse**: greatly accelerate data scientists work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle on CPU/GPU.
 
-&#128673; **Portable**: ready-to-wire at anytime with efficient and compact serialization from/to Protobuf, bytes, JSON, CSV, dataframe.
+&#128673; **Portable**: ready-to-wire at anytime with efficient and compact serialization from/to Protobuf, bytes, base64, JSON, CSV, dataframe.
 
 <!-- end elevator-pitch -->
 
diff --git a/docarray/array/mixins/io/binary.py b/docarray/array/mixins/io/binary.py
@@ -1,5 +1,6 @@
 import io
 import os.path
+import base64
 import pickle
 from contextlib import nullcontext
 from typing import Union, BinaryIO, TYPE_CHECKING, Type, Optional
@@ -12,7 +13,7 @@
 
 
 class BinaryIOMixin:
-    """Save/load an array to a binary file. """
+    """Save/load an array to a binary file."""
 
     @classmethod
     def load_binary(
@@ -175,3 +176,26 @@ def from_protobuf(cls: Type['T'], pb_msg: 'DocumentArrayProto') -> 'T':
 
     def __bytes__(self):
         return self.to_bytes()
+
+    @classmethod
+    def from_base64(
+        cls: Type['T'],
+        data: str,
+        protocol: str = 'pickle-array',
+        compress: Optional[str] = None,
+        _show_progress: bool = False,
+    ) -> 'T':
+        return cls.load_binary(
+            base64.b64decode(data),
+            protocol=protocol,
+            compress=compress,
+            _show_progress=_show_progress,
+        )
+
+    def to_base64(
+        self,
+        protocol: str = 'pickle-array',
+        compress: Optional[str] = None,
+        _show_progress: bool = False,
+    ) -> str:
+        return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py
@@ -1,6 +1,7 @@
 import dataclasses
 import pickle
 from typing import Optional, TYPE_CHECKING, Type, Dict, Any
+import base64
 
 from ...helper import compress_bytes, decompress_bytes
 
@@ -86,3 +87,30 @@ def to_json(self) -> str:
         return MessageToJson(
             self.to_protobuf(), preserving_proto_field_name=True, sort_keys=True
         )
+
+    def to_base64(
+        self, protocol: str = 'pickle', compress: Optional[str] = None
+    ) -> str:
+        """Serialize a Document object into a base64 string
+
+        :param protocol: protocol to use
+        :param compress: compress method to use
+        :return: a base64 encoded string
+        """
+        return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
+
+    @classmethod
+    def from_base64(
+        cls: Type['T'],
+        data: str,
+        protocol: str = 'pickle',
+        compress: Optional[str] = None,
+    ) -> 'T':
+        """Build a Document object from a base64 encoded string
+
+        :param data: a base64 encoded string
+        :param protocol: protocol to use
+        :param compress: compress method to use
+        :return: a Document object
+        """
+        return cls.from_bytes(base64.b64decode(data), protocol, compress)
diff --git a/docarray/proto/io/ndarray.py b/docarray/proto/io/ndarray.py
@@ -39,7 +39,7 @@ def read_ndarray(pb_msg: 'NdArrayProto') -> 'ArrayType':
 
             return sparse_coo_tensor(idx, val, shape)
     else:
-        if framework in {'numpy', 'torch', 'paddle', 'tensorflow'}:
+        if framework in {'numpy', 'torch', 'paddle', 'tensorflow', 'list'}:
             x = _get_dense_array(pb_msg.dense)
             return _to_framework_array(x, framework)
 
@@ -68,7 +68,7 @@ def flush_ndarray(pb_msg: 'NdArrayProto', value: 'ArrayType'):
                 pb_msg.cls_name = 'numpy'
                 _set_dense_array(pb_msg.dense, value)
             if framework == 'python':
-                pb_msg.cls_name = 'numpy'
+                pb_msg.cls_name = 'list'
                 _set_dense_array(pb_msg.dense, np.array(value))
             if framework == 'tensorflow':
                 pb_msg.cls_name = 'tensorflow'
@@ -144,3 +144,5 @@ def _to_framework_array(x, framework):
         from paddle import to_tensor
 
         return to_tensor(x)
+    elif framework == 'list':
+        return x.tolist()
diff --git a/docs/fundamentals/document/serialization.md b/docs/fundamentals/document/serialization.md
@@ -101,6 +101,43 @@ If you go with default `protcol` and `compress` settings, you can simply use `by
 ```
 
 
+## From/to base64
+
+```{important}
+Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install it.
+```
+
+In some cases, such as in a REST API, you are only allowed to send/receive strings, not bytes. You can serialize a Document into a base64 string via {meth}`~docarray.document.mixins.porting.PortingMixin.to_base64` and load it back via {meth}`~docarray.document.mixins.porting.PortingMixin.from_base64`.
+
+```python
+from docarray import Document
+d = Document(text='hello', embedding=[1, 2, 3])
+
+print(d.to_base64())
+```
+
+```text
+gANjZG9jYXJyYXkuZG9jdW1lbnQKRG9jdW1lbnQKcQApgXEBfXECWAUAAABfZGF0YXEDY2RvY2FycmF5LmRvY3VtZW50LmRhdGEKRG9jdW1lbnREYXRhCnEEKYFxBX1xBihYDgAAAF9yZWZlcmVuY2VfZG9jcQdoAVgCAAAAaWRxCFggAAAAZmZjNTY3ODg3MzAyMTFlY2E4NjMxZTAwOGEzNjZkNDlxCVgJAAAAcGFyZW50X2lkcQpOWAsAAABncmFudWxhcml0eXELTlgJAAAAYWRqYWNlbmN5cQxOWAYAAABidWZmZXJxDU5YBAAAAGJsb2JxDk5YCQAAAG1pbWVfdHlwZXEPWAoAAAB0ZXh0L3BsYWlucRBYBAAAAHRleHRxEVgFAAAAaGVsbG9xElgHAAAAY29udGVudHETTlgGAAAAd2VpZ2h0cRROWAMAAAB1cmlxFU5YBAAAAHRhZ3NxFk5YBgAAAG9mZnNldHEXTlgIAAAAbG9jYXRpb25xGE5YCQAAAGVtYmVkZGluZ3EZXXEaKEsBSwJLA2VYCAAAAG1vZGFsaXR5cRtOWAsAAABldmFsdWF0aW9uc3EcTlgGAAAAc2NvcmVzcR1OWAYAAABjaHVua3NxHk5YBwAAAG1hdGNoZXNxH051YnNiLg==
+```
+
+You can set `protocol` and `compress` to get a more compact string:
+
+```python
+from docarray import Document
+d = Document(text='hello', embedding=[1, 2, 3])
+
+print(len(d.to_base64()))
+print(len(d.to_base64(protocol='protobuf', compress='lz4')))
+```
+
+```text
+664
+156
+```
+
+Note that the same `protocol` and `compress` values must be used when calling `.from_base64`.
+
+
 ## From/to dict
 
 ```{important}
@@ -165,4 +202,4 @@ One can refer to the [Protobuf specification of `Document`](../../proto/index.md
 
 ## What's next?
 
-Serializing single Document can be useful but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing one by one is inefficient. In DocumentArray, we will introduce the similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allows one to serialize multiple Documents much faster and more compact.
+Serializing a single Document can be useful, but often we want to do things in bulk, say hundreds or one million Documents at once. In that case, looping over each Document and serializing one by one is inefficient. In DocumentArray, we will introduce the similar interfaces {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_bytes`, {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_json`, and {meth}`~docarray.array.mixins.io.json.JsonIOMixin.to_list` that allow one to [serialize multiple Documents much faster and more compactly](../documentarray/serialization.md).
diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md
@@ -4,7 +4,8 @@
 DocArray is designed to be "ready-to-wire" at anytime. Serialization is important. DocumentArray provides multiple serialization methods that allows one transfer DocumentArray object over network and across different microservices.
 
 - JSON string: `.from_json()`/`.to_json()` 
-- Bytes (compressed): `.from_bytes()`/`.to_bytes()` 
+- Bytes (compressed): `.from_bytes()`/`.to_bytes()`
+- Base64 (compressed): `.from_base64()`/`.to_base64()` 
 - Protobuf Message: `.from_protobuf()`/`.to_protobuf()`
 - Python List: `.from_list()`/`.to_list()`
 - Pandas Dataframe: `.from_dataframe()`/`.to_dataframe()`
@@ -141,6 +142,47 @@ When set `protocol=pickle` or `protobuf`, the result binary string looks like th
 
 Here `Delimiter` is a 16-bytes separator such as `b'g\x81\xcc\x1c\x0f\x93L\xed\xa2\xb0s)\x9c\xf9\xf6\xf2'` used for setting the boundary of each Document's serialization. Given a `to_bytes(protocol='pickle/protobuf')` binary string, once we know the first 16 bytes, the boundary is clear. Consequently, one can leverage this format to stream Documents, drop, skip, or early-stop, etc.
 
+## From/to base64
+
+```{important}
+Depending on your values of `protocol` and `compress` arguments, this feature may require `protobuf` and `lz4` dependencies. You can do `pip install "docarray[full]"` to install it.
+```
+
+Serializing into base64 can be useful when binary strings are not allowed, e.g. in a REST API. This can be easily done via {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_base64` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_base64`. As in binary serialization, one can specify `protocol` and `compress`:
+
+```python
+from docarray import DocumentArray
+da = DocumentArray.empty(10)
+
+d_str = da.to_base64(protocol='protobuf', compress='lz4')
+print(len(d_str), d_str)
+```
+
+```text
+176 BCJNGEBAwHUAAAD/Iw+uQdpL9UDNsfvomZb8m7sKIGRkNTIyOTQyNzMwMzExZWNiM2I1MWUwMDhhMzY2ZDQ5MgAEP2FiNDIAHD9iMTgyAB0vNWUyAB0fYTIAHh9myAAdP2MzYZYAHD9jODAyAB0fZDIAHT9kMTZkAABQNjZkNDkAAAAA
+```
+
+To deserialize, remember to set the correct `protocol` and `compress`:
+
+```python
+from docarray import DocumentArray
+
+da = DocumentArray.from_base64(d_str, protocol='protobuf', compress='lz4') 
+da.summary()
+```
+
+```text
+  Length                 10       
+  Homogenous Documents   True     
+  Common Attributes      ('id',)  
+                                  
+                     Attributes Summary                     
+                                                            
+  Attribute   Data type   #Unique values   Has empty value  
+ &#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472;&#9472; 
+  id          ('str',)    10               False                                                                    
+```
+
 ## From/to Protobuf
 
 Serializing to Protobuf Message is less frequently used, unless you are using Python Protobuf API. Nonetheless, you can use {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.from_protobuf` and {meth}`~docarray.array.mixins.io.binary.BinaryIOMixin.to_protobuf` to get a Protobuf Message object in Python.
diff --git a/tests/unit/array/mixins/test_io.py b/tests/unit/array/mixins/test_io.py
@@ -105,3 +105,15 @@ def test_push_pull_io(da_cls, show_progress):
 
     assert len(da1) == len(da2) == 10
     assert da1.texts == da2.texts == random_texts
+
+
+@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
+@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
+def test_from_to_base64(protocol, compress):
+    da = DocumentArray.empty(10)
+    da.embeddings = [[1, 2, 3]] * len(da)
+    da_r = DocumentArray.from_base64(
+        da.to_base64(protocol, compress), protocol, compress
+    )
+    assert da_r == da
+    assert da_r[0].embedding == [1, 2, 3]
diff --git a/tests/unit/document/test_porting.py b/tests/unit/document/test_porting.py
@@ -20,3 +20,12 @@ def test_dict_json(target):
         d1 = Document.from_dict(d.to_dict())
         d2 = Document.from_json(d.to_json())
         assert d1 == d2
+
+
+@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
+@pytest.mark.parametrize('compress', ['lz4', 'bz2', 'lzma', 'zlib', 'gzip', None])
+def test_to_from_base64(protocol, compress):
+    d = Document(text='hello', embedding=[1, 2, 3])
+    d_r = Document.from_base64(d.to_base64(protocol, compress), protocol, compress)
+    assert d_r == d
+    assert d_r.embedding == [1, 2, 3]

-Original file line number
+Diff line change
 🧑‍🔬 **Data science powerhouse**: greatly accelerate data scientists work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle on CPU/GPU.
 -🚡 **Portable**: ready-to-wire at anytime with efficient and compact serialization from/to Protobuf, bytes, JSON, CSV, dataframe.
 +🚡 **Portable**: ready-to-wire at anytime with efficient and compact serialization from/to Protobuf, bytes, base64, JSON, CSV, dataframe.
 <!-- end elevator-pitch -->