feat: add audio url and audio predefined class

Signed-off-by: anna-charlotte <[email protected]>
docarray · JoanFM · Jan 3, 2023 · Dec 14, 2022 · Dec 14, 2022 · Dec 15, 2022
commit 04abdae27bb4ad0fba42becb835b8b4240ee3665
diff --git a/docarray/__init__.py b/docarray/__init__.py
@@ -2,6 +2,14 @@
 
 from docarray.array import DocumentArray
 from docarray.document.document import BaseDocument as Document
-from docarray.predefined_document import Image, Mesh3D, PointCloud3D, Text
+from docarray.predefined_document import Audio, Image, Mesh3D, PointCloud3D, Text
 
-__all__ = ['Document', 'DocumentArray', 'Image', 'Text', 'Mesh3D', 'PointCloud3D']
+__all__ = [
+    'Document',
+    'DocumentArray',
+    'Image',
+    'Audio',
+    'Text',
+    'Mesh3D',
+    'PointCloud3D',
+]
diff --git a/docarray/predefined_document/__init__.py b/docarray/predefined_document/__init__.py
@@ -1,6 +1,7 @@
+from docarray.predefined_document.audio import Audio
 from docarray.predefined_document.image import Image
 from docarray.predefined_document.mesh import Mesh3D
 from docarray.predefined_document.point_cloud import PointCloud3D
 from docarray.predefined_document.text import Text
 
-__all__ = ['Text', 'Image', 'Mesh3D', 'PointCloud3D']
+__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D']
diff --git a/docarray/predefined_document/audio.py b/docarray/predefined_document/audio.py
@@ -0,0 +1,103 @@
+import wave
+from typing import BinaryIO, Optional, TypeVar, Union
+
+from docarray.document import BaseDocument
+from docarray.typing import AudioUrl, Embedding, Tensor
+
+T = TypeVar('T', bound='Audio')
+
+
+class Audio(BaseDocument):
+    """
+    Document for handling audios.
+
+    The Audio Document can contain an AudioUrl (`Audio.url`), a Tensor
+    (`Audio.tensor`), and an Embedding (`Audio.embedding`).
+
+    EXAMPLE USAGE:
+
+    You can use this Document directly:
+
+    .. code-block:: python
+
+        from docarray import Audio
+
+        # use it directly
+        audio = Audio(url='https://www.kozco.com/tech/piano2.wav')
+        audio.tensor = audio.url.load()
+        model = MyEmbeddingModel()
+        audio.embedding = model(audio.tensor)
+
+    You can extend this Document:
+
+    .. code-block:: python
+
+        from docarray import Audio, Text
+        from typing import Optional
+
+        # extend it
+        class MyAudio(Audio):
+            name: Optional[Text]
+
+
+        audio = MyAudio(url='https://www.kozco.com/tech/piano2.wav')
+        audio.tensor = audio.url.load()
+        model = MyEmbeddingModel()
+        audio.embedding = model(audio.tensor)
+        audio.name = Text(text='my first audio')
+
+
+    You can use this Document for composition:
+
+    .. code-block:: python
+
+        from docarray import Document, Audio, Text
+
+        # compose it
+        class MultiModalDoc(Document):
+            audio: Audio
+            text: Text
+
+
+        mmdoc = MultiModalDoc(
+            audio=Audio(url='https://www.kozco.com/tech/piano2.wav'),
+            text=Text(text='hello world, how are you doing?'),
+        )
+        mmdoc.audio.tensor = mmdoc.audio.url.load()
+    """
+
+    url: Optional[AudioUrl]
+    tensor: Optional[Tensor]
+    embedding: Optional[Embedding]
+
+    def save_audio_tensor_to_file(
+        self: 'T',
+        file_path: Union[str, BinaryIO],
+        sample_rate: int = 44100,
+        sample_width: int = 2,
+    ) -> None:
+        """Save :attr:`.tensor` into a .wav file.
+
+        The tensor is expected to hold samples in the range [-1.0, 1.0]; values
+        outside that range are clipped. A one-dimensional tensor is written as
+        mono; otherwise the size of the last axis is used as the number of
+        channels.
+
+        :param file_path: if file is a string, open the file by that name, otherwise
+            treat it as a file-like object.
+        :param sample_rate: sampling frequency
+        :param sample_width: sample width in bytes (1, 2 or 4)
+        :raises ValueError: if :attr:`.tensor` is not set, or if `sample_width`
+            is not one of the supported widths.
+        """
+        if self.tensor is None:
+            raise ValueError(
+                'Audio.tensor has not been set, and therefore cannot be saved to file.'
+            )
+
+        import numpy as np
+
+        # WAV stores 8-bit PCM as unsigned and wider PCM as signed little-endian.
+        # The written samples must match the width declared in the header,
+        # otherwise the resulting file is unreadable.
+        pcm_dtypes = {1: '<u1', 2: '<i2', 4: '<i4'}
+        if sample_width not in pcm_dtypes:
+            raise ValueError(
+                f'Unsupported sample_width {sample_width}, '
+                f'expected one of {sorted(pcm_dtypes)}.'
+            )
+
+        # Scale in float64 so 32-bit full scale stays exact, then clip: without
+        # clipping, a sample of exactly +1.0 overflows the integer range.
+        full_scale = 2 ** (8 * sample_width - 1)
+        samples = np.asarray(self.tensor, dtype=np.float64) * full_scale
+        samples = np.clip(samples, -full_scale, full_scale - 1)
+        if sample_width == 1:
+            # Shift signed values into the unsigned 8-bit range.
+            samples = samples + full_scale
+        pcm = samples.astype(pcm_dtypes[sample_width])
+
+        n_channels = pcm.shape[-1] if pcm.ndim > 1 else 1
+
+        with wave.open(file_path, 'wb') as f:
+            f.setnchannels(n_channels)
+            f.setsampwidth(sample_width)
+            f.setframerate(sample_rate)
+            # tobytes() emits C order, i.e. frames with interleaved channels.
+            f.writeframes(pcm.tobytes())
diff --git a/docarray/proto/docarray.proto b/docarray/proto/docarray.proto
@@ -63,6 +63,8 @@ message NodeProto {
 
     string point_cloud_url = 13;
 
+    string audio_url = 14;
+
 
   }
 

diff --git a/docarray/proto/pb2/docarray_pb2.py b/docarray/proto/pb2/docarray_pb2.py
diff --git a/docarray/typing/__init__.py b/docarray/typing/__init__.py
@@ -1,12 +1,20 @@
 from docarray.typing.id import ID
 from docarray.typing.tensor import NdArray, Tensor
 from docarray.typing.tensor.embedding import Embedding
-from docarray.typing.url import AnyUrl, ImageUrl, Mesh3DUrl, PointCloud3DUrl, TextUrl
+from docarray.typing.url import (
+    AnyUrl,
+    AudioUrl,
+    ImageUrl,
+    Mesh3DUrl,
+    PointCloud3DUrl,
+    TextUrl,
+)
 
 __all__ = [
     'NdArray',
     'Embedding',
     'ImageUrl',
+    'AudioUrl',
     'TextUrl',
     'Mesh3DUrl',
     'PointCloud3DUrl',

diff --git a/docarray/typing/url/__init__.py b/docarray/typing/url/__init__.py
@@ -1,7 +1,8 @@
 from docarray.typing.url.any_url import AnyUrl
+from docarray.typing.url.audio_url import AudioUrl
 from docarray.typing.url.image_url import ImageUrl
 from docarray.typing.url.text_url import TextUrl
 from docarray.typing.url.url_3d.mesh_url import Mesh3DUrl
 from docarray.typing.url.url_3d.point_cloud_url import PointCloud3DUrl
 
-__all__ = ['ImageUrl', 'AnyUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl']
+__all__ = ['ImageUrl', 'AudioUrl', 'AnyUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl']
diff --git a/docarray/typing/url/audio_url.py b/docarray/typing/url/audio_url.py
@@ -1,8 +1,98 @@
-from docarray.typing import AnyUrl
+import wave
+from typing import TYPE_CHECKING, TypeVar
+
+import numpy as np
+
+from docarray.typing.url.any_url import AnyUrl
+
+if TYPE_CHECKING:
+    from docarray.proto import NodeProto
+
+T = TypeVar('T', bound='AudioUrl')
 
 
 class AudioUrl(AnyUrl):
     """
     URL to a .wav file.
     Can be remote (web) URL, or a local file path.
     """
+
+    def _to_node_protobuf(self: T) -> 'NodeProto':
+        """Convert this URL into a NodeProto protobuf message. This function
+        should be called when the URL is nested into a Document that needs to
+        be converted into a protobuf
+
+        :return: the nested item protobuf message
+        """
+        from docarray.proto import NodeProto
+
+        return NodeProto(audio_url=str(self))
+
+    def load(self: T) -> np.ndarray:
+        """
+        Load the data from the url into a numpy.ndarray audio tensor.
+
+        Samples are normalised to float32 values between -1.0 and +1.0. Mono
+        files yield a one-dimensional array; multi-channel files yield an
+        array of shape (n_frames, n_channels).
+
+        EXAMPLE USAGE
+
+        .. code-block:: python
+
+            from docarray import Document
+            import numpy as np
+
+            from docarray.typing import AudioUrl
+
+
+            class MyDoc(Document):
+                audio_url: AudioUrl
+
+
+            doc = MyDoc(audio_url="toydata/hello.wav")
+
+            audio_tensor = doc.audio_url.load()
+            assert isinstance(audio_tensor, np.ndarray)
+
+        :return: np.ndarray representing the audio file content
+        :raises ValueError: if the file uses a sample width other than 1, 2
+            or 4 bytes.
+        """
+
+        if self.startswith('http'):
+            import io
+
+            import requests
+
+            resp = requests.get(self)
+            resp.raise_for_status()
+            file = io.BytesIO(resp.content)
+        else:
+            file = self
+
+        # note wave is Python built-in mod. https://docs.python.org/3/library/wave.html
+        with wave.open(file) as ifile:
+            channels = ifile.getnchannels()
+            sample_width = ifile.getsampwidth()
+            frames = ifile.readframes(ifile.getnframes())
+
+        # WAV stores 8-bit PCM as unsigned and wider PCM as signed little-endian;
+        # decode with the width the file declares instead of assuming 16 bit.
+        pcm_dtypes = {1: '<u1', 2: '<i2', 4: '<i4'}
+        if sample_width not in pcm_dtypes:
+            raise ValueError(
+                f'Unsupported sample width of {sample_width} bytes, '
+                f'expected one of {sorted(pcm_dtypes)}.'
+            )
+
+        full_scale = 2 ** (8 * sample_width - 1)
+        audio = np.frombuffer(frames, dtype=pcm_dtypes[sample_width])
+        audio = audio.astype(np.float32)
+        if sample_width == 1:
+            # Re-centre unsigned 8-bit samples around zero.
+            audio -= full_scale
+
+        # Normalise so that values are between -1.0 and +1.0
+        audio /= full_scale
+
+        if channels > 1:
+            # Frames are stored with interleaved channels, so a reshape
+            # de-interleaves them and keeps the float32 dtype of the mono path.
+            return audio.reshape(-1, channels)
+        return audio
Original file line number	Diff line number	Diff line change
Expand Up		@@ -63,6 +63,8 @@ message NodeProto {

		string point_cloud_url = 13;

		string audio_url = 14;


		}

Expand Down