feat: add audio tensors torch and ndarray

Signed-off-by: anna-charlotte <[email protected]>
docarray · JoanFM · Jan 3, 2023 · Dec 14, 2022 · Dec 14, 2022 · Dec 15, 2022
commit bdf8e884094134fcbf6d245eaeb5a3b20370afee
diff --git a/docarray/predefined_document/audio.py b/docarray/predefined_document/audio.py
@@ -1,8 +1,8 @@
-import wave
-from typing import BinaryIO, Optional, TypeVar, Union
+from typing import Optional, TypeVar
 
 from docarray.document import BaseDocument
-from docarray.typing import AudioUrl, Embedding, Tensor
+from docarray.typing import AudioUrl, Embedding
+from docarray.typing.tensor.audio.audio_tensor import AudioTensor
 
 T = TypeVar('T', bound='Audio')
 
@@ -68,36 +68,5 @@ class MultiModalDoc(Document):
     """
 
     url: Optional[AudioUrl]
-    tensor: Optional[Tensor]
+    tensor: Optional[AudioTensor]
     embedding: Optional[Embedding]
-
-    def save_audio_tensor_to_file(
-        self: 'T',
-        file_path: Union[str, BinaryIO],
-        sample_rate: int = 44100,
-        sample_width: int = 2,
-    ) -> None:
-        """Save :attr:`.tensor` into a .wav file. Mono/stereo is preserved.
-
-        :param file_path: if file is a string, open the file by that name, otherwise
-            treat it as a file-like object.
-        :param sample_rate: sampling frequency
-        :param sample_width: sample width in bytes
-        """
-        if self.tensor is None:
-            raise ValueError(
-                'Audio.tensor has not been set, and therefore cannot be saved to file.'
-            )
-
-        # Convert to (little-endian) 16 bit integers.
-        max_int16 = 2**15
-        tensor = (self.tensor * max_int16).astype('<h')
-        n_channels = 2 if self.tensor.ndim > 1 else 1
-
-        with wave.open(file_path, 'w') as f:
-            # 2 Channels.
-            f.setnchannels(n_channels)
-            # 2 bytes per sample.
-            f.setsampwidth(sample_width)
-            f.setframerate(sample_rate)
-            f.writeframes(tensor.tobytes())
diff --git a/docarray/typing/tensor/audio/__init__.py b/docarray/typing/tensor/audio/__init__.py
diff --git a/docarray/typing/tensor/audio/abstract_audio_tensor.py b/docarray/typing/tensor/audio/abstract_audio_tensor.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import BinaryIO, TypeVar, Union
+
+from docarray.typing.tensor.abstract_tensor import AbstractTensor
+
+T = TypeVar('T', bound='AbstractAudioTensor')
+
+
+class AbstractAudioTensor(AbstractTensor, ABC):
+    @abstractmethod
+    def save_audio_tensor_to_file(
+        self: 'T',
+        file_path: Union[str, BinaryIO],
+        sample_rate: int = 44100,
+        sample_width: int = 2,
+    ) -> None:
+        """
+        Save this tensor into a .wav file. Subclasses must provide the implementation.
+
+        :param file_path: if file is a string, open the file by that name, otherwise
+            treat it as a file-like object.
+        :param sample_rate: sampling frequency
+        :param sample_width: sample width in bytes
+        """
+        ...
diff --git a/docarray/typing/tensor/audio/audio_ndarray.py b/docarray/typing/tensor/audio/audio_ndarray.py
@@ -0,0 +1,38 @@
+import wave
+from typing import BinaryIO, TypeVar, Union
+
+from docarray.typing import NdArray
+
+T = TypeVar('T', bound='AudioNdArray')
+
+
+class AudioNdArray(NdArray):
+    """NdArray for audio data that can be saved to a .wav file."""
+
+    def save_audio_tensor_to_file(
+        self: 'T',
+        file_path: Union[str, BinaryIO],
+        sample_rate: int = 44100,
+        sample_width: int = 2,
+    ) -> None:
+        """
+        Save this tensor into a .wav file (mono if 1-D, otherwise two channels).
+
+        :param file_path: if file is a string, open the file by that name, otherwise
+            treat it as a file-like object.
+        :param sample_rate: sampling frequency
+        :param sample_width: sample width in bytes; samples are always encoded as
+            16-bit integers, so only the default of 2 matches the written data
+        """
+        # Scale to little-endian 16-bit integers, saturating at the int16 limits:
+        # without the clip a full-scale sample of 1.0 maps to 32768 and overflows.
+        max_int16 = 2**15
+        tensor = (self * max_int16).clip(-max_int16, max_int16 - 1).astype('<h')
+        # Any tensor with more than one dimension is written as stereo.
+        n_channels = 2 if self.ndim > 1 else 1
+
+        with wave.open(file_path, 'w') as f:
+            f.setnchannels(n_channels)
+            f.setsampwidth(sample_width)
+            f.setframerate(sample_rate)
+            f.writeframes(tensor.tobytes())
diff --git a/docarray/typing/tensor/audio/audio_tensor.py b/docarray/typing/tensor/audio/audio_tensor.py
@@ -0,0 +1,13 @@
+from typing import Union
+
+from docarray.typing.tensor.audio.audio_ndarray import AudioNdArray
+
+try:
+    import torch  # noqa: F401
+except ImportError:  # torch is optional; fall back to the ndarray-only alias
+    AudioTensor = Union[AudioNdArray]
+
+else:
+    from docarray.typing.tensor.audio.audio_torch_tensor import AudioTorchTensor
+
+    AudioTensor = Union[AudioNdArray, AudioTorchTensor]  # type: ignore
diff --git a/docarray/typing/tensor/audio/audio_torch_tensor.py b/docarray/typing/tensor/audio/audio_torch_tensor.py
@@ -0,0 +1,42 @@
+import wave
+from typing import BinaryIO, TypeVar, Union
+
+import numpy as np
+
+from docarray.typing import TorchTensor
+from docarray.typing.tensor.torch_tensor import metaTorchAndNode
+
+T = TypeVar('T', bound='AudioTorchTensor')
+
+
+class AudioTorchTensor(TorchTensor, metaclass=metaTorchAndNode):
+    """TorchTensor for audio data that can be saved to a .wav file."""
+
+    def save_audio_tensor_to_file(
+        self: 'T',
+        file_path: Union[str, BinaryIO],
+        sample_rate: int = 44100,
+        sample_width: int = 2,
+    ) -> None:
+        """
+        Save this tensor into a .wav file (mono if 1-D, otherwise two channels).
+
+        :param file_path: if file is a string, open the file by that name, otherwise
+            treat it as a file-like object.
+        :param sample_rate: sampling frequency
+        :param sample_width: sample width in bytes; samples are always encoded as
+            16-bit integers, so only the default of 2 matches the written data
+        """
+        # wave consumes raw bytes, so go through a detached, CPU-side numpy array.
+        np_self: np.ndarray = self.cpu().detach().numpy()
+
+        # Scale to '<h' (little-endian int16); clip so a sample of 1.0 cannot overflow.
+        max_int16 = 2**15
+        tensor = (np_self * max_int16).clip(-max_int16, max_int16 - 1).astype('<h')
+        n_channels = 2 if np_self.ndim > 1 else 1
+
+        with wave.open(file_path, 'w') as f:
+            f.setnchannels(n_channels)
+            f.setsampwidth(sample_width)
+            f.setframerate(sample_rate)
+            f.writeframes(tensor.tobytes())