-
Notifications
You must be signed in to change notification settings - Fork 235
feat(v2): add audio url and predefined document #940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
bebc9d4
6025c2f
9a599e5
04abdae
f8d700d
d58f804
bdf8e88
6572df8
9cd4baa
b3c1948
7774181
af840d4
797f488
8b48a77
e135438
14fcf6b
c623a13
1be8e3f
17786eb
97355f7
20e2344
7fc06e1
b34d783
130d8ab
2954351
61cb103
5943c0f
131c5ff
83ef649
eecca41
4762c3c
6948122
d174087
3a52303
a0be12e
9623d29
6efdcf2
5026543
703de43
83ece31
de079e2
2ef1350
d51d38e
3901cfa
a571898
71af630
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Signed-off-by: anna-charlotte <[email protected]>
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| from docarray.predefined_document.audio import Audio | ||
| from docarray.predefined_document.image import Image | ||
| from docarray.predefined_document.mesh import Mesh3D | ||
| from docarray.predefined_document.point_cloud import PointCloud3D | ||
| from docarray.predefined_document.text import Text | ||
|
|
||
| __all__ = ['Text', 'Image', 'Mesh3D', 'PointCloud3D'] | ||
| __all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D'] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| import wave | ||
| from typing import BinaryIO, Optional, TypeVar, Union | ||
|
|
||
| from docarray.document import BaseDocument | ||
| from docarray.typing import AudioUrl, Embedding, Tensor | ||
|
|
||
| T = TypeVar('T', bound='Audio') | ||
|
|
||
|
|
||
class Audio(BaseDocument):
    """
    Document for handling audios.

    The Audio Document can contain an AudioUrl (`Audio.url`), a Tensor
    (`Audio.tensor`), and an Embedding (`Audio.embedding`).

    EXAMPLE USAGE:

    You can use this Document directly:

    .. code-block:: python

        from docarray import Audio

        # use it directly
        audio = Audio(url='https://www.kozco.com/tech/piano2.wav')
        audio.tensor = audio.url.load()
        model = MyEmbeddingModel()
        audio.embedding = model(audio.tensor)

    You can extend this Document:

    .. code-block:: python

        from docarray import Audio, Text
        from typing import Optional

        # extend it
        class MyAudio(Audio):
            name: Optional[Text]


        audio = MyAudio(url='https://www.kozco.com/tech/piano2.wav')
        audio.tensor = audio.url.load()
        model = MyEmbeddingModel()
        audio.embedding = model(audio.tensor)
        audio.name = Text(text='my first audio')


    You can use this Document for composition:

    .. code-block:: python

        from docarray import Document, Audio, Text

        # compose it
        class MultiModalDoc(Document):
            audio: Audio
            text: Text


        mmdoc = MultiModalDoc(
            audio=Audio(url='https://www.kozco.com/tech/piano2.wav'),
            text=Text(text='hello world, how are you doing?'),
        )
        mmdoc.audio.tensor = mmdoc.audio.url.load()
    """

    url: Optional[AudioUrl]
    tensor: Optional[Tensor]
    embedding: Optional[Embedding]

    def save_audio_tensor_to_file(
        self: 'T',
        file_path: Union[str, BinaryIO],
        sample_rate: int = 44100,
        sample_width: int = 2,
    ) -> None:
        """Save :attr:`.tensor` into a .wav file. Mono/stereo is preserved.

        The tensor is scaled by ``2**15`` and written as little-endian 16-bit
        integers. Scaled values that fall outside the int16 range (for example
        a full-scale sample of exactly ``+1.0``) are saturated rather than
        being allowed to overflow.

        :param file_path: if file is a string, open the file by that name, otherwise
            treat it as a file-like object.
        :param sample_rate: sampling frequency
        :param sample_width: sample width in bytes, written to the file header.
            The sample data itself is always encoded with 2 bytes per sample,
            so any value other than 2 yields a header that does not match
            the data.
        :raises ValueError: if :attr:`.tensor` has not been set.
        """
        if self.tensor is None:
            raise ValueError(
                'Audio.tensor has not been set, and therefore cannot be saved to file.'
            )

        # Convert to (little-endian) 16 bit integers, clamping first so that
        # out-of-range values saturate instead of overflowing in the cast.
        max_int16 = 2**15
        scaled = self.tensor * max_int16
        tensor = scaled.clip(-max_int16, max_int16 - 1).astype('<h')

        # A 1-D tensor is written as mono; any tensor with more than one
        # dimension is written as stereo.
        n_channels = 2 if self.tensor.ndim > 1 else 1

        with wave.open(file_path, 'w') as f:
            f.setnchannels(n_channels)
            f.setsampwidth(sample_width)
            f.setframerate(sample_rate)
            f.writeframes(tensor.tobytes())
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,6 +63,8 @@ message NodeProto { | |
|
|
||
| string point_cloud_url = 13; | ||
|
|
||
| string audio_url = 14; | ||
|
|
||
|
|
||
| } | ||
|
|
||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,8 @@ | ||
| from docarray.typing.url.any_url import AnyUrl | ||
| from docarray.typing.url.audio_url import AudioUrl | ||
| from docarray.typing.url.image_url import ImageUrl | ||
| from docarray.typing.url.text_url import TextUrl | ||
| from docarray.typing.url.url_3d.mesh_url import Mesh3DUrl | ||
| from docarray.typing.url.url_3d.point_cloud_url import PointCloud3DUrl | ||
|
|
||
| __all__ = ['ImageUrl', 'AnyUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl'] | ||
| __all__ = ['ImageUrl', 'AudioUrl', 'AnyUrl', 'TextUrl', 'Mesh3DUrl', 'PointCloud3DUrl'] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,98 @@ | ||
| from docarray.typing import AnyUrl | ||
| import wave | ||
| from typing import TYPE_CHECKING, TypeVar | ||
|
|
||
| import numpy as np | ||
|
|
||
| from docarray.typing.url.any_url import AnyUrl | ||
|
|
||
| if TYPE_CHECKING: | ||
| from docarray.proto import NodeProto | ||
|
|
||
| T = TypeVar('T', bound='AudioUrl') | ||
|
|
||
|
|
||
class AudioUrl(AnyUrl):
    """
    URL to a .wav file.
    Can be remote (web) URL, or a local file path.
    """

    def _to_node_protobuf(self: T) -> 'NodeProto':
        """Convert this URL into a NodeProto protobuf message. This function should
        be called when the URL is nested into a Document that needs to
        be converted into a protobuf

        :return: the nested item protobuf message
        """
        from docarray.proto import NodeProto

        return NodeProto(audio_url=str(self))

    def load(self: T) -> np.ndarray:
        """
        Load the data from the url into a numpy.ndarray audio tensor.

        EXAMPLE USAGE

        .. code-block:: python

            from docarray import Document
            import numpy as np

            from docarray.typing import AudioUrl


            class MyDoc(Document):
                audio_url: AudioUrl


            doc = MyDoc(audio_url="toydata/hello.wav")

            audio_tensor = doc.audio_url.load()
            assert isinstance(audio_tensor, np.ndarray)

        The raw frames are interpreted as 16-bit integers and divided by
        ``2**15``. A two-channel file is returned with shape
        ``(n_frames, 2)``; every other channel count is returned as a flat
        1-D array of samples in file order.

        :return: float32 np.ndarray representing the audio file content
        """
        if self.startswith('http'):
            import io

            import requests

            resp = requests.get(self)
            resp.raise_for_status()
            # wave.open needs a seekable file-like object for remote content.
            file = io.BytesIO(resp.content)
        else:
            file = self

        # note wave is Python built-in mod. https://docs.python.org/3/library/wave.html
        with wave.open(file) as ifile:
            channels = ifile.getnchannels()
            audio = ifile.readframes(ifile.getnframes())

        # Convert buffer to float32 using NumPy.
        # NOTE(review): the buffer is always read as int16, whatever the
        # file's actual sample width is.
        audio_as_np_int16 = np.frombuffer(audio, dtype=np.int16)
        audio_as_np_float32 = audio_as_np_int16.astype(np.float32)

        # Normalise float32 array so that values are between -1.0 and +1.0
        max_int16 = 2**15
        audio_normalised = audio_as_np_float32 / max_int16

        if channels == 2:
            # Stereo frames are interleaved (L, R, L, R, ...), so a row-major
            # reshape puts each channel in its own column and keeps float32.
            return audio_normalised.reshape(-1, channels)
        return audio_normalised
Uh oh!
There was an error while loading. Please reload this page.