Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a452268
feat: add video url and tensors to proto
Jan 3, 2023
3ccb697
feat: add video url and video ndarray
Jan 3, 2023
dc957d1
feat: add video torch tensor and tests
Jan 4, 2023
fc86920
fix: mypy checks
Jan 4, 2023
8a55e0b
chore: add av to video extra
Jan 4, 2023
5cb098a
fix: allow dim 3
Jan 4, 2023
3ba1f78
test: wip video load and save
Jan 5, 2023
be63926
refactor: move to numpy to computational backend
Jan 6, 2023
395a495
fix: video load and save
Jan 11, 2023
406ec80
test: adjust tests
Jan 11, 2023
091e79a
fix: video load and save and add docstrings
Jan 11, 2023
dee1146
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 11, 2023
e4106a8
fix: fix some imports after merging
Jan 11, 2023
23ee930
docs: add doc strings and fix example urls
Jan 11, 2023
7ab8dbd
docs: small fixes in docs
Jan 11, 2023
ecf01d8
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 11, 2023
5295dd1
refactor: rename save to mp4 file to save
Jan 11, 2023
b3f2ccb
feat: add shape method to comp backend
Jan 16, 2023
20ecf2c
refactor: move validate shape to video tensor mixin
Jan 16, 2023
711d105
refactor: extract private load and make separate methods for frames
Jan 16, 2023
0c9c1fd
fix: use torch shape instead of size method
Jan 16, 2023
e3a465c
fix: add typehint to shape in comp backend
Jan 16, 2023
40eac93
docs: add supported strings for skip type
Jan 16, 2023
a700f30
fix: apply suggestions from code review
Jan 17, 2023
94572fd
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
07ceae8
fix: small change to trigger ci again
Jan 17, 2023
c2e129d
fix: extract shape var
Jan 17, 2023
d50ae67
fix: introduce compbackendinterface
Jan 17, 2023
2e365e6
fix: revert previous pr and fix for mypy
Jan 17, 2023
c44a035
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
95b0b81
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix: video load and save
Signed-off-by: anna-charlotte <[email protected]>
  • Loading branch information
anna-charlotte committed Jan 11, 2023
commit 395a495aaf09e452b9ac7e663dc1f234eaadcbd9
10 changes: 6 additions & 4 deletions docarray/predefined_document/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from docarray.document import BaseDocument
from docarray.typing import AnyTensor, Embedding
from docarray.typing.tensor.audio.audio_tensor import AudioTensor
from docarray.typing.tensor.video.video_tensor import VideoTensor
from docarray.typing.url.video_url import VideoUrl

Expand All @@ -11,9 +12,9 @@
class Video(BaseDocument):
"""
Document for handling video.
The Video Document can contain a VideoUrl (`Video.url`), a VideoTensor
(`Video.tensor`), an AnyTensor (`Video.key_frame_indices`), and an Embedding
(`Video.embedding`).
The Video Document can contain a VideoUrl (`Video.url`), an AudioTensor
(`Video.audio_tensor`), a VideoTensor (`Video.video_tensor`), an AnyTensor
(`Video.key_frame_indices`), and an Embedding (`Video.embedding`).

EXAMPLE USAGE:

Expand All @@ -26,6 +27,7 @@ class Video(BaseDocument):
"""

url: Optional[VideoUrl]
tensor: Optional[VideoTensor]
audio_tensor: Optional[AudioTensor]
video_tensor: Optional[VideoTensor]
key_frame_indices: Optional[AnyTensor]
embedding: Optional[Embedding]
45 changes: 33 additions & 12 deletions docarray/typing/tensor/video/abstract_video_tensor.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,65 @@
from abc import ABC
from typing import BinaryIO, TypeVar, Union
from typing import BinaryIO, Optional, TypeVar, Union

import numpy as np

from docarray.typing.tensor.abstract_tensor import AbstractTensor
from docarray.typing.tensor.audio.audio_tensor import AudioTensor

T = TypeVar('T', bound='AbstractVideoTensor')


class AbstractVideoTensor(AbstractTensor, ABC):
def save_to_file(
def save_to_mp4_file(
self: 'T',
file_path: Union[str, BinaryIO],
frame_rate: int = 24,
codec: str = 'h264',
audio_tensor: Optional[AudioTensor] = None,
video_frame_rate: int = 30,
video_codec: str = 'h264',
audio_frame_rate: int = 48000,
audio_codec: str = 'aac',
audio_format: str = 'fltp',
) -> None:
"""
Save video tensor to a .mp4 file.

:param file_path: path to a .mp4 file. If file is a string, open the file by
that name, otherwise treat it as a file-like object.
:param frame_rate: frames per second.
:param codec: the name of a decoder/encoder.
:param video_frame_rate: frames per second.
:param video_codec: the name of a decoder/encoder.
"""
import av

np_tensor = self.get_comp_backend().to_numpy(array=self) # type: ignore
video_tensor = np_tensor.astype('uint8')
import av

with av.open(file_path, mode='w') as container:
if video_tensor.ndim == 3:
video_tensor = np.expand_dims(video_tensor, axis=0)

stream = container.add_stream(codec, rate=frame_rate)
stream.height = video_tensor.shape[-3]
stream.width = video_tensor.shape[-2]
stream_video = container.add_stream(video_codec, rate=video_frame_rate)
stream_video.height = video_tensor.shape[-3]
stream_video.width = video_tensor.shape[-2]

if audio_tensor is not None:
stream_audio = container.add_stream(audio_codec)
audio_np = audio_tensor.get_comp_backend().to_numpy(array=audio_tensor)
audio_layout = 'stereo' if audio_np.shape[-2] == 2 else 'mono'

for i, audio in enumerate(audio_np):
frame = av.AudioFrame.from_ndarray(
array=audio, format=audio_format, layout=audio_layout
)
frame.rate = audio_frame_rate
for packet in stream_audio.encode(frame):
container.mux(packet)

for vid in video_tensor:
frame = av.VideoFrame.from_ndarray(vid, format='rgb24')
for packet in stream.encode(frame):
for packet in stream_video.encode(frame):
container.mux(packet)

for packet in stream.encode(None):
for packet in stream_audio.encode(None):
container.mux(packet)
for packet in stream_video.encode(None):
container.mux(packet)
38 changes: 23 additions & 15 deletions docarray/typing/url/video_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from pydantic.tools import parse_obj_as

from docarray.typing import AudioNdArray, NdArray
from docarray.typing.tensor.video import VideoNdArray
from docarray.typing.url.any_url import AnyUrl

Expand Down Expand Up @@ -50,19 +51,20 @@ def validate(
return cls(str(url), scheme=None)

def load(
self: T, only_keyframes: bool = False, **kwargs
) -> Union[VideoNdArray, Tuple[VideoNdArray, np.ndarray]]:
self: T, only_keyframes: bool = False, audio_format: str = 'fltp', **kwargs
) -> Union[VideoNdArray, Tuple[AudioNdArray, VideoNdArray, NdArray]]:
"""
Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and
np.ndarray.
Load the data from the url into a VideoNdArray or Tuple of AudioNdArray,
VideoNdArray and NdArray.

:param only_keyframes: if True keep only the keyframes, if False keep all frames
and store the indices of the keyframes in :attr:`.tags`
:param kwargs: supports all keyword arguments that are being supported by
av.open() as described in:
https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open
:return: np.ndarray representing the audio file content, list of key frame
indices if only_keyframe False.
:return: AudioNdArray representing the audio content, VideoNdArray representing
    the images of the video, and NdArray of key frame indices if
    only_keyframes is False; otherwise only a VideoNdArray representing the
    keyframes.
"""
import av

Expand All @@ -71,19 +73,25 @@ def load(
stream = container.streams.video[0]
stream.codec_context.skip_frame = 'NONKEY'

frames = []
audio_frames = []
video_frames = []
keyframe_indices = []

for i, frame in enumerate(container.decode(video=0)):
for frame in container.decode():
if type(frame) == av.audio.frame.AudioFrame:
audio_frames.append(frame.to_ndarray(format=audio_format))
elif type(frame) == av.video.frame.VideoFrame:
video_frames.append(frame.to_ndarray(format='rgb24'))

frame_np = frame.to_ndarray(format='rgb24')
frames.append(frame_np)
if not only_keyframes and frame.key_frame == 1:
keyframe_indices.append(i)
if not only_keyframes and frame.key_frame == 1:
curr_index = len(video_frames)
keyframe_indices.append(curr_index)

frames_vid = parse_obj_as(VideoNdArray, np.stack(frames))
video = parse_obj_as(VideoNdArray, np.stack(video_frames))

if only_keyframes:
return frames_vid
return video
else:
return frames_vid, np.array(keyframe_indices)
audio = parse_obj_as(AudioNdArray, np.stack(audio_frames))
indices = parse_obj_as(NdArray, keyframe_indices)
return audio, video, indices