Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
943d636
feat: display mesh and pointcloud
Feb 9, 2023
e9acd12
chore: update poetry
Feb 9, 2023
c91ab6c
fix: mypy
Feb 9, 2023
e979c32
fix: add display from param to mesh and pc display
Feb 9, 2023
21fb8ab
fix: clean up
Feb 9, 2023
75b33d8
fix: mypy
Feb 9, 2023
7890194
fix: move display from url to mesh and pc url classes
Feb 10, 2023
0129c3a
chore: remove pyglet dependency
Feb 10, 2023
322a718
chore: update pyproject toml
Feb 10, 2023
05d8461
refactor: copy is notebook function from hubble sdk
Feb 10, 2023
0244816
fix: introduce vertices and faces doc
Feb 10, 2023
255795c
fix: introduce points and colors class for point cloud
Feb 10, 2023
db42712
fix: mypy and tests
Feb 10, 2023
dca04ce
docs: add display example to docs
Feb 10, 2023
788f834
fix: apply johannes suggestion from review
Feb 10, 2023
57fb1e1
fix: apply samis suggestion
Feb 10, 2023
3a8dc5e
docs: update docstring
Feb 14, 2023
38f771d
fix: only display in notebook
Feb 15, 2023
8c31318
docs: update docstring
Feb 15, 2023
bbef411
chore: get poetry lock file from feat rewrite v2
Feb 15, 2023
b376ddd
docs: update docstrings
Feb 15, 2023
016760d
feat: display image from img url and img tensor
Feb 10, 2023
f55d811
fix: display from image url and from image tensor
Feb 13, 2023
0a23b52
fix: use is notebook from utils instead of hubble
Feb 13, 2023
084921b
feat: audio from url
Feb 14, 2023
a949033
feat: display video and add pydub to pyproject toml
Feb 15, 2023
62b58ee
wip: remove non notebook
Feb 15, 2023
442dd29
fix: all except video tensor
Feb 16, 2023
b486b96
fix: mypy check for ipython display
Feb 16, 2023
f77d216
fix: mypy check for ipython display
Feb 16, 2023
ca948f9
feat: add videobytes
Feb 16, 2023
0dc606c
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-displ…
Feb 16, 2023
f2c0cff
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-displ…
Feb 16, 2023
48a85ba
chore: poetry lock
Feb 16, 2023
9df9df5
fix: clean up
Feb 16, 2023
e783c14
Merge branch 'feat-rewrite-v2' into feat-display-img-audio-vid
Feb 17, 2023
fe9d3bf
fix: mypy check
Feb 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: add videobytes
Signed-off-by: anna-charlotte <[email protected]>
  • Loading branch information
anna-charlotte committed Feb 16, 2023
commit ca948f90f018047e81e4bccd28a0c3dc263075a1
112 changes: 112 additions & 0 deletions docarray/typing/bytes/video_bytes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from io import BytesIO
from typing import TYPE_CHECKING, Any, NamedTuple, Type, TypeVar

import numpy as np
from pydantic import parse_obj_as
from pydantic.validators import bytes_validator

from docarray.typing import AudioNdArray, NdArray, VideoNdArray
from docarray.typing.abstract_type import AbstractType
from docarray.typing.proto_register import _register_proto

if TYPE_CHECKING:
from pydantic.fields import BaseConfig, ModelField

from docarray.proto import NodeProto

T = TypeVar('T', bound='VideoBytes')


class VideoLoadResult(NamedTuple):
    """Named result of loading a video: frames, soundtrack and key frame indices."""

    # stacked video frames; decoded as 'rgb24', so presumably
    # (n_frames, height, width, 3) — TODO confirm against av.to_ndarray
    video: VideoNdArray
    # stacked audio frames extracted from the video's soundtrack
    audio: AudioNdArray
    # indices into `video` marking the container's key frames
    key_frame_indices: NdArray


@_register_proto(proto_type_name='video_bytes')
class VideoBytes(bytes, AbstractType):
    """
    Bytes that store a video and that can be loaded into a video tensor.
    """

    @classmethod
    def validate(
        cls: Type[T],
        value: Any,
        field: 'ModelField',
        config: 'BaseConfig',
    ) -> T:
        # Delegate to pydantic's bytes validator, then wrap the result in
        # this subclass so downstream code sees a VideoBytes instance.
        value = bytes_validator(value)
        return cls(value)

    @classmethod
    def from_protobuf(cls: Type[T], pb_msg: T) -> T:
        """Read a VideoBytes instance from a protobuf message."""
        return parse_obj_as(cls, pb_msg)

    def _to_node_protobuf(self: T) -> 'NodeProto':
        """Pack these bytes into a NodeProto blob field."""
        from docarray.proto import NodeProto

        return NodeProto(blob=self, type=self._proto_type_name)

    def load(self, **kwargs) -> VideoLoadResult:
        """
        Load the video from the bytes into a VideoLoadResult object consisting of a
        VideoNdArray (`VideoLoadResult.video`), an AudioNdArray
        (`VideoLoadResult.audio`) and an NdArray containing the key frame indices
        (`VideoLoadResult.key_frame_indices`).

        EXAMPLE USAGE

        .. code-block:: python

            from docarray import BaseDocument
            from docarray.typing import VideoUrl
            import numpy as np


            class MyDoc(BaseDocument):
                video_url: VideoUrl


            doc = MyDoc(
                video_url="https://upload.wikimedia.org/wikipedia/commons/8/80/"
                "Dag_Sebastian_Ahlander_at_G%C3%B6teborg_Book_Fair_2012b.jpg"
            )

            video, audio, key_frame_indices = doc.video_url.load()
            assert isinstance(video, np.ndarray)
            assert isinstance(audio, np.ndarray)
            assert isinstance(key_frame_indices, np.ndarray)

        :param kwargs: supports all keyword arguments that are being supported by
            av.open() as described in:
            https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open
        :return: a VideoLoadResult instance with video, audio and keyframe indices
        """
        import av

        with av.open(BytesIO(self), **kwargs) as container:
            audio_frames = []
            video_frames = []
            keyframe_indices = []

            for frame in container.decode():
                if isinstance(frame, av.audio.frame.AudioFrame):
                    audio_frames.append(frame.to_ndarray())
                elif isinstance(frame, av.video.frame.VideoFrame):
                    if frame.key_frame == 1:
                        # Record the index BEFORE appending so it is the
                        # 0-based position of this key frame; recording after
                        # the append would point one frame past the key frame.
                        keyframe_indices.append(len(video_frames))
                    video_frames.append(frame.to_ndarray(format='rgb24'))

        if not audio_frames:
            # np.stack raises on an empty sequence; build an empty array instead
            # so videos without a soundtrack still load.
            audio = parse_obj_as(AudioNdArray, np.array(audio_frames))
        else:
            audio = parse_obj_as(AudioNdArray, np.stack(audio_frames))

        video = parse_obj_as(VideoNdArray, np.stack(video_frames))
        indices = parse_obj_as(NdArray, keyframe_indices)

        return VideoLoadResult(video=video, audio=audio, key_frame_indices=indices)
55 changes: 50 additions & 5 deletions docarray/typing/tensor/video/video_tensor_mixin.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import abc
from typing import BinaryIO, Optional, Type, TypeVar, Union
import warnings
from io import BytesIO
from typing import Optional, Type, TypeVar, Union

import numpy as np

from docarray.typing.tensor.abstract_tensor import AbstractTensor
from docarray.typing.tensor.audio.audio_tensor import AudioTensor
from docarray.utils.misc import is_notebook

T = TypeVar('T', bound='AbstractTensor')

Expand All @@ -24,7 +27,7 @@ def validate_shape(cls: Type['T'], value: 'T') -> 'T':

def save(
self: 'T',
file_path: Union[str, BinaryIO],
file_path: Union[str, BytesIO],
audio_tensor: Optional[AudioTensor] = None,
video_frame_rate: int = 24,
video_codec: str = 'h264',
Expand Down Expand Up @@ -77,7 +80,7 @@ class MyDoc(BaseDocument):
np_tensor = self.get_comp_backend().to_numpy(array=self)
video_tensor = np_tensor.astype('uint8')

with av.open(file_path, mode='w') as container:
with av.open(file_path, mode='w', format='mp4') as container:
if video_tensor.ndim == 3:
video_tensor = np.expand_dims(video_tensor, axis=0)

Expand Down Expand Up @@ -110,8 +113,50 @@ class MyDoc(BaseDocument):
for packet in stream_video.encode(None):
container.mux(packet)

def display(self) -> None:
def to_bytes(
self: 'T',
audio_tensor: Optional[AudioTensor] = None,
video_frame_rate: int = 24,
video_codec: str = 'h264',
audio_frame_rate: int = 48000,
audio_codec: str = 'aac',
audio_format: str = 'fltp',
) -> bytes:
"""
Convert video tensor to bytes.

:param audio_tensor: AudioTensor containing the video's soundtrack.
:param video_frame_rate: video frames per second.
:param video_codec: the name of a video decoder/encoder.
:param audio_frame_rate: audio frames per second.
:param audio_codec: the name of an audio decoder/encoder.
:param audio_format: the name of one of the audio formats supported by PyAV,
such as 'flt', 'fltp', 's16' or 's16p'.

:return: bytes
"""
bytes = BytesIO()
self.save(
Copy link
Contributor Author

@anna-charlotte anna-charlotte Feb 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here mypy complains because "T" has no attribute "save". Why does it no complain though in line 164 (b = self.to_bytes(audio_tensor=audio)), isn't to_bytes() just a method of VideoTensorMixin like save()?

file_path=bytes,
audio_tensor=audio_tensor,
video_frame_rate=video_frame_rate,
video_codec=video_codec,
audio_frame_rate=audio_frame_rate,
audio_codec=audio_codec,
audio_format=audio_format,
)
return bytes.getvalue()

def display(self, audio: Optional[AudioTensor] = None) -> None:
"""
Display video data from tensor in notebook.

:param audio: sound to play with video tensor
"""
raise NotImplementedError
if is_notebook():
from IPython.display import Video, display

b = self.to_bytes(audio_tensor=audio)
display(Video(data=b, embed=True, mimetype='video/mp4'))
else:
warnings.warn('Display of video is only possible in a notebook.')
48 changes: 9 additions & 39 deletions docarray/typing/url/video_url.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import warnings
from typing import TYPE_CHECKING, Any, NamedTuple, Type, TypeVar, Union
from typing import TYPE_CHECKING, Any, Type, TypeVar, Union

import numpy as np
from pydantic.tools import parse_obj_as

from docarray.typing.bytes.video_bytes import VideoLoadResult
from docarray.typing.proto_register import _register_proto
from docarray.typing.tensor.audio.audio_ndarray import AudioNdArray
from docarray.typing.tensor.ndarray import NdArray
from docarray.typing.tensor.video import VideoNdArray
from docarray.typing.url.any_url import AnyUrl
from docarray.utils.misc import is_notebook

Expand All @@ -20,12 +17,6 @@
VIDEO_FILE_FORMATS = ['mp4']


class VideoLoadResult(NamedTuple):
video: VideoNdArray
audio: AudioNdArray
key_frame_indices: NdArray


@_register_proto(proto_type_name='video_url')
class VideoUrl(AnyUrl):
"""
Expand Down Expand Up @@ -106,46 +97,25 @@ class MyDoc(BaseDocument):
assert isinstance(key_frame_indices, NdArray)

"""
import av

with av.open(self, **kwargs) as container:
audio_frames = []
video_frames = []
keyframe_indices = []

for frame in container.decode():
if type(frame) == av.audio.frame.AudioFrame:
audio_frames.append(frame.to_ndarray())
elif type(frame) == av.video.frame.VideoFrame:
video_frames.append(frame.to_ndarray(format='rgb24'))

if frame.key_frame == 1:
curr_index = len(video_frames)
keyframe_indices.append(curr_index)

if len(audio_frames) == 0:
audio = parse_obj_as(AudioNdArray, np.array(audio_frames))
else:
audio = parse_obj_as(AudioNdArray, np.stack(audio_frames))

video = parse_obj_as(VideoNdArray, np.stack(video_frames))
indices = parse_obj_as(NdArray, keyframe_indices)
from docarray.typing.bytes.video_bytes import VideoBytes

return VideoLoadResult(video=video, audio=audio, key_frame_indices=indices)
buffer = VideoBytes(self.load_bytes(**kwargs))
return buffer.load()

def display(self):
"""
Play video from url in notebook.
"""
if is_notebook():
remote_url = True if self.startswith('http') else False

from IPython.display import display

remote_url = True if self.startswith('http') else False

if remote_url:
from IPython.display import Video

display(Video(data=self))
b = self.load_bytes()
display(Video(data=b, embed=True, mimetype='video/mp4'))
else:
import os

Expand Down
14 changes: 14 additions & 0 deletions tests/units/typing/tensor/test_video_tensor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from io import BytesIO

import numpy as np
import pytest
Expand Down Expand Up @@ -130,6 +131,19 @@ def test_save_video_tensor_to_file(video_tensor, tmpdir):
assert os.path.isfile(tmp_file)


@pytest.mark.parametrize(
    'video_tensor',
    [
        parse_obj_as(VideoTorchTensor, torch.zeros(1, 224, 224, 3)),
        parse_obj_as(VideoNdArray, np.zeros((1, 224, 224, 3))),
    ],
)
def test_save_video_tensor_to_bytes(video_tensor, tmpdir):
    """Saving into an in-memory buffer should produce non-empty encoded data."""
    b = BytesIO()
    video_tensor.save(b)
    # The original line was `isinstance(b, BytesIO)` with no `assert` — a
    # no-op expression that is vacuously true anyway, since `b` was just
    # constructed as a BytesIO. Assert on the actual encoded output instead.
    assert len(b.getvalue()) > 0


@pytest.mark.tensorflow
def test_save_video_tensorflow_tensor_to_file(tmpdir):
tmp_file = str(tmpdir / 'tmp.mp4')
Expand Down