Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a452268
feat: add video url and tensors to proto
Jan 3, 2023
3ccb697
feat: add video url and video ndarray
Jan 3, 2023
dc957d1
feat: add video torch tensor and tests
Jan 4, 2023
fc86920
fix: mypy checks
Jan 4, 2023
8a55e0b
chore: add av to video extra
Jan 4, 2023
5cb098a
fix: allow dim 3
Jan 4, 2023
3ba1f78
test: wip video load and save
Jan 5, 2023
be63926
refactor: move to numpy to computational backend
Jan 6, 2023
395a495
fix: video load and save
Jan 11, 2023
406ec80
test: adjust tests
Jan 11, 2023
091e79a
fix: video load and save and add docstrings
Jan 11, 2023
dee1146
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 11, 2023
e4106a8
fix: fix some imports after merging
Jan 11, 2023
23ee930
docs: add doc strings and fix example urls
Jan 11, 2023
7ab8dbd
docs: small fixes in docs
Jan 11, 2023
ecf01d8
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 11, 2023
5295dd1
refactor: rename save to mp4 file to save
Jan 11, 2023
b3f2ccb
feat: add shape method to comp backend
Jan 16, 2023
20ecf2c
refactor: move validate shape to video tensor mixin
Jan 16, 2023
711d105
refactor: extract private load and make separate methods for frames
Jan 16, 2023
0c9c1fd
fix: use torch shape instead of size method
Jan 16, 2023
e3a465c
fix: add typehint to shape in comp backend
Jan 16, 2023
40eac93
docs: add supported strings for skip type
Jan 16, 2023
a700f30
fix: apply suggestions from code review
Jan 17, 2023
94572fd
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
07ceae8
fix: small change to trigger ci again
Jan 17, 2023
c2e129d
fix: extract shape var
Jan 17, 2023
d50ae67
fix: introduce compbackendinterface
Jan 17, 2023
2e365e6
fix: revert previous pr and fix for mypy
Jan 17, 2023
c44a035
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
95b0b81
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-add-v…
Jan 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: add video torch tensor and tests
Signed-off-by: anna-charlotte <[email protected]>
  • Loading branch information
anna-charlotte committed Jan 4, 2023
commit dc957d19bc20bf7072f0dda8591b5de1d668406e
3 changes: 2 additions & 1 deletion docarray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from docarray.array.array import DocumentArray
from docarray.document.document import BaseDocument
from docarray.predefined_document import Audio, Image, Mesh3D, PointCloud3D, Text
from docarray.predefined_document import Audio, Image, Mesh3D, PointCloud3D, Text, Video

__all__ = [
'BaseDocument',
Expand All @@ -12,4 +12,5 @@
'Text',
'Mesh3D',
'PointCloud3D',
'Video',
]
3 changes: 2 additions & 1 deletion docarray/predefined_document/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
from docarray.predefined_document.mesh import Mesh3D
from docarray.predefined_document.point_cloud import PointCloud3D
from docarray.predefined_document.text import Text
from docarray.predefined_document.video import Video

__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D']
__all__ = ['Text', 'Image', 'Audio', 'Mesh3D', 'PointCloud3D', 'Video']
31 changes: 31 additions & 0 deletions docarray/predefined_document/video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Optional, TypeVar

from docarray.document import BaseDocument
from docarray.typing import AnyTensor, Embedding
from docarray.typing.tensor.video.video_tensor import VideoTensor
from docarray.typing.url.video_url import VideoUrl

T = TypeVar('T', bound='Video')


class Video(BaseDocument):
    """
    Document for handling video.
    The Video Document can contain a VideoUrl (`Video.url`), a VideoTensor
    (`Video.tensor`), an AnyTensor (`Video.key_frame_indices`), and an Embedding
    (`Video.embedding`).

    EXAMPLE USAGE:

    TODO(review): the example sections below are placeholders — fill them in.

    You can use this Document directly:

    You can extend this Document:

    You can use this Document for composition:

    """

    # remote or local location of the video file
    url: Optional[VideoUrl]
    # the video frames, e.g. the first value returned by `url.load()`
    tensor: Optional[VideoTensor]
    # indices of the key frames within `tensor` (second value returned by
    # `url.load()` when `only_keyframes` is False)
    key_frame_indices: Optional[AnyTensor]
    # embedding vector representing the whole video
    embedding: Optional[Embedding]
12 changes: 12 additions & 0 deletions docarray/typing/tensor/video/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Public entry point for video tensor types. VideoNdArray is always available;
# VideoTorchTensor is exported only when the optional torch dependency is
# installed.
from docarray.typing.tensor.video.video_ndarray import VideoNdArray

__all__ = ['VideoNdArray']

try:
    import torch  # noqa: F401
except ImportError:
    # torch is optional: silently skip the torch-backed tensor type
    pass
else:
    from docarray.typing.tensor.video.video_torch_tensor import VideoTorchTensor  # noqa

    __all__.extend(['VideoTorchTensor'])
65 changes: 65 additions & 0 deletions docarray/typing/tensor/video/abstract_video_tensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from abc import ABC, abstractmethod
from typing import BinaryIO, Dict, Generator, Optional, Tuple, Type, TypeVar, Union

import numpy as np

from docarray.typing.tensor.abstract_tensor import AbstractTensor

T = TypeVar('T', bound='AbstractVideoTensor')


class AbstractVideoTensor(AbstractTensor, ABC):
    """Abstract mixin that adds video-specific behaviour to a tensor type."""

    @abstractmethod
    def to_numpy(self) -> np.ndarray:
        """
        Convert video tensor to numpy.ndarray.
        """
        ...

    def save_to_file(
        self: 'T',
        file_path: Union[str, BinaryIO],
        frame_rate: int = 30,
        codec: str = 'h264',
    ) -> None:
        """
        Save video tensor to a video file using the `av` library.

        NOTE(review): a previous version of this docstring said '.wav file /
        Mono/stereo is preserved' — that was copy-pasted from the audio mixin;
        this method encodes video frames into a video container (e.g. .mp4).

        :param file_path: path of the output file. If file is a string, open the
            file by that name, otherwise treat it as a file-like object.
        :param frame_rate: frames per second.
        :param codec: the name of a decoder/encoder.
        """
        np_tensor = self.to_numpy()

        # Clip to the 8-bit range and swap axes 1 and 2 before encoding.
        # NOTE(review): stream.width/height below are taken from the
        # *un-swapped* np_tensor while frames come from the swapped
        # video_tensor — assumes input shape (frames, dim1, dim2, 3);
        # confirm the intended frame layout.
        video_tensor = np.moveaxis(np.clip(np_tensor, 0, 255), 1, 2).astype('uint8')

        import av  # imported lazily: av is an optional dependency

        with av.open(file_path, mode='w') as container:
            stream = container.add_stream(codec, rate=frame_rate)
            stream.width = np_tensor.shape[1]
            stream.height = np_tensor.shape[2]
            stream.pix_fmt = 'yuv420p'

            # encode frame by frame; each frame is an (H, W, 3) uint8 RGB array
            for b in video_tensor:
                frame = av.VideoFrame.from_ndarray(b, format='rgb24')
                for packet in stream.encode(frame):
                    container.mux(packet)

            # flush any packets still buffered inside the encoder
            for packet in stream.encode():
                container.mux(packet)

    @classmethod
    def generator_from_webcam(
        cls: Type['T'],
        height_width: Optional[Tuple[int, int]] = None,
        show_window: bool = True,
        window_title: str = 'webcam',
        fps: int = 30,
        exit_key: int = 27,
        exit_event=None,
        tags: Optional[Dict] = None,
    ) -> Generator['T', None, None]:
        """Yield video tensors captured from a webcam.

        NOTE(review): stub — no implementation in this commit.
        """
        ...
54 changes: 19 additions & 35 deletions docarray/typing/tensor/video/video_ndarray.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TypeVar
from typing import TYPE_CHECKING, Any, List, Tuple, Type, TypeVar, Union

import numpy as np

Expand All @@ -7,53 +7,37 @@

T = TypeVar('T', bound='VideoNdArray')

if TYPE_CHECKING:
from pydantic import BaseConfig
from pydantic.fields import ModelField


class VideoNdArray(AbstractVideoTensor, NdArray):
"""
Subclass of NdArray, to represent a video tensor.

Additionally, this allows storing such a tensor as a .wav audio file.
Adds video-specific features to the tensor.

EXAMPLE USAGE

.. code-block:: python

from typing import Optional
from pydantic import parse_obj_as
from docarray import Document
from docarray.typing import AudioNdArray, AudioUrl
import numpy as np


class MyAudioDoc(Document):
title: str
audio_tensor: Optional[AudioNdArray]
url: Optional[AudioUrl]


# from tensor
doc_1 = MyAudioDoc(
title='my_first_audio_doc',
audio_tensor=np.random.rand(1000, 2),
)
doc_1.audio_tensor.save_to_wav_file(file_path='path/to/file_1.wav')
# from url
doc_2 = MyAudioDoc(
title='my_second_audio_doc',
url='https://github.com/docarray/docarray/tree/feat-add-audio-v2/tests/toydata/hello.wav',
)
doc_2.audio_tensor = parse_obj_as(AudioNdArray, doc_2.url.load())
doc_2.audio_tensor.save_to_wav_file(file_path='path/to/file_2.wav')
"""

_PROTO_FIELD_NAME = 'video_ndarray'

def check_shape(self) -> None:
if self.ndim != 4 or self.shape[-1] != 3 or self.dtype != np.uint8:
@classmethod
def validate(
cls: Type[T],
value: Union[T, np.ndarray, List[Any], Tuple[Any], Any],
field: 'ModelField',
config: 'BaseConfig',
) -> T:
array = super().validate(value=value, field=field, config=config)
if array.ndim not in [3, 4] or array.shape[-1] != 3:
raise ValueError(
f'expects `` with dtype=uint8 and ndim=4 and the last dimension is 3, '
f'but receiving {self.shape} in {self.dtype}'
f'Expects tensor with 3 or 4 dimensions and the last dimension equal'
f' to 3, but received {array.shape} in {array.dtype}'
)
else:
return array

def to_numpy(self) -> np.ndarray:
return self
13 changes: 13 additions & 0 deletions docarray/typing/tensor/video/video_tensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Union

from docarray.typing.tensor.video.video_ndarray import VideoNdArray

try:
    import torch  # noqa: F401
except ImportError:
    # torch not installed: VideoTensor is just the numpy-backed type
    VideoTensor = VideoNdArray

else:
    from docarray.typing.tensor.video.video_torch_tensor import VideoTorchTensor

    # torch available: VideoTensor accepts either backend
    VideoTensor = Union[VideoNdArray, VideoTorchTensor]  # type: ignore
43 changes: 43 additions & 0 deletions docarray/typing/tensor/video/video_torch_tensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import TYPE_CHECKING, Any, List, Tuple, Type, TypeVar, Union

import numpy as np

from docarray.typing.tensor.torch_tensor import TorchTensor, metaTorchAndNode
from docarray.typing.tensor.video.abstract_video_tensor import AbstractVideoTensor

T = TypeVar('T', bound='VideoTorchTensor')

if TYPE_CHECKING:
from pydantic import BaseConfig
from pydantic.fields import ModelField


class VideoTorchTensor(AbstractVideoTensor, TorchTensor, metaclass=metaTorchAndNode):
    """
    Subclass of TorchTensor, to represent a video tensor.
    Adds video-specific features to the tensor.

    EXAMPLE USAGE

    """

    _PROTO_FIELD_NAME = 'video_torch_tensor'

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[T, np.ndarray, List[Any], Tuple[Any], Any],
        field: 'ModelField',
        config: 'BaseConfig',
    ) -> T:
        # Run the generic TorchTensor validation first, then enforce the
        # video-specific shape contract: 3 or 4 dims with a trailing
        # channel dimension of size 3 (RGB).
        validated = super().validate(value=value, field=field, config=config)
        shape_ok = validated.ndim in (3, 4) and validated.shape[-1] == 3
        if not shape_ok:
            raise ValueError(
                f'Expects tensor with 3 or 4 dimensions and the last dimension '
                f'equal to 3, but received {validated.shape} in {validated.dtype}'
            )
        return validated

    def to_numpy(self) -> np.ndarray:
        """Detach the tensor, move it to CPU, and return it as a numpy array."""
        return self.cpu().detach().numpy()
13 changes: 6 additions & 7 deletions docarray/typing/url/video_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,17 @@ def validate(
return cls(str(url), scheme=None)

def load(
self: T, only_keyframes: bool = False, **kwargs
) -> Union[VideoNdArray, Tuple[VideoNdArray, VideoNdArray]]:
self: T, only_keyframes: bool = False, dtype: str = 'int32', **kwargs
) -> Union[VideoNdArray, Tuple[VideoNdArray, np.ndarray]]:
"""
Load the data from the url into a numpy.ndarray.
Load the data from the url into a VideoNdArray or Tuple of VideoNdArray and
np.ndarray.



:param only_keyframes: if True keep only the keyframes, if False keep all frames
and store the indices of the keyframes in :attr:`.tags`
:param dtype: Data-type of the returned array; default: int32.
:param kwargs: supports all keyword arguments that are being supported by
av.open() as described in:
https://pyav.org/docs/stable/api/_globals.html?highlight=open#av.open
Expand Down Expand Up @@ -86,7 +88,4 @@ def load(
if only_keyframes:
return frames
else:
indices = parse_obj_as(
VideoNdArray, np.ndarray(keyframe_indices, dtype=np.int32)
)
return frames, indices
return frames, np.ndarray(keyframe_indices, dtype=dtype)
43 changes: 43 additions & 0 deletions tests/integrations/predefined_document/test_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os

import numpy as np
import pytest

from docarray import Video
from docarray.typing import VideoNdArray
from tests import TOYDATA_DIR

LOCAL_VIDEO_FILE = str(TOYDATA_DIR / 'mov_bbb.mp4')
REMOTE_VIDEO_FILE = 'https://github.com/docarray/docarray/blob/feat-rewrite-v2/tests/toydata/mov_bbb.mp4?raw=true' # noqa: E501


@pytest.mark.slow
@pytest.mark.internet
@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE])
def test_video(file_url):
    # Loading from a url yields a VideoNdArray of frames plus key-frame indices.
    video = Video(url=file_url)
    # load() returns (frames, key_frame_indices) when only_keyframes is False
    video.tensor, video.key_frame_indices = video.url.load()

    assert isinstance(video.tensor, np.ndarray)
    assert isinstance(video.tensor, VideoNdArray)
    assert isinstance(video.key_frame_indices, np.ndarray)


@pytest.mark.slow
@pytest.mark.internet
@pytest.mark.parametrize('file_url', [LOCAL_VIDEO_FILE, REMOTE_VIDEO_FILE])
def test_save_video_ndarray(file_url, tmpdir):
    # Round-trip: load a video, save its tensor to an mp4 file, reload it,
    # and check the reloaded frames match.
    tmp_file = str(tmpdir / 'tmp.mp4')

    video = Video(url=file_url)
    video.tensor, _ = video.url.load()

    assert isinstance(video.tensor, np.ndarray)
    assert isinstance(video.tensor, VideoNdArray)

    video.tensor.save_to_file(tmp_file)
    assert os.path.isfile(tmp_file)

    video_from_file = Video(url=tmp_file)
    # BUG FIX: url.load() returns a (frames, key_frame_indices) tuple by
    # default — the original assigned the whole tuple to .tensor, which made
    # the np.allclose below compare a tensor against a tuple. Unpack it,
    # matching the pattern used above.
    video_from_file.tensor, _ = video_from_file.url.load()
    assert np.allclose(video.tensor, video_from_file.tensor)
Loading