docarray · anna-charlotte · Mar 3, 2023 · Feb 28, 2023 · Feb 28, 2023 · Feb 28, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -117,7 +117,7 @@ jobs:
       - name: Test
         id: test
         run: |
-          poetry run pytest -m "not tensorflow" ${{ matrix.test-path }}
+          poetry run pytest -m "not (tensorflow or benchmark)" ${{ matrix.test-path }}
         timeout-minutes: 30
 #        env:
 #          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
@@ -162,7 +162,7 @@ jobs:
       - name: Test
         id: test
         run: |
-          poetry run pytest -m "not tensorflow" ${{ matrix.test-path }}
+          poetry run pytest -m "not (tensorflow or benchmark)" ${{ matrix.test-path }}
         timeout-minutes: 30
 
 
@@ -222,10 +222,35 @@ jobs:
           poetry run pytest -m 'tensorflow' tests
         timeout-minutes: 30
 
+  docarray-test-benchmarks:
+    needs: [lint-ruff, check-black, import-test]
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.7]
+    steps:
+      - uses: actions/checkout@v2.5.0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Prepare environment
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install poetry
+          poetry install --all-extras
+
+      - name: Test
+        id: test
+        run: |
+          poetry run pytest -m 'benchmark' tests
+        timeout-minutes: 30
+
 
   # just for blocking the merge until all parallel core-test are successful
   success-all-test:
-    needs: [docarray-test, docarray-test-proto3, docarray-test-tensorflow,  import-test, check-black, check-mypy, lint-ruff]
+    needs: [docarray-test, docarray-test-proto3, docarray-test-tensorflow, docarray-test-benchmarks, import-test, check-black, check-mypy, lint-ruff]
     if: always()
     runs-on: ubuntu-latest
     steps:

diff --git a/docarray/array/abstract_array.py b/docarray/array/abstract_array.py
@@ -1,17 +1,23 @@
+import random
 from abc import abstractmethod
 from typing import (
     TYPE_CHECKING,
     Any,
     Dict,
+    Generator,
     Generic,
+    Iterable,
     List,
     Sequence,
     Type,
     TypeVar,
     Union,
     cast,
+    overload,
 )
 
+import numpy as np
+
 from docarray.base_document import BaseDocument
 from docarray.display.document_array_summary import DocumentArraySummary
 from docarray.typing import NdArray
@@ -24,6 +30,7 @@
 
 T = TypeVar('T', bound='AnyDocumentArray')
 T_doc = TypeVar('T_doc', bound=BaseDocument)
+IndexIterType = Union[slice, Iterable[int], Iterable[bool], None]
 
 
 class AnyDocumentArray(Sequence[T_doc], Generic[T_doc], AbstractType):
@@ -79,6 +86,30 @@ def _setter(self, value):
 
         return cls.__typed_da__[cls][item]
 
+    @overload
+    def __getitem__(self: T, item: int) -> T_doc:
+        ...
+
+    @overload
+    def __getitem__(self: T, item: IndexIterType) -> T:
+        ...
+
+    @abstractmethod
+    def __getitem__(self, item: Union[int, IndexIterType]) -> Union[T_doc, T]:
+        ...
+
+    @overload
+    def __setitem__(self: T, key: int, value: T_doc):
+        ...
+
+    @overload
+    def __setitem__(self: T, key: IndexIterType, value: T):
+        ...
+
+    @abstractmethod
+    def __setitem__(self: T, key: Union[int, IndexIterType], value: Union[T, T_doc]):
+        ...
+
     @abstractmethod
     def _get_array_attribute(
         self: T,
@@ -249,3 +280,39 @@ def summary(self):
         Document type.
         """
         DocumentArraySummary(self).summary()
+
+    def _batch(
+        self: T,
+        batch_size: int,
+        shuffle: bool = False,
+        show_progress: bool = False,
+    ) -> Generator[T, None, None]:
+        """
+        Creates a `Generator` that yields `DocumentArray`s of size `batch_size`.
+        Note that the last batch might be smaller than `batch_size`.
+
+        :param batch_size: Size of each generated batch; must be a positive integer.
+        :param shuffle: If set, shuffle the Documents before dividing into minibatches.
+        :param show_progress: If set, show a progress bar when batching documents.
+        :yield: `DocumentArray` batches of length `batch_size` (last may be shorter).
+        """
+        from rich.progress import track
+
+        if not (isinstance(batch_size, int) and batch_size > 0):
+            raise ValueError(
+                f'`batch_size` should be a positive integer, received: {batch_size}'
+            )
+
+        N = len(self)
+        indices = list(range(N))
+        n_batches = int(np.ceil(N / batch_size))  # round up: last batch may be partial
+
+        if shuffle:
+            random.shuffle(indices)  # reorders the index list only, not `self`
+
+        for i in track(
+            range(n_batches),
+            description='Batching documents',
+            disable=not show_progress,
+        ):
+            yield self[indices[i * batch_size : (i + 1) * batch_size]]
diff --git a/docarray/array/array/array.py b/docarray/array/array/array.py
@@ -177,11 +177,11 @@ def __getitem__(self, item):
             raise TypeError(f'Invalid type {type(head)} for indexing')
 
     @overload
-    def __setitem__(self: T, key: IndexIterType, value: T):
+    def __setitem__(self: T, key: int, value: T_doc):
         ...
 
     @overload
-    def __setitem__(self: T, key: int, value: T_doc):
+    def __setitem__(self: T, key: IndexIterType, value: T):
         ...
 
     def __setitem__(self: T, key: Union[int, IndexIterType], value: Union[T, T_doc]):

diff --git a/docarray/display/tensor_display.py b/docarray/display/tensor_display.py
@@ -30,9 +30,7 @@ def __rich_console__(
             from rich.segment import Segment
             from rich.style import Style
 
-            tensor_normalized = comp_be.minmax_normalize(
-                comp_be.detach(self.tensor), (0, 5)
-            )
+            tensor_normalized = comp_be.minmax_normalize(t_squeezed, (0, 5))
 
             hue = 0.75
             saturation = 1.0

diff --git a/docarray/helper.py b/docarray/helper.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
+from types import LambdaType
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type
 
 if TYPE_CHECKING:
     from docarray import BaseDocument
@@ -138,3 +139,14 @@ def _get_field_type_by_access_path(
                 return None
     else:
         return None
+
+
+def _is_lambda_or_partial_or_local_function(func: Callable[[Any], Any]) -> bool:
+    """
+    Check whether `func` is a lambda, a local (nested) function or a partial.
+    """
+    return (
+        (isinstance(func, LambdaType) and func.__name__ == '<lambda>')  # lambda
+        or not hasattr(func, '__qualname__')  # e.g. `functools.partial` objects
+        or ('<locals>' in func.__qualname__)  # defined inside another function
+    )