Commit 4c27409

Author: anna-charlotte

feat: add apply_batch and _map_batch and tests

Signed-off-by: anna-charlotte <[email protected]>
1 parent: 51639db

File tree

3 files changed: +184, -14 lines
  docarray/utils/apply.py
  tests/benchmark_tests/test_apply.py
  tests/units/util/test_apply.py

docarray/utils/apply.py

Lines changed: 144 additions & 10 deletions
@@ -1,4 +1,5 @@
 from contextlib import nullcontext
+from math import ceil
 from multiprocessing.pool import Pool, ThreadPool
 from types import LambdaType
 from typing import Any, Callable, Generator, Optional, TypeVar, Union
@@ -20,14 +21,15 @@ def apply(
 ) -> T:
     """
     Apply `func` to every Document of the given DocumentArray while multiprocessing,
-    return itself after modification, without in-place changes.
+    return itself after modification.
 
     :param da: DocumentArray to apply function to
     :param func: a function that takes ab:class:`BaseDocument` as input and outputs
         a :class:`BaseDocument`.
     :param backend: `thread` for multi-threading and `process` for multi-processing.
-        Defaults to `thread`. In general, if `func` is IO-bound then `thread` is a
-        good choice. If `func` is CPU-bound, then you may use `process`.
+        Defaults to `thread`.
+        In general, if `func` is IO-bound then `thread` is a good choice.
+        On the other hand, if `func` is CPU-bound, then you may use `process`.
         In practice, you should try yourselves to figure out the best value.
         However, if you wish to modify the elements in-place, regardless of IO/CPU-bound,
         you should always use `thread` backend.
@@ -46,10 +48,9 @@ def apply(
 
     :return: DocumentArray with applied modifications
     """
-    da_new = da.__class_getitem__(item=da.document_type)()
     for i, doc in enumerate(_map(da, func, backend, num_worker, pool, show_progress)):
-        da_new.append(doc)
-    return da_new
+        da[i] = doc
+    return da
 
 
 def _map(
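
With this change, `apply` no longer builds a fresh array: it writes each mapped Document back by index and returns the original DocumentArray. A minimal sketch of the resulting contract (the `MyDoc` type and `upper` function below are hypothetical, for illustration only, assuming the v2-style `BaseDocument`/`DocumentArray` API used elsewhere in this commit):

    from docarray import BaseDocument, DocumentArray
    from docarray.utils.apply import apply


    class MyDoc(BaseDocument):  # hypothetical Document type, for illustration only
        text: str = ''


    def upper(doc: MyDoc) -> MyDoc:
        # apply() assigns the returned Document back via da[i] = doc
        doc.text = doc.text.upper()
        return doc


    da = DocumentArray[MyDoc]([MyDoc(text='hello') for _ in range(3)])
    da_out = apply(da=da, func=upper, backend='thread')
    assert da_out is da  # the same array comes back, modified in place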
@@ -65,12 +66,13 @@ def _map(
     yielding the results.
 
     :param da: DocumentArray to apply function to
-    :param func:a function that takes ab:class:`BaseDocument` as input and outputs
+    :param func: a function that takes a :class:`BaseDocument` as input and outputs
         a :class:`BaseDocument`. You can either modify elements in-place or return
-        new Documents.
+        new Documents (depending on `backend`).
     :param backend: `thread` for multi-threading and `process` for multi-processing.
-        Defaults to `thread`. In general, if `func` is IO-bound then `thread` is a
-        good choice. If `func` is CPU-bound, then you may use `process`.
+        Defaults to `thread`.
+        In general, if `func` is IO-bound then `thread` is a good choice.
+        On the other hand, if `func` is CPU-bound, then you may use `process`.
         In practice, you should try yourselves to figure out the best value.
         However, if you wish to modify the elements in-place, regardless of IO/CPU-bound,
         you should always use `thread` backend.
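
For context, `_map` is the generator that `apply` consumes: it lazily yields one processed Document per input, preserving input order (which `test_check_order` below relies on). A rough usage sketch, reusing the hypothetical `upper` and `da` from the sketch above and assuming the keyword defaults implied by `apply`'s call site:

    from docarray.utils.apply import _map

    # each processed Document is yielded as soon as it is ready, in order
    for doc in _map(da, upper, backend='thread'):
        print(doc.text)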
@@ -110,6 +112,138 @@ def _map(
             yield x
 
 
+def apply_batch(
+    da: T,
+    func: Callable[[T], T],
+    batch_size: int,
+    backend: str = 'thread',
+    num_worker: Optional[int] = None,
+    shuffle: bool = False,
+    pool: Optional[Union[Pool, ThreadPool]] = None,
+    show_progress: bool = False,
+) -> T:
+    """Batches itself into mini-batches, applies `func` to every mini-batch, and return itself after the modifications.
+
+    EXAMPLE USAGE
+
+    .. code-block:: python
+
+        from docarray import Document, DocumentArray
+
+        da = DocumentArray([Document(text='The cake is a lie') for _ in range(100)])
+
+
+        def func(doc):
+            da.texts = [t.upper() for t in da.texts]
+            return da
+
+
+        da.apply_batch(func, batch_size=10)
+        print(da.texts[:3])
+
+    .. code-block:: text
+
+        ['THE CAKE IS A LIE', 'THE CAKE IS A LIE', 'THE CAKE IS A LIE']
+
+    :param da: DocumentArray to apply function to
+    :param func: a function that takes a :class:`BaseDocument` as input and outputs
+        a :class:`BaseDocument`.
+    :param batch_size: size of each generated batch (except the last batch, which might
+        be smaller).
+    :param backend: `thread` for multi-threading and `process` for multi-processing.
+        Defaults to `thread`.
+        In general, if `func` is IO-bound then `thread` is a good choice.
+        On the other hand, if `func` is CPU-bound, then you may use `process`.
+        In practice, you should try yourselves to figure out the best value.
+        However, if you wish to modify the elements in-place, regardless of IO/CPU-bound,
+        you should always use `thread` backend.
+
+        .. warning::
+            When using `process` backend, your `func` should not modify elements in-place.
+            This is because the multiprocessing backend passes the variable via pickle
+            and works in another process.
+            The passed object and the original object do **not** share the same memory.
+
+    :param num_worker: the number of parallel workers. If not given, the number of CPUs
+        in the system will be used.
+    :param shuffle: If set, shuffle the Documents before dividing into minibatches.
+    :param pool: use an existing/external process or thread pool. If given, you will
+        be responsible for closing the pool.
+    :param show_progress: show a progress bar. Defaults to False.
+
+    :return DocumentArray after modifications
+    """
+    for i, batch in enumerate(
+        _map_batch(
+            da, func, batch_size, backend, num_worker, shuffle, pool, show_progress
+        )
+    ):
+        indices = [i for i in range(i * batch_size, (i + 1) * batch_size)]
+        da[indices] = batch
+    return da
+
+
+def _map_batch(
+    da: T,
+    func: Callable[[T], T],
+    batch_size: int,
+    backend: str = 'thread',
+    num_worker: Optional[int] = None,
+    shuffle: bool = False,
+    pool: Optional[Union[Pool, ThreadPool]] = None,
+    show_progress: bool = False,
+) -> Generator[T, None, None]:
+    """Return an iterator that applies function to every **minibatch** of iterable in parallel, yielding the results.
+    Each element in the returned iterator is :class:`DocumentArray`.
+
+    .. seealso::
+        - To process single element, please use :meth:`.map`;
+        - To return :class:`DocumentArray`, please use :meth:`.apply_batch`.
+
+    :param batch_size: Size of each generated batch (except the last one, which might be smaller).
+    :param shuffle: If set, shuffle the Documents before dividing into minibatches.
+    :param func: a function that takes :class:`DocumentArray` as input and outputs anything. You can either modify elements
+        in-place (only with `thread` backend) or work later on return elements.
+    :param backend: if to use multi-`process` or multi-`thread` as the parallelization backend. In general, if your
+        ``func`` is IO-bound then perhaps `thread` is good enough. If your ``func`` is CPU-bound then you may use `process`.
+        In practice, you should try yourselves to figure out the best value. However, if you wish to modify the elements
+        in-place, regardless of IO/CPU-bound, you should always use `thread` backend.
+
+        .. warning::
+            When using `process` backend, you should not expect ``func`` modify elements in-place. This is because
+            the multiprocessing backing pass the variable via pickle and work in another process. The passed object
+            and the original object do **not** share the same memory.
+
+    :param num_worker: the number of parallel workers. If not given, then the number of CPUs in the system will be used.
+    :param show_progress: show a progress bar
+    :param pool: use an existing/external pool. If given, `backend` is ignored and you will be responsible for closing the pool.
+
+    :yield: anything return from ``func``
+    """
+
+    if backend == 'process' and _is_lambda_or_partial_or_local_function(func):
+        raise ValueError(
+            f'Multiprocessing does not allow functions that are local, lambda or partial: {func}'
+        )
+
+    from rich.progress import track
+
+    ctx_p: Union[nullcontext, Union[Pool, ThreadPool]]
+    if pool:
+        p = pool
+        ctx_p = nullcontext()
+    else:
+        p = _get_pool(backend, num_worker)
+        ctx_p = p
+
+    with ctx_p:
+        imap = p.imap(func, da.batch(batch_size=batch_size, shuffle=shuffle))
+        for x in track(
+            imap, total=ceil(len(da) / batch_size), disable=not show_progress
+        ):
+            yield x
+
+
 def _get_pool(backend, num_worker) -> Union[Pool, ThreadPool]:
     if backend == 'thread':
         return ThreadPool(processes=num_worker)
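
Taken together: `apply_batch` splits the array into mini-batches via `_map_batch` and assigns each processed batch back by index. A minimal end-to-end sketch in which `func` receives and returns a whole `DocumentArray` batch (`MyDoc` and `upper_batch` are hypothetical; `batch_size` divides the length evenly here, matching the index arithmetic above):

    from docarray import BaseDocument, DocumentArray
    from docarray.utils.apply import apply_batch


    class MyDoc(BaseDocument):  # hypothetical Document type, for illustration only
        text: str = ''


    def upper_batch(batch: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
        # func operates on a mini-batch (a DocumentArray), not a single Document
        for doc in batch:
            doc.text = doc.text.upper()
        return batch


    da = DocumentArray[MyDoc]([MyDoc(text='the cake is a lie') for _ in range(100)])
    apply_batch(da=da, func=upper_batch, batch_size=10)
    print(da[0].text)  # THE CAKE IS A LIE

Note that `upper_batch` is a top-level function, which keeps it usable with the `process` backend, where `_map_batch` rejects local, lambda, and partial functions.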

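The `pool` parameter lets callers reuse an external pool; per the docstring, `backend` is then ignored and the caller is responsible for closing it. A short sketch of that path, reusing the hypothetical `da` and `upper_batch` from above:

    from multiprocessing.pool import ThreadPool

    pool = ThreadPool(processes=4)
    try:
        apply_batch(da=da, func=upper_batch, batch_size=10, pool=pool)
    finally:
        pool.close()  # the caller, not apply_batch, owns the pool's lifecycle
        pool.join()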
tests/benchmark_tests/test_apply.py

Lines changed: 4 additions & 0 deletions

@@ -34,7 +34,9 @@ def time_multiprocessing(num_workers: int) -> float:
         return time() - start_time
 
     time_1_cpu = time_multiprocessing(num_workers=1)
+    print(f"time_1_cpu = {time_1_cpu}")
     time_2_cpu = time_multiprocessing(num_workers=2)
+    print(f"time_2_cpu = {time_2_cpu}")
 
     assert time_2_cpu < time_1_cpu
 
@@ -57,6 +59,8 @@ def time_multithreading(num_workers: int) -> float:
         return time() - start_time
 
     time_1_thread = time_multithreading(num_workers=1)
+    print(f"time_1_thread = {time_1_thread}")
     time_2_thread = time_multithreading(num_workers=2)
+    print(f"time_2_thread = {time_2_thread}")
 
     assert time_2_thread < time_1_thread
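
The added prints surface the measured timings in the test output. The timing pattern being instrumented is roughly the one below; this is a self-contained stand-in using a plain ThreadPool and a sleep-based workload, not the real test's docarray workload, which this diff does not show:

    from multiprocessing.pool import ThreadPool
    from time import sleep, time


    def time_pool(num_workers: int) -> float:
        # time a fixed IO-bound workload under a given worker count
        start_time = time()
        with ThreadPool(processes=num_workers) as p:
            p.map(lambda _: sleep(0.2), range(4))
        return time() - start_time


    time_1 = time_pool(num_workers=1)
    print(f"time_1 = {time_1}")
    time_2 = time_pool(num_workers=2)
    print(f"time_2 = {time_2}")
    assert time_2 < time_1  # more workers finish the IO-bound batch faster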

tests/units/util/test_apply.py

Lines changed: 36 additions & 4 deletions

@@ -1,12 +1,14 @@
+from typing import Generator
+
 import pytest
 
 from docarray import DocumentArray
 from docarray.documents import Image
-from docarray.utils.apply import apply
+from docarray.utils.apply import _map_batch, apply, apply_batch
 from tests.units.typing.test_bytes import IMAGE_PATHS
 
 
-def foo(d: Image) -> Image:
+def load_from_doc(d: Image) -> Image:
     if d.url is not None:
         d.tensor = d.url.load()
     return d
@@ -25,7 +27,7 @@ def test_apply(da, backend):
     for tensor in da.tensor:
         assert tensor is None
 
-    da_applied = apply(da=da, func=foo, backend=backend)
+    da_applied = apply(da=da, func=load_from_doc, backend=backend)
 
     assert len(da) == len(da_applied)
     for tensor in da_applied.tensor:
@@ -49,8 +51,38 @@ def local_func(x):
 def test_check_order(backend):
     da = DocumentArray[Image]([Image(id=i) for i in range(2)])
 
-    da_applied = apply(da=da, func=foo, backend=backend)
+    da_applied = apply(da=da, func=load_from_doc, backend=backend)
 
     assert len(da) == len(da_applied)
     for id_1, id_2 in zip(da, da_applied):
         assert id_1 == id_2
+
+
+def load_from_da(da: DocumentArray[Image]) -> DocumentArray[Image]:
+    da_new = da.__class_getitem__(da.document_type)([Image() for _ in da])
+    return da_new
+
+
+@pytest.mark.parametrize('n_docs,batch_size', [(10, 5), (10, 7)])
+@pytest.mark.parametrize('backend', ['thread', 'process'])
+def test_apply_batch_multithreading(n_docs, batch_size, backend):
+
+    da = DocumentArray[Image]([Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)])
+    da_applied = apply_batch(
+        da=da, func=load_from_da, batch_size=batch_size, backend=backend
+    )
+
+    for doc in da_applied:
+        assert isinstance(doc, Image)
+
+
+@pytest.mark.parametrize('n_docs,batch_size', [(10, 5), (10, 7)])
+@pytest.mark.parametrize('backend', ['thread', 'process'])
+def test_map_batch(n_docs, batch_size, backend):
+
+    da = DocumentArray[Image]([Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)])
+    it = _map_batch(da=da, func=load_from_da, batch_size=batch_size, backend=backend)
+    assert isinstance(it, Generator)
+
+    for batch in it:
+        assert isinstance(batch, DocumentArray[Image])
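
The (10, 7) parametrization exercises the uneven final batch: ceil(10 / 7) = 2 batches, one of 7 Documents and one of 3, which is also the total fed to the progress bar in `_map_batch`. A quick self-contained check of that arithmetic (plain Python, independent of docarray):

    from math import ceil

    n_docs, batch_size = 10, 7
    n_batches = ceil(n_docs / batch_size)  # 2
    sizes = [min(batch_size, n_docs - b * batch_size) for b in range(n_batches)]
    assert sizes == [7, 3]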
