Skip to content

Commit 14e3cc1

Browse files
author
anna-charlotte
committed
fix: apply
Signed-off-by: anna-charlotte <[email protected]>
1 parent 1f064f3 commit 14e3cc1

File tree

4 files changed

+56
-41
lines changed

4 files changed

+56
-41
lines changed

docarray/utils/apply.py

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from types import LambdaType
55
from typing import Any, Callable, Generator, Optional, TypeVar, Union
66

7+
from rich.progress import track
8+
79
from docarray import BaseDocument
810
from docarray.array.abstract_array import AnyDocumentArray
911

@@ -18,10 +20,10 @@ def apply(
1820
num_worker: Optional[int] = None,
1921
pool: Optional[Union[Pool, ThreadPool]] = None,
2022
show_progress: bool = False,
21-
) -> T:
23+
) -> None:
2224
"""
23-
Apply `func` to every Document of the given DocumentArray while multithreading or
24-
multiprocessing, return itself after modification.
25+
Apply `func` to every Document of the given DocumentArray in-place, using
26+
multithreading or multiprocessing.
2527
2628
EXAMPLE USAGE
2729
@@ -38,7 +40,7 @@ def load_url_to_tensor(img: Image) -> Image:
3840
3941
4042
da = DocumentArray[Image]([Image(url='path/to/img.png') for _ in range(100)])
41-
da = apply(
43+
apply(
4244
da, load_url_to_tensor, backend='thread'
4345
) # threading is usually a good option for IO-bound tasks such as loading an image from url
4446
@@ -68,11 +70,9 @@ def load_url_to_tensor(img: Image) -> Image:
6870
be responsible for closing the pool.
6971
:param show_progress: show a progress bar. Defaults to False.
7072
71-
:return: DocumentArray with applied modifications
7273
"""
7374
for i, doc in enumerate(_map(da, func, backend, num_worker, pool, show_progress)):
7475
da[i] = doc
75-
return da
7676

7777

7878
def _map(
@@ -115,7 +115,6 @@ def _map(
115115
116116
:yield: Documents returned from `func`
117117
"""
118-
from rich.progress import track
119118

120119
if backend == 'process' and _is_lambda_or_partial_or_local_function(func):
121120
raise ValueError(
@@ -145,10 +144,9 @@ def apply_batch(
145144
shuffle: bool = False,
146145
pool: Optional[Union[Pool, ThreadPool]] = None,
147146
show_progress: bool = False,
148-
) -> T:
147+
) -> None:
149148
"""
150-
Batches itself into mini-batches, applies `func` to every mini-batch, and return
151-
itself after the modifications.
149+
Batch `da` into mini-batches and apply `func` to every mini-batch in-place.
152150
153151
EXAMPLE USAGE
154152
@@ -168,7 +166,7 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
168166
169167
170168
da = DocumentArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)])
171-
da = apply_batch(da, upper_case_name, batch_size=10)
169+
apply_batch(da, upper_case_name, batch_size=10)
172170
print(da.name[:3])
173171
174172
.. code-block:: text
@@ -200,8 +198,6 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
200198
:param pool: use an existing/external process or thread pool. If given, you will
201199
be responsible for closing the pool.
202200
:param show_progress: show a progress bar. Defaults to False.
203-
204-
:return DocumentArray after modifications
205201
"""
206202
for i, batch in enumerate(
207203
_map_batch(
@@ -210,7 +206,6 @@ def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
210206
):
211207
indices = [i for i in range(i * batch_size, (i + 1) * batch_size)]
212208
da[indices] = batch
213-
return da
214209

215210

216211
def _map_batch(
@@ -256,15 +251,13 @@ def _map_batch(
256251
:param pool: use an existing/external pool. If given, `backend` is ignored and you will
257252
be responsible for closing the pool.
258253
259-
:yield: anything return from ``func``
254+
:yield: DocumentArrays returned from `func`
260255
"""
261256
if backend == 'process' and _is_lambda_or_partial_or_local_function(func):
262257
raise ValueError(
263258
f'Multiprocessing does not allow functions that are local, lambda or partial: {func}'
264259
)
265260

266-
from rich.progress import track
267-
268261
ctx_p: Union[nullcontext, Union[Pool, ThreadPool]]
269262
if pool:
270263
p = pool
@@ -274,7 +267,7 @@ def _map_batch(
274267
ctx_p = p
275268

276269
with ctx_p:
277-
imap = p.imap(func, da.batch(batch_size=batch_size, shuffle=shuffle))
270+
imap = p.imap(func, da._batch(batch_size=batch_size, shuffle=shuffle))
278271
for x in track(
279272
imap, total=ceil(len(da) / batch_size), disable=not show_progress
280273
):

tests/benchmark_tests/test_apply.py

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from docarray import BaseDocument, DocumentArray
77
from docarray.documents import Image
88
from docarray.typing import NdArray
9-
from docarray.utils.apply import apply
9+
from docarray.utils.apply import apply, apply_batch
1010
from tests.units.typing.test_bytes import IMAGE_PATHS
1111

1212
pytestmark = pytest.mark.benchmark
@@ -34,17 +34,14 @@ def time_multiprocessing(num_workers: int) -> float:
3434
return time() - start_time
3535

3636
time_1_cpu = time_multiprocessing(num_workers=1)
37-
print(f"time_1_cpu = {time_1_cpu}")
3837
time_2_cpu = time_multiprocessing(num_workers=2)
39-
print(f"time_2_cpu = {time_2_cpu}")
4038

4139
assert time_2_cpu < time_1_cpu
4240

4341

4442
def io_intensive(img: Image) -> Image:
4543
# some io intensive function: load and set image url
46-
t = img.url.load()
47-
img.tensor = t
44+
img.tensor = img.url.load()
4845
return img
4946

5047

@@ -59,8 +56,35 @@ def time_multithreading(num_workers: int) -> float:
5956
return time() - start_time
6057

6158
time_1_thread = time_multithreading(num_workers=1)
62-
print(f"time_1_thread = {time_1_thread}")
6359
time_2_thread = time_multithreading(num_workers=2)
64-
print(f"time_2_thread = {time_2_thread}")
60+
61+
assert time_2_thread < time_1_thread
62+
63+
64+
def io_intensive_batch(da: DocumentArray[Image]) -> DocumentArray[Image]:
65+
# some io intensive function: load and set image url
66+
for doc in da:
67+
doc.tensor = doc.url.load()
68+
return da
69+
70+
71+
def test_apply_batch_multithreading_benchmark():
72+
def time_multithreading_batch(num_workers: int) -> float:
73+
n_docs = 100
74+
da = DocumentArray[Image](
75+
[Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)]
76+
)
77+
start_time = time()
78+
apply_batch(
79+
da=da,
80+
func=io_intensive_batch,
81+
backend='thread',
82+
num_worker=num_workers,
83+
batch_size=10,
84+
)
85+
return time() - start_time
86+
87+
time_1_thread = time_multithreading_batch(num_workers=1)
88+
time_2_thread = time_multithreading_batch(num_workers=2)
6589

6690
assert time_2_thread < time_1_thread

tests/units/array/test_batching.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class MyDoc(BaseDocument):
2626
if stack:
2727
da = da.stack()
2828

29-
batches = list(da.batch(batch_size=batch_size, shuffle=shuffle))
29+
batches = list(da._batch(batch_size=batch_size, shuffle=shuffle))
3030
assert len(batches) == n_batches
3131

3232
for i, batch in enumerate(batches):

tests/units/util/test_apply.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from docarray.utils.apply import _map_batch, apply, apply_batch
88
from tests.units.typing.test_bytes import IMAGE_PATHS
99

10+
N_DOCS = 2
11+
1012

1113
def load_from_doc(d: Image) -> Image:
1214
if d.url is not None:
@@ -16,9 +18,7 @@ def load_from_doc(d: Image) -> Image:
1618

1719
@pytest.fixture()
1820
def da():
19-
da = DocumentArray[Image](
20-
[Image(url=url) for url in IMAGE_PATHS.values() for _ in range(2)]
21-
)
21+
da = DocumentArray[Image]([Image(url=IMAGE_PATHS['png']) for _ in range(N_DOCS)])
2222
return da
2323

2424

@@ -27,10 +27,10 @@ def test_apply(da, backend):
2727
for tensor in da.tensor:
2828
assert tensor is None
2929

30-
da_applied = apply(da=da, func=load_from_doc, backend=backend)
30+
apply(da=da, func=load_from_doc, backend=backend)
3131

32-
assert len(da) == len(da_applied)
33-
for tensor in da_applied.tensor:
32+
assert len(da) == N_DOCS
33+
for tensor in da.tensor:
3434
assert tensor is not None
3535

3636

@@ -49,13 +49,13 @@ def local_func(x):
4949

5050
@pytest.mark.parametrize('backend', ['thread', 'process'])
5151
def test_check_order(backend):
52-
da = DocumentArray[Image]([Image(id=i) for i in range(2)])
52+
da = DocumentArray[Image]([Image(id=i) for i in range(N_DOCS)])
5353

54-
da_applied = apply(da=da, func=load_from_doc, backend=backend)
54+
apply(da=da, func=load_from_doc, backend=backend)
5555

56-
assert len(da) == len(da_applied)
57-
for id_1, id_2 in zip(da, da_applied):
58-
assert id_1 == id_2
56+
assert len(da) == N_DOCS
57+
for i, id_1 in enumerate(da.id):
58+
assert id_1 == str(i)
5959

6060

6161
def load_from_da(da: DocumentArray[Image]) -> DocumentArray[Image]:
@@ -69,11 +69,9 @@ def load_from_da(da: DocumentArray[Image]) -> DocumentArray[Image]:
6969
def test_apply_batch_multithreading(n_docs, batch_size, backend):
7070

7171
da = DocumentArray[Image]([Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)])
72-
da_applied = apply_batch(
73-
da=da, func=load_from_da, batch_size=batch_size, backend=backend
74-
)
72+
apply_batch(da=da, func=load_from_da, batch_size=batch_size, backend=backend)
7573

76-
for doc in da_applied:
74+
for doc in da:
7775
assert isinstance(doc, Image)
7876

7977

0 commit comments

Comments
 (0)