Merged
Changes from 1 commit
51 commits
3d45f57 feat: add apply function (Feb 28, 2023)
9a07808 test: add benchmark tests (Feb 28, 2023)
5709f97 fix: apply (Feb 28, 2023)
366b6df fix: benchmark test (Mar 1, 2023)
6e8ff7c test: benchmark (Mar 2, 2023)
89eaf62 fix: apply (Mar 2, 2023)
545d9ec fix: clean up (Mar 2, 2023)
d871363 chore: remove benchmark tests from general tests (Mar 2, 2023)
8a436ae chore: fix ci (Mar 2, 2023)
c581b58 feat: add threading option and benchmark test (Mar 2, 2023)
8bf57fb test: use both backend options in tests (Mar 2, 2023)
0c3524c feat: add batching to abstract array (Mar 2, 2023)
c335895 feat: add apply_batch and _map_batch and tests (Mar 2, 2023)
cc23e4e test: fix load from da (Mar 2, 2023)
73c0d84 docs: update docstrings (Mar 2, 2023)
b7c2cae docs: add example for apply (Mar 2, 2023)
3eb0c30 fix: mypy (Mar 2, 2023)
7c6cb2f refactor: clean up (Mar 2, 2023)
afa5837 refactor: make batch method private (Mar 2, 2023)
c69585a fix: apply (Mar 2, 2023)
8a3437a Test: add for apply batch (Mar 2, 2023)
66b78b3 fix: benchmark test increase ndocs (Mar 3, 2023)
35e090a test: clean up (Mar 3, 2023)
3019522 test: try to fix (Mar 3, 2023)
313d318 test: try to fix test (Mar 3, 2023)
0afd5bd fix: test (Mar 3, 2023)
fdcfa23 fix: test (Mar 3, 2023)
fc91dbf fix: apply suggestions from code review (Mar 3, 2023)
0d7cd1b fix: remove print statemetns (Mar 3, 2023)
b4c672b fix: apply samis suggestion (Mar 3, 2023)
18a377b fix: add tests for func da to doc and da to other len da (Mar 3, 2023)
245283f fix: revert last commit (Mar 3, 2023)
76fe8b7 test: add len assert (Mar 3, 2023)
34b7f9c test: add assertions (Mar 3, 2023)
c7a968d test: add test to for da extend in batch apply (Mar 3, 2023)
6cf8ed2 test: extend with only one doc (Mar 3, 2023)
5dc9e6d test: fix (Mar 3, 2023)
d3fc203 fix: test (Mar 3, 2023)
45cdc4a fix: test (Mar 3, 2023)
9839602 fix: set docs in apply (Mar 3, 2023)
87a93ff fix: indices (Mar 3, 2023)
eeb7fae fix: indices (Mar 3, 2023)
72aaf21 fix: indices (Mar 3, 2023)
c0f8029 fix: indices (Mar 3, 2023)
9b83c1f fix:test (Mar 3, 2023)
7638d86 fix: mypy (Mar 3, 2023)
4a3a290 fix: type hint (Mar 3, 2023)
38aae7a fix: remove apply, only keep map (Mar 3, 2023)
01900c9 refactor: map to map_docs (Mar 3, 2023)
f6921e0 fix: apply suggestion (Mar 3, 2023)
c3fb041 docs: add example usage (Mar 3, 2023)
fix: remove apply, only keep map
Signed-off-by: anna-charlotte <[email protected]>
anna-charlotte committed Mar 3, 2023
commit 38aae7afb4e3ddf44ab4a90a87dc4d6e3b0f74b2
161 changes: 2 additions & 159 deletions docarray/utils/apply.py → docarray/utils/map.py
@@ -13,71 +13,7 @@
T_doc = TypeVar('T_doc', bound=BaseDocument)


def apply(
da: T,
func: Callable[[T_doc], T_doc],
backend: str = 'thread',
num_worker: Optional[int] = None,
pool: Optional[Union[Pool, ThreadPool]] = None,
show_progress: bool = False,
) -> None:
"""
Apply `func` to every Document of the given DocumentArray in-place, using
multithreading or multiprocessing.

EXAMPLE USAGE

.. code-block:: python

from docarray import DocumentArray
from docarray.documents import Image
from docarray.utils.apply import apply


def load_url_to_tensor(img: Image) -> Image:
img.tensor = img.url.load()
return img


da = DocumentArray[Image]([Image(url='path/to/img.png') for _ in range(100)])
apply(
da, load_url_to_tensor, backend='thread'
) # threading is usually a good option for IO-bound tasks such as loading an image from url

for doc in da:
assert doc.tensor is not None

:param da: DocumentArray to apply function to
:param func: a function that takes a :class:`BaseDocument` as input and outputs
a :class:`BaseDocument`.
:param backend: `thread` for multithreading and `process` for multiprocessing.
Defaults to `thread`.
In general, if `func` is IO-bound then `thread` is a good choice.
On the other hand, if `func` is CPU-bound, then you may use `process`.
In practice, you should experiment to figure out the best option.
However, if you wish to modify the elements in-place, regardless of IO/CPU-bound,
you should always use `thread` backend.
Note that computation that is offloaded to non-python code (e.g. through np/torch/tf)
falls under the "IO-bound" category.

.. warning::
When using `process` backend, your `func` should not modify elements in-place.
This is because the multiprocessing backend passes the variable via pickle
and works in another process.
The passed object and the original object do **not** share the same memory.

:param num_worker: the number of parallel workers. If not given, the number of
CPUs in the system will be used.
:param pool: use an existing/external process or thread pool. If given, you will
be responsible for closing the pool.
:param show_progress: show a progress bar. Defaults to False.

"""
for i, doc in enumerate(_map(da, func, backend, num_worker, pool, show_progress)):
da[i] = doc


def _map(
def map(
da: T,
func: Callable[[T_doc], T_doc],
backend: str = 'thread',
@@ -89,9 +89,6 @@ def _map(
Return an iterator that applies `func` to every Document in `da` in parallel,
yielding the results.

.. seealso::
- To return :class:`DocumentArray`, please use :func:`apply`.

:param da: DocumentArray to apply function to
:param func: a function that takes a :class:`BaseDocument` as input and outputs
a :class:`BaseDocument`.
@@ -139,94 +72,7 @@ def _map
yield x


def apply_batch(
da: T,
func: Union[Callable[[T], T], Callable[[T], T_doc]],
batch_size: int,
backend: str = 'thread',
num_worker: Optional[int] = None,
shuffle: bool = False,
pool: Optional[Union[Pool, ThreadPool]] = None,
show_progress: bool = False,
) -> None:
"""
Batches the DocumentArray into mini-batches and applies `func` to every mini-batch in-place.

EXAMPLE USAGE

.. code-block:: python

from docarray import BaseDocument, DocumentArray
from docarray.utils.apply import apply_batch


class MyDoc(BaseDocument):
name: str


def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
da.name = [n.upper() for n in da.name]
return da


da = DocumentArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)])
apply_batch(da, upper_case_name, batch_size=10)
print(da.name[:3])

.. code-block:: text

['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT']

:param da: DocumentArray to apply function to
:param func: a function that takes an :class:`AnyDocumentArray` as input and outputs
an :class:`AnyDocumentArray` or a :class:`BaseDocument`.
:param batch_size: size of each generated batch (except the last batch, which might
be smaller).
:param backend: `thread` for multithreading and `process` for multiprocessing.
Defaults to `thread`.
In general, if `func` is IO-bound then `thread` is a good choice.
On the other hand, if `func` is CPU-bound, then you may use `process`.
In practice, you should experiment to figure out the best option.
However, if you wish to modify the elements in-place, regardless of IO/CPU-bound,
you should always use `thread` backend.
Note that computation that is offloaded to non-python code (e.g. through np/torch/tf)
falls under the "IO-bound" category.

.. warning::
When using `process` backend, your `func` should not modify elements in-place.
This is because the multiprocessing backend passes the variable via pickle
and works in another process.
The passed object and the original object do **not** share the same memory.

:param num_worker: the number of parallel workers. If not given, the number of CPUs
in the system will be used.
:param shuffle: If set, shuffle the Documents before dividing into minibatches.
:param pool: use an existing/external process or thread pool. If given, you will
be responsible for closing the pool.
:param show_progress: show a progress bar. Defaults to False.
"""
diff = 0
for i, batch in enumerate(
_map_batch(
da, func, batch_size, backend, num_worker, shuffle, pool, show_progress
)
):
if i == 0:
if isinstance(batch, AnyDocumentArray):
diff = len(batch) - batch_size
else:
diff = 1 - batch_size

start = i * (batch_size + diff)
stop = (i + 1) * batch_size + (i * diff)

if isinstance(batch, da.__class__):
da[start:stop] = batch
else:
da[start:stop] = da.__class_getitem__(da.document_type)([batch])


def _map_batch(
def map_batch(
da: T,
func: Callable[[T], Union[T, T_doc]],
batch_size: int,
@@ -241,9 +87,6 @@ def _map_batch(
yielding the results.
Each element in the returned iterator is an :class:`AnyDocumentArray`.

.. seealso::
- To return :class:`DocumentArray`, please use :func:`apply_batch`.

:param batch_size: Size of each generated batch (except the last one, which might
be smaller).
:param shuffle: If set, shuffle the Documents before dividing into mini-batches.
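For reference, a minimal usage sketch of the renamed `map`. The import path `docarray.utils.map` matches the test changes below; `load_url_to_tensor` is carried over from the removed `apply` example, and the write-back loop mirrors what `apply` used to do internally:

.. code-block:: python

    from docarray import DocumentArray
    from docarray.documents import Image
    from docarray.utils.map import map  # shadows the builtin map in this scope


    def load_url_to_tensor(img: Image) -> Image:
        img.tensor = img.url.load()
        return img


    da = DocumentArray[Image]([Image(url='path/to/img.png') for _ in range(100)])

    # map yields processed Documents lazily; write them back to keep the
    # in-place behavior that apply used to provide.
    for i, doc in enumerate(map(da, load_url_to_tensor, backend='thread')):
        da[i] = doc

    for doc in da:
        assert doc.tensor is not None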
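Likewise for `map_batch`, reusing the `MyDoc`/`upper_case_name` setup from the removed `apply_batch` example. Since `map_batch` yields mini-batches instead of writing back in-place, the caller collects the results; this sketch assumes the batches are yielded in order:

.. code-block:: python

    from docarray import BaseDocument, DocumentArray
    from docarray.utils.map import map_batch


    class MyDoc(BaseDocument):
        name: str


    def upper_case_name(da: DocumentArray[MyDoc]) -> DocumentArray[MyDoc]:
        da.name = [n.upper() for n in da.name]
        return da


    da = DocumentArray[MyDoc]([MyDoc(name='my orange cat') for _ in range(100)])

    # Collect the yielded mini-batches back into a single DocumentArray.
    docs = []
    for batch in map_batch(da, upper_case_name, batch_size=10):
        docs.extend(batch)
    da = DocumentArray[MyDoc](docs)

    print(da.name[:3])  # ['MY ORANGE CAT', 'MY ORANGE CAT', 'MY ORANGE CAT']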
@@ -7,7 +7,7 @@
from docarray import BaseDocument, DocumentArray
from docarray.documents import Image
from docarray.typing import NdArray
from docarray.utils.apply import apply, apply_batch
from docarray.utils.map import map, map_batch
from tests.units.typing.test_bytes import IMAGE_PATHS

pytestmark = [pytest.mark.benchmark, pytest.mark.slow]
@@ -25,7 +25,7 @@ def cpu_intensive(doc: MyMatrix) -> MyMatrix:
return doc


def test_apply_multiprocessing():
def test_map_multiprocessing():
if os.cpu_count() > 1:

def time_multiprocessing(num_workers: int) -> float:
@@ -34,11 +34,17 @@ def time_multiprocessing(num_workers: int) -> float:
matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)]
da = DocumentArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices])
start_time = time()
apply(da=da, func=cpu_intensive, backend='process', num_worker=num_workers)
list(
map(
da=da, func=cpu_intensive, backend='process', num_worker=num_workers
)
)
return time() - start_time

time_1_cpu = time_multiprocessing(num_workers=1)
print(f"time_1_cpu = {time_1_cpu}")
time_2_cpu = time_multiprocessing(num_workers=2)
print(f"time_2_cpu = {time_2_cpu}")

assert time_2_cpu < time_1_cpu

@@ -52,7 +58,7 @@ def cpu_intensive_batch(da: DocumentArray[MyMatrix]) -> DocumentArray[MyMatrix]:
return da


def test_apply_batch_multiprocessing():
def test_map_batch_multiprocessing():
if os.cpu_count() > 1:

def time_multiprocessing(num_workers: int) -> float:
@@ -61,17 +67,21 @@ def time_multiprocessing(num_workers: int) -> float:
matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)]
da = DocumentArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices])
start_time = time()
apply_batch(
da=da,
func=cpu_intensive_batch,
batch_size=8,
backend='process',
num_worker=num_workers,
list(
map_batch(
da=da,
func=cpu_intensive_batch,
batch_size=8,
backend='process',
num_worker=num_workers,
)
)
return time() - start_time

time_1_cpu = time_multiprocessing(num_workers=1)
print(f"time_1_cpu = {time_1_cpu}")
time_2_cpu = time_multiprocessing(num_workers=2)
print(f"time_2_cpu = {time_2_cpu}")

assert time_2_cpu < time_1_cpu

@@ -82,18 +92,20 @@ def io_intensive(img: Image) -> Image:
return img


def test_apply_multithreading():
def test_map_multithreading():
def time_multithreading(num_workers: int) -> float:
n_docs = 100
da = DocumentArray[Image](
[Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)]
)
start_time = time()
apply(da=da, func=io_intensive, backend='thread', num_worker=num_workers)
list(map(da=da, func=io_intensive, backend='thread', num_worker=num_workers))
return time() - start_time

time_1_thread = time_multithreading(num_workers=1)
print(f"time_1_thread = {time_1_thread}")
time_2_thread = time_multithreading(num_workers=2)
print(f"time_2_thread = {time_2_thread}")

assert time_2_thread < time_1_thread

@@ -105,23 +117,27 @@ def io_intensive_batch(da: DocumentArray[Image]) -> DocumentArray[Image]:
return da


def test_apply_batch_multithreading():
def test_map_batch_multithreading():
def time_multithreading_batch(num_workers: int) -> float:
n_docs = 100
da = DocumentArray[Image](
[Image(url=IMAGE_PATHS['png']) for _ in range(n_docs)]
)
start_time = time()
apply_batch(
da=da,
func=io_intensive_batch,
backend='thread',
num_worker=num_workers,
batch_size=10,
list(
map_batch(
da=da,
func=io_intensive_batch,
backend='thread',
num_worker=num_workers,
batch_size=10,
)
)
return time() - start_time

time_1_thread = time_multithreading_batch(num_workers=1)
print(f"time_1_thread = {time_1_thread}")
time_2_thread = time_multithreading_batch(num_workers=2)
print(f"time_2_thread = {time_2_thread}")

assert time_2_thread < time_1_thread
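
One note on the `list(...)` wrappers introduced throughout these benchmarks: unlike the removed `apply`/`apply_batch`, `map` and `map_batch` are generators, so no work happens until the iterator is consumed. Wrapping the call in `list(...)` forces the parallel execution to run inside the timed region. A minimal sketch, reusing `da` and `io_intensive` from the tests above:

.. code-block:: python

    start_time = time()
    lazy = map(da=da, func=io_intensive, backend='thread', num_worker=2)
    # No work has happened yet; consuming the iterator triggers it.
    docs = list(lazy)
    elapsed = time() - start_time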