
Commit 0c3bdfd

feat: add apply function

Author: anna-charlotte
Signed-off-by: anna-charlotte <[email protected]>
1 parent: 28b96fe

File tree: 3 files changed, +230 −3 lines changed


docarray/display/tensor_display.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -30,9 +30,7 @@ def __rich_console__(
         from rich.segment import Segment
         from rich.style import Style

-        tensor_normalized = comp_be.minmax_normalize(
-            comp_be.detach(self.tensor), (0, 5)
-        )
+        tensor_normalized = comp_be.minmax_normalize(t_squeezed, (0, 5))

         hue = 0.75
         saturation = 1.0
```

docarray/utils/apply.py

Lines changed: 105 additions & 0 deletions
```python
import uuid
from contextlib import nullcontext
from types import LambdaType
from typing import TYPE_CHECKING, Any, Callable, Generator, Optional, TypeVar, Union

from docarray import BaseDocument
from docarray.array.abstract_array import AnyDocumentArray

if TYPE_CHECKING:
    from multiprocessing.pool import Pool


T = TypeVar('T', bound=AnyDocumentArray)


def apply(
    da: T,
    func: Callable[[BaseDocument], BaseDocument],
    num_worker: Optional[int] = None,
    pool: Optional['Pool'] = None,
    show_progress: bool = False,
) -> T:
    """
    Apply `func` to every Document of the given DocumentArray in parallel using
    multiprocessing, and return a new DocumentArray with the results; the input
    DocumentArray is not modified in place.

    :param da: DocumentArray to apply function to
    :param func: a function that takes a :class:`BaseDocument` as input and outputs
        a :class:`BaseDocument`.
    :param num_worker: the number of parallel workers. If not given, the number of
        CPUs in the system will be used.
    :param pool: use an existing/external process or thread pool. If given, you will
        be responsible for closing the pool.
    :param show_progress: show a progress bar. Defaults to False.

    :return: DocumentArray with applied modifications
    """
    da_new = da.__class_getitem__(item=da.document_type)()
    for doc in _map(
        da, func, num_worker=num_worker, pool=pool, show_progress=show_progress
    ):
        da_new.append(doc)
    return da_new


def _map(
    da: T,
    func: Callable[[BaseDocument], BaseDocument],
    num_worker: Optional[int] = None,
    pool: Optional['Pool'] = None,
    show_progress: bool = False,
) -> Generator['BaseDocument', None, None]:
    """
    Return an iterator that applies `func` to every Document in `da` in parallel,
    yielding the results.

    :param da: DocumentArray to apply function to
    :param func: a function that takes a :class:`BaseDocument` as input and outputs
        a :class:`BaseDocument`. You can either modify elements in-place or return
        new Documents.
    :param num_worker: the number of parallel workers. If not given, the number of
        CPUs in the system will be used.
    :param pool: use an existing/external process or thread pool. If given, you will
        be responsible for closing the pool.
    :param show_progress: show a progress bar. Defaults to False.

    :yield: Documents returned from `func`
    """
    from rich.progress import track

    # Lambdas and locally defined functions cannot be pickled, so promote them
    # to module-level functions before handing them to the process pool.
    if _is_lambda_or_partial_or_local_function(func):
        func = _globalize_function(func)

    ctx_p: Union[nullcontext, 'Pool']
    if pool:
        p = pool
        ctx_p = nullcontext()  # an external pool is closed by its owner
    else:
        from multiprocessing.pool import Pool

        p = Pool(processes=num_worker)
        ctx_p = p

    with ctx_p:
        for x in track(p.imap(func, da), total=len(da), disable=not show_progress):
            yield x


def _is_lambda_or_partial_or_local_function(func: Callable[[Any], Any]):
    return (
        (isinstance(func, LambdaType) and func.__name__ == '<lambda>')
        or not hasattr(func, '__qualname__')
        or ('<locals>' in func.__qualname__)
    )


def _globalize_function(func):
    import sys

    def result(*args, **kwargs):
        return func(*args, **kwargs)

    # Register the wrapper under a unique module-level name so that the
    # multiprocessing machinery can pickle it by reference.
    result.__name__ = result.__qualname__ = uuid.uuid4().hex
    setattr(sys.modules[result.__module__], result.__name__, result)
    return result
```
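
For orientation, here is a minimal usage sketch of the new `apply` function. It mirrors the `foo` helper exercised in the tests below; `load_tensor` and the image path are illustrative placeholders, not part of the commit.

```python
from docarray import DocumentArray
from docarray.documents import Image
from docarray.utils.apply import apply


def load_tensor(doc: Image) -> Image:
    # Hypothetical per-Document transformation: load the image into a tensor.
    if doc.url is not None:
        doc.tensor = doc.url.load()
    return doc


if __name__ == '__main__':  # guard needed for multiprocessing on spawn platforms
    da = DocumentArray[Image]([Image(url='path/to/image.png') for _ in range(100)])
    # Runs `load_tensor` over all Documents in a process pool and returns a new
    # DocumentArray; `da` itself is left unmodified.
    da_loaded = apply(da=da, func=load_tensor, num_worker=4, show_progress=True)
```

Because `load_tensor` is a top-level function it is picklable as-is; a lambda or local function would be routed through `_globalize_function` first.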

tests/units/util/test_apply.py

Lines changed: 124 additions & 0 deletions
```python
import time
from multiprocessing import cpu_count
from typing import Optional

import numpy as np
import pytest

from docarray import BaseDocument, DocumentArray
from docarray.documents import Image
from docarray.typing import NdArray
from docarray.utils.apply import apply
from tests.units.typing.test_bytes import IMAGE_PATHS


def foo(d: Image) -> Image:
    if d.url is not None:
        d.tensor = d.url.load()
    return d


@pytest.fixture()
def da():
    da = DocumentArray[Image](
        [Image(url=url) for url in IMAGE_PATHS.values() for _ in range(10)]
    )
    return da


def test_apply(da):
    for tensor in da.tensor:
        assert tensor is None

    da_applied = apply(da=da, func=foo)

    assert len(da) == len(da_applied)
    for tensor in da_applied.tensor:
        assert tensor is not None


def test_apply_with_lambda(da):
    for tensor in da.tensor:
        assert tensor is None

    da_applied = apply(da=da, func=lambda x: x)

    assert len(da) == len(da_applied)
    for tensor in da_applied.tensor:
        assert tensor is None


def test_apply_with_local_function(da):
    def local_func(d: Image) -> Image:
        if d.url is not None:
            d.tensor = d.url.load()
        return d

    for tensor in da.tensor:
        assert tensor is None

    da_applied = apply(da=da, func=local_func)

    assert len(da) == len(da_applied)
    for tensor in da_applied.tensor:
        assert tensor is None


class MyDoc(BaseDocument):
    tensor_a: Optional[NdArray]
    tensor_b: Optional[NdArray]
    tensor_matmul: Optional[NdArray]


@pytest.fixture()
def func():
    def matmul(doc):
        if doc.tensor_a is not None and doc.tensor_b is not None:
            doc.tensor_matmul = np.matmul(doc.tensor_a, doc.tensor_b)
        return doc

    return matmul


def matmul(doc):
    if doc.tensor_a is not None and doc.tensor_b is not None:
        doc.tensor_matmul = np.matmul(doc.tensor_a, doc.tensor_b)
    return doc


def test_benchmark(func):
    time_mproc = []
    time_no_mproc = []

    for n_docs in [1, 2]:
        da = DocumentArray[MyDoc](
            [
                MyDoc(
                    tensor_a=np.random.randn(100, 200),
                    tensor_b=np.random.randn(200, 100),
                )
                for _ in range(n_docs)
            ]
        )

        # with multiprocessing
        start_time = time.time()
        apply(da=da, func=func)
        duration_mproc = time.time() - start_time
        time_mproc.append(duration_mproc)

        # without multiprocessing
        start_time = time.time()
        da_no_mproc = DocumentArray[MyDoc]()
        for doc in da:
            da_no_mproc.append(func(doc))
        duration_no_mproc = time.time() - start_time
        time_no_mproc.append(duration_no_mproc)

    # If more than one CPU is available, check that the runtime grows more
    # slowly with the number of documents when using multiprocessing than
    # without it.
    print(f"cpu_count() = {cpu_count()}")
    if cpu_count() > 1:
        growth_factor_mproc = time_mproc[1] / time_mproc[0]
        growth_factor_no_mproc = time_no_mproc[1] / time_no_mproc[0]
        assert growth_factor_mproc < growth_factor_no_mproc
```
