feat: pass validation for urls with no ext

Signed-off-by: jupyterjazz <[email protected]>
docarray · JoanFM · Jun 27, 2023 · Jun 26, 2023 · Jun 26, 2023 · Jun 26, 2023
commit 9464fb78536b94b5d1164575f45460750d00013f
diff --git a/docarray/documents/text.py b/docarray/documents/text.py
@@ -93,8 +93,8 @@ class MultiModalDoc(BaseDoc):
     ```python
     from docarray.documents import TextDoc
 
-    doc = TextDoc(text='This is the main text', url='exampleurl.com/file.txt')
-    doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file.txt')
+    doc = TextDoc(text='This is the main text', url='exampleurl.com')
+    doc2 = TextDoc(text='This is the main text', url='exampleurl.com')
 
     doc == 'This is the main text'  # True
     doc == doc2  # True

diff --git a/docarray/typing/url/any_url.py b/docarray/typing/url/any_url.py
@@ -52,27 +52,46 @@ def _to_node_protobuf(self) -> 'NodeProto':
 
         return NodeProto(text=str(self), type=self._proto_type_name)
 
+    @staticmethod
+    def _get_url_extension(url):
+        """
+        Return the file extension found in the path component of a URL.
+
+        The query string and fragment are not part of the path and are ignored.
+
+        :param url: The URL (or plain file name) to inspect.
+        :return: The extension without the leading period, or None if absent.
+        """
+        # Only the path can carry an extension; urlparse splits off query/fragment.
+        url_path = urllib.parse.urlparse(url).path
+        _, suffix = os.path.splitext(url_path)
+        # splitext returns '' or a suffix that begins with '.', so drop one character.
+        bare = suffix[1:]
+        return bare or None
+
     @classmethod
     def is_extension_allowed(cls, value: Any) -> bool:
         """
         Check if the file extension of the URL is allowed for this class.
         First, it guesses the mime type of the file. If it fails to detect the
         mime type, it then checks the extra file extensions.
+        Note: This method assumes that any URL without an extension is valid.
 
         :param value: The URL or file path.
         :return: True if the extension is allowed, False otherwise
         """
         if cls is AnyUrl:
             return True
 
-        url_parts = value.split("?")
+        url_parts = value.split('?')
+        extension = cls._get_url_extension(value)
+        if not extension:
+            return True
+
         mimetype, _ = mimetypes.guess_type(url_parts[0])
         if mimetype and mimetype.startswith(cls.mime_type()):
             return True
 
-        filename = url_parts[0].split('.')
-        extension = filename[-1] if len(filename) > 1 else None
-
         return extension in cls.extra_extensions()
 
     @classmethod

diff --git a/docarray/typing/url/text_url.py b/docarray/typing/url/text_url.py
@@ -43,7 +43,7 @@ class MyDoc(BaseDoc):
 
 
         doc = MyDoc(
-            remote_url='https://www.gutenberg.org/files/1065/1065-0.txt',
+            remote_url='https://de.wikipedia.org/wiki/Brixen',
         )
 
         remote_txt = doc.remote_url.load()

diff --git a/tests/index/weaviate/test_index_get_del_weaviate.py b/tests/index/weaviate/test_index_get_del_weaviate.py
@@ -403,7 +403,7 @@ class MyMultiModalDoc(BaseDoc):
 
 
 def test_index_document_with_bytes(weaviate_client):
-    doc = ImageDoc(id="1", url="www.foo.com/test.png", bytes_=b"foo")
+    doc = ImageDoc(id="1", url="www.foo.com", bytes_=b"foo")
 
     index = WeaviateDocumentIndex[ImageDoc]()
     index.index([doc])

diff --git a/tests/integrations/predefined_document/test_audio.py b/tests/integrations/predefined_document/test_audio.py
@@ -29,6 +29,7 @@
     str(TOYDATA_DIR / 'hello.ogg'),
     str(TOYDATA_DIR / 'hello.wma'),
     str(TOYDATA_DIR / 'hello.aac'),
+    str(TOYDATA_DIR / 'hello'),
 ]
 
 LOCAL_AUDIO_FILES_AND_FORMAT = [
@@ -39,6 +40,7 @@
     (str(TOYDATA_DIR / 'hello.ogg'), 'ogg'),
     (str(TOYDATA_DIR / 'hello.wma'), 'asf'),
     (str(TOYDATA_DIR / 'hello.aac'), 'adts'),
+    (str(TOYDATA_DIR / 'hello'), 'wav'),
 ]
 
 NON_AUDIO_FILES = [

diff --git a/tests/integrations/typing/test_typing_proto.py b/tests/integrations/typing/test_typing_proto.py
@@ -34,7 +34,7 @@ class Mymmdoc(BaseDoc):
         embedding=np.zeros((100, 1)),
         any_url='http://jina.ai',
         image_url='http://jina.ai/bla.jpg',
-        text_url='http://jina.ai/file.txt',
+        text_url='http://jina.ai',
         mesh_url='http://jina.ai/mesh.obj',
         point_cloud_url='http://jina.ai/mesh.obj',
     )

diff --git a/tests/units/document/test_docs_operators.py b/tests/units/document/test_docs_operators.py
@@ -2,15 +2,15 @@
 
 
 def test_text_document_operators():
-    doc = TextDoc(text='text', url='http://url.com/file.txt')
+    doc = TextDoc(text='text', url='http://url.com')
 
     assert doc == 'text'
     assert doc != 'http://url.com'
 
-    doc2 = TextDoc(id=doc.id, text='text', url='http://url.com/file.txt')
+    doc2 = TextDoc(id=doc.id, text='text', url='http://url.com')
     assert doc == doc2
 
-    doc3 = TextDoc(id='other-id', text='text', url='http://url.com/file.txt')
+    doc3 = TextDoc(id='other-id', text='text', url='http://url.com')
     assert doc == doc3
 
     assert 't' in doc

diff --git a/tests/units/test_helper.py b/tests/units/test_helper.py
@@ -152,7 +152,7 @@ class MyDoc(BaseDoc):
         docs=DocList[VideoDoc](
             [
                 VideoDoc(
-                    url=f'http://example.ai/videos/{i}.mp4',
+                    url=f'http://example.ai/videos/{i}',
                     tensor_video=rand(256),
                 )
                 for i in range(10)

diff --git a/tests/units/typing/url/test_any_url.py b/tests/units/typing/url/test_any_url.py
@@ -40,3 +40,20 @@ def test_operators():
     assert url != 'aljdñjd'
     assert 'data' in url
     assert 'docarray' not in url
+
+
+def test_get_url_extension():
+    """Check extension extraction for URLs, bare file names, and empty input."""
+    # Inputs that carry an extension (a trailing query string must be ignored)
+    assert AnyUrl._get_url_extension('https://jina.ai/hey.md?model=gpt-4') == 'md'
+    assert AnyUrl._get_url_extension('https://jina.ai/text.txt') == 'txt'
+    assert AnyUrl._get_url_extension('bla.jpg') == 'jpg'
+
+    # URLs without an extension
+    assert AnyUrl._get_url_extension('https://jina.ai') is None
+    assert AnyUrl._get_url_extension('https://jina.ai/?model=gpt-4') is None
+
+    # Plain text without an extension
+    assert AnyUrl._get_url_extension('some_text') is None
+    # Empty input
+    assert AnyUrl._get_url_extension('') is None
diff --git a/tests/units/typing/url/test_text_url.py b/tests/units/typing/url/test_text_url.py
@@ -15,7 +15,7 @@
 )
 from tests import TOYDATA_DIR
 
-REMOTE_TEXT_FILE = 'https://www.gutenberg.org/files/1065/1065-0.txt'
+REMOTE_TEXT_FILE = 'https://de.wikipedia.org/wiki/Brixen'
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
 LOCAL_TEXT_FILES = [
     str(TOYDATA_DIR / 'penal_colony.txt'),
@@ -39,13 +39,13 @@
 @pytest.mark.internet
 @pytest.mark.parametrize(
     'url,expected_beginning',
-    [(REMOTE_TEXT_FILE, 'The Project Gutenberg'), *LOCAL_TEXT_FILES_AND_BEGINNING],
+    [(REMOTE_TEXT_FILE, '<!DOCTYPE html>'), *LOCAL_TEXT_FILES_AND_BEGINNING],
 )
 def test_load(url, expected_beginning):
     uri = parse_obj_as(TextUrl, url)
 
     txt = uri.load()
-    assert expected_beginning in txt
+    assert txt.startswith(expected_beginning)
 
 
 @pytest.mark.slow
@@ -61,7 +61,7 @@ def test_load_to_bytes(url):
 @pytest.mark.proto
 @pytest.mark.slow
 @pytest.mark.internet
-@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE])
+@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE, *LOCAL_TEXT_FILES])
 def test_proto_text_url(url):
     uri = parse_obj_as(TextUrl, url)