Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: pass validation for urls with not ext
Signed-off-by: jupyterjazz <[email protected]>
  • Loading branch information
jupyterjazz committed Jun 27, 2023
commit 9464fb78536b94b5d1164575f45460750d00013f
4 changes: 2 additions & 2 deletions docarray/documents/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ class MultiModalDoc(BaseDoc):
```python
from docarray.documents import TextDoc

doc = TextDoc(text='This is the main text', url='exampleurl.com/file.txt')
doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file.txt')
doc = TextDoc(text='This is the main text', url='exampleurl.com')
doc2 = TextDoc(text='This is the main text', url='exampleurl.com')

doc == 'This is the main text' # True
doc == doc2 # True
Expand Down
27 changes: 23 additions & 4 deletions docarray/typing/url/any_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,27 +52,46 @@ def _to_node_protobuf(self) -> 'NodeProto':

return NodeProto(text=str(self), type=self._proto_type_name)

@staticmethod
def _get_url_extension(url):
    """
    Return the file extension found in the path component of a URL.

    The URL is parsed first and only its path is examined, so a query
    string such as ``?model=gpt-4`` never contributes to the extension.

    :param url: The URL (or plain file path) to inspect.
    :return: The extension without its leading period, or None when the
        path carries no extension.
    """
    url_path = urllib.parse.urlparse(url).path
    _, suffix = os.path.splitext(url_path)
    # splitext keeps the leading period on the suffix; drop it.
    if suffix.startswith('.'):
        suffix = suffix[1:]
    # An empty suffix means "no extension", reported as None.
    return suffix or None

@classmethod
def is_extension_allowed(cls, value: Any) -> bool:
    """
    Check if the file extension of the URL is allowed for this class.

    The base ``AnyUrl`` class accepts everything. For subclasses, a URL
    without a file extension is always accepted. Otherwise the mime type
    is guessed from the URL (query string removed) and compared against
    the class's mime type; if that does not match, the extension is looked
    up in the class's extra extensions.

    Note: This method assumes that any URL without an extension is valid.

    :param value: The URL or file path.
    :return: True if the extension is allowed, False otherwise
    """
    if cls is AnyUrl:
        return True

    url_parts = value.split('?')
    extension = cls._get_url_extension(value)
    if not extension:
        return True

    # Guess from the part before the query string so that query
    # parameters cannot influence the detected mime type.
    mimetype, _ = mimetypes.guess_type(url_parts[0])
    if mimetype and mimetype.startswith(cls.mime_type()):
        return True

    # Use the extension parsed from the URL path; re-deriving it by
    # splitting on '.' would treat dots in the host name as an extension.
    return extension in cls.extra_extensions()

@classmethod
Expand Down
2 changes: 1 addition & 1 deletion docarray/typing/url/text_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class MyDoc(BaseDoc):


doc = MyDoc(
remote_url='https://www.gutenberg.org/files/1065/1065-0.txt',
remote_url='https://de.wikipedia.org/wiki/Brixen',
)

remote_txt = doc.remote_url.load()
Expand Down
2 changes: 1 addition & 1 deletion tests/index/weaviate/test_index_get_del_weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ class MyMultiModalDoc(BaseDoc):


def test_index_document_with_bytes(weaviate_client):
doc = ImageDoc(id="1", url="www.foo.com/test.png", bytes_=b"foo")
doc = ImageDoc(id="1", url="www.foo.com", bytes_=b"foo")

index = WeaviateDocumentIndex[ImageDoc]()
index.index([doc])
Expand Down
2 changes: 2 additions & 0 deletions tests/integrations/predefined_document/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
str(TOYDATA_DIR / 'hello.ogg'),
str(TOYDATA_DIR / 'hello.wma'),
str(TOYDATA_DIR / 'hello.aac'),
str(TOYDATA_DIR / 'hello'),
]

LOCAL_AUDIO_FILES_AND_FORMAT = [
Expand All @@ -39,6 +40,7 @@
(str(TOYDATA_DIR / 'hello.ogg'), 'ogg'),
(str(TOYDATA_DIR / 'hello.wma'), 'asf'),
(str(TOYDATA_DIR / 'hello.aac'), 'adts'),
(str(TOYDATA_DIR / 'hello'), 'wav'),
]

NON_AUDIO_FILES = [
Expand Down
2 changes: 1 addition & 1 deletion tests/integrations/typing/test_typing_proto.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class Mymmdoc(BaseDoc):
embedding=np.zeros((100, 1)),
any_url='http://jina.ai',
image_url='http://jina.ai/bla.jpg',
text_url='http://jina.ai/file.txt',
text_url='http://jina.ai',
mesh_url='http://jina.ai/mesh.obj',
point_cloud_url='http://jina.ai/mesh.obj',
)
Expand Down
6 changes: 3 additions & 3 deletions tests/units/document/test_docs_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@


def test_text_document_operators():
doc = TextDoc(text='text', url='http://url.com/file.txt')
doc = TextDoc(text='text', url='http://url.com')

assert doc == 'text'
assert doc != 'http://url.com'

doc2 = TextDoc(id=doc.id, text='text', url='http://url.com/file.txt')
doc2 = TextDoc(id=doc.id, text='text', url='http://url.com')
assert doc == doc2

doc3 = TextDoc(id='other-id', text='text', url='http://url.com/file.txt')
doc3 = TextDoc(id='other-id', text='text', url='http://url.com')
assert doc == doc3

assert 't' in doc
Expand Down
2 changes: 1 addition & 1 deletion tests/units/test_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class MyDoc(BaseDoc):
docs=DocList[VideoDoc](
[
VideoDoc(
url=f'http://example.ai/videos/{i}.mp4',
url=f'http://example.ai/videos/{i}',
tensor_video=rand(256),
)
for i in range(10)
Expand Down
17 changes: 17 additions & 0 deletions tests/units/typing/url/test_any_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,20 @@ def test_operators():
assert url != 'aljdñjd'
assert 'data' in url
assert 'docarray' not in url


def test_get_url_extension():
    """Check extension extraction for URLs, plain text and empty input."""
    # Test with a URL with extension
    assert AnyUrl._get_url_extension('https://jina.ai/hey.md?model=gpt-4') == 'md'
    assert AnyUrl._get_url_extension('https://jina.ai/text.txt') == 'txt'
    assert AnyUrl._get_url_extension('bla.jpg') == 'jpg'

    # Test with a URL without extension
    assert AnyUrl._get_url_extension('https://jina.ai') is None
    assert AnyUrl._get_url_extension('https://jina.ai/?model=gpt-4') is None

    # Test with a text without extension
    assert AnyUrl._get_url_extension('some_text') is None

    # Test with empty input
    assert AnyUrl._get_url_extension('') is None
8 changes: 4 additions & 4 deletions tests/units/typing/url/test_text_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
)
from tests import TOYDATA_DIR

REMOTE_TEXT_FILE = 'https://www.gutenberg.org/files/1065/1065-0.txt'
REMOTE_TEXT_FILE = 'https://de.wikipedia.org/wiki/Brixen'
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
LOCAL_TEXT_FILES = [
str(TOYDATA_DIR / 'penal_colony.txt'),
Expand All @@ -39,13 +39,13 @@
@pytest.mark.internet
@pytest.mark.parametrize(
'url,expected_beginning',
[(REMOTE_TEXT_FILE, 'The Project Gutenberg'), *LOCAL_TEXT_FILES_AND_BEGINNING],
[(REMOTE_TEXT_FILE, '<!DOCTYPE html>'), *LOCAL_TEXT_FILES_AND_BEGINNING],
)
def test_load(url, expected_beginning):
uri = parse_obj_as(TextUrl, url)

txt = uri.load()
assert expected_beginning in txt
assert txt.startswith(expected_beginning)


@pytest.mark.slow
Expand All @@ -61,7 +61,7 @@ def test_load_to_bytes(url):
@pytest.mark.proto
@pytest.mark.slow
@pytest.mark.internet
@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE])
@pytest.mark.parametrize('url', [REMOTE_TEXT_FILE, *LOCAL_TEXT_FILES])
def test_proto_text_url(url):
uri = parse_obj_as(TextUrl, url)

Expand Down