Skip to content

Commit

Permalink
PEP585 updates.
Browse files Browse the repository at this point in the history
  • Loading branch information
gandersen101 committed Jan 30, 2021
1 parent 9564b0a commit 1b01e4f
Show file tree
Hide file tree
Showing 18 changed files with 135 additions and 141 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
*v0.4.1 Release Notes:*
- *Spaczz's phrase searching algorithm has been further optimized so both the `FuzzyMatcher` and `SimilarityMatcher` should run considerably faster.*
- *The `FuzzyMatcher` and `SimilarityMatcher` now include a `thresh` parameter that defaults to `100`. When matching, if `flex > 0` and the match ratio is >= `thresh` during the initial scan of the document, no optimization will be attempted. By default perfect matches don't need to be run through match optimization.*
- *PEP585 code updates.*

*v0.4.0 Release Notes:*
- *Spaczz now includes a `TokenMatcher` that provides token pattern support like spaCy's `Matcher`. It provides all the same functionality as spaCy's `Matcher` but adds fuzzy and fuzzy-regex support. However, it adds additional overhead to its spaCy counterpart so it should only be used as needed for fuzzy matching purposes.*
- *Spaczz's custom attributes have been reworked and now initialize within spaczz's root `__init__`. These are set via spaczz pipeline components (currently just the `SpaczzRuler`). The only downside is that I had to remove the `attr` parameter from the `SpaczzRuler` to enable this.*
Expand Down
19 changes: 10 additions & 9 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


package = "spaczz"
nox.options.sessions = "lint", "mypy", "safety", "tests", "typeguard"
nox.options.sessions = "lint", "mypy", "safety", "tests"
locations = "src", "tests", "noxfile.py", "docs/conf.py"


Expand Down Expand Up @@ -151,14 +151,15 @@ def tests(session: Session) -> None:
session.run("pytest", *args)


@nox.session(python=["3.9", "3.8", "3.7"])
def typeguard(session: Session) -> None:
    """Runtime type checking using Typeguard.

    Installs the package itself without dev dependencies, then adds the
    pinned test tooling (pytest, pytest-mock, typeguard) via
    install_with_constraints, downloads the spaCy model the test suite
    uses, and finally runs pytest with Typeguard instrumenting the
    package under test.
    """
    # Extra command-line arguments passed to the nox session are
    # forwarded verbatim to pytest below.
    args = session.posargs
    session.run("poetry", "install", "--no-dev", external=True)
    install_with_constraints(session, "pytest", "pytest-mock", "typeguard")
    # The test suite exercises spaCy pipelines, so the medium English
    # model must be present before pytest runs.
    session.run("python", "-m", "spacy", "download", "en_core_web_md")
    session.run("pytest", f"--typeguard-packages={package}", *args)
# Typeguard does not seem to currently work with PEP585
# @nox.session(python=["3.9", "3.8", "3.7"])
# def typeguard(session: Session) -> None:
# """Runtime type checking using Typeguard."""
# args = session.posargs
# session.run("poetry", "install", "--no-dev", external=True)
# install_with_constraints(session, "pytest", "pytest-mock", "typeguard")
# session.run("python", "-m", "spacy", "download", "en_core_web_md")
# session.run("pytest", f"--typeguard-packages={package}", *args)


@nox.session(python=["3.9", "3.8", "3.7"])
Expand Down
10 changes: 5 additions & 5 deletions src/spaczz/attrs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Custom spaCy attributes for spaczz."""
from __future__ import annotations

from typing import Iterable, Optional, Set, Tuple, Type
from typing import Iterable, Optional, Type
import warnings

from spacy.tokens import Doc, Span, Token
Expand Down Expand Up @@ -69,7 +69,7 @@ def get_spaczz_span(span: Span) -> bool:
return all([token._.spaczz_token for token in span])

@staticmethod
def get_token_types(token: Token) -> Set[str]:
def get_token_types(token: Token) -> set[str]:
"""Getter for spaczz_types `Token` attribute."""
types = set()
if token._.spaczz_ratio:
Expand All @@ -81,7 +81,7 @@ def get_token_types(token: Token) -> Set[str]:
return types

@classmethod
def get_span_types(cls: Type[SpaczzAttrs], span: Span) -> Set[str]:
def get_span_types(cls: Type[SpaczzAttrs], span: Span) -> set[str]:
"""Getter for spaczz_types `Span` attribute."""
types = set()
if cls.get_ratio(span):
Expand All @@ -103,7 +103,7 @@ def get_ratio(cls: Type[SpaczzAttrs], span: Span) -> Optional[int]:
@classmethod
def get_counts(
cls: Type[SpaczzAttrs], span: Span
) -> Optional[Tuple[int, int, int]]:
) -> Optional[tuple[int, int, int]]:
"""Getter for spaczz_counts `Span` attribute."""
if cls._all_equal([token._.spaczz_counts for token in span]):
return span[0]._.spaczz_counts
Expand All @@ -124,7 +124,7 @@ def get_spaczz_doc(doc: Doc) -> bool:
return any([token._.spaczz_token for token in doc])

@staticmethod
def get_doc_types(doc: Doc) -> Set[str]:
def get_doc_types(doc: Doc) -> set[str]:
"""Getter for spaczz_types `Doc` attribute."""
types = set()
for token in doc:
Expand Down
23 changes: 10 additions & 13 deletions src/spaczz/matcher/_phrasematcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,9 @@
Any,
Callable,
DefaultDict,
Dict,
Generator,
Iterable,
List,
Optional,
Tuple,
)
import warnings

Expand Down Expand Up @@ -61,16 +58,16 @@ def __init__(self: _PhraseMatcher, vocab: Vocab, **defaults: Any) -> None:
"""
self.defaults = defaults
self.type = "_phrase"
self._callbacks: Dict[
self._callbacks: dict[
str,
Optional[Callable[[Any, Doc, int, List[Tuple[str, int, int, int]]], None]],
Optional[Callable[[Any, Doc, int, list[tuple[str, int, int, int]]], None]],
] = {} # Any type due to inheritence typing issue.
self._patterns: DefaultDict[str, DefaultDict[str, Any]] = defaultdict(
lambda: defaultdict(list)
)
self._searcher = _PhraseSearcher(vocab=vocab)

def __call__(self: _PhraseMatcher, doc: Doc) -> List[Tuple[str, int, int, int]]:
def __call__(self: _PhraseMatcher, doc: Doc) -> list[tuple[str, int, int, int]]:
"""Find all sequences matching the supplied patterns in the doc.
Args:
Expand Down Expand Up @@ -120,7 +117,7 @@ def __len__(self: _PhraseMatcher) -> int:
return len(self._patterns)

@property
def labels(self: _PhraseMatcher) -> Tuple[str, ...]:
def labels(self: _PhraseMatcher) -> tuple[str, ...]:
"""All labels present in the matcher.
Returns:
Expand All @@ -138,7 +135,7 @@ def labels(self: _PhraseMatcher) -> Tuple[str, ...]:
return tuple(self._patterns.keys())

@property
def patterns(self: _PhraseMatcher) -> List[Dict[str, Any]]:
def patterns(self: _PhraseMatcher) -> list[dict[str, Any]]:
"""Get all patterns and kwargs that were added to the matcher.
Returns:
Expand Down Expand Up @@ -179,10 +176,10 @@ def vocab(self: _PhraseMatcher) -> Vocab:
def add(
self: _PhraseMatcher,
label: str,
patterns: List[Doc],
kwargs: Optional[List[Dict[str, Any]]] = None,
patterns: list[Doc],
kwargs: Optional[list[dict[str, Any]]] = None,
on_match: Optional[
Callable[[Any, Doc, int, List[Tuple[str, int, int, int]]], None]
Callable[[Any, Doc, int, list[tuple[str, int, int, int]]], None]
] = None, # Any type due to inheritence typing issue.
) -> None:
"""Add a rule to the matcher, consisting of a label and one or more patterns.
Expand All @@ -207,11 +204,11 @@ def add(
TypeError: If kwargs is not an iterable dictionaries.
Warnings:
UserWarning:
KwargsWarning:
If there are more patterns than kwargs
default matching settings will be used
for extra patterns.
UserWarning:
KwargsWarning:
If there are more kwargs dicts than patterns,
the extra kwargs will be ignored.
Expand Down
24 changes: 10 additions & 14 deletions src/spaczz/matcher/regexmatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,9 @@
Any,
Callable,
DefaultDict,
Dict,
Generator,
Iterable,
List,
Optional,
Sequence,
Tuple,
Union,
)
import warnings
Expand Down Expand Up @@ -73,15 +69,15 @@ def __init__(
"""
self.defaults = defaults
self.type = "regex"
self._callbacks: Dict[
self._callbacks: dict[
str,
Optional[
Callable[
[
RegexMatcher,
Doc,
int,
List[Tuple[str, int, int, Tuple[int, int, int]]],
list[tuple[str, int, int, tuple[int, int, int]]],
],
None,
],
Expand All @@ -94,7 +90,7 @@ def __init__(

def __call__(
self: RegexMatcher, doc: Doc
) -> List[Tuple[str, int, int, Tuple[int, int, int]]]:
) -> list[tuple[str, int, int, tuple[int, int, int]]]:
r"""Find all sequences matching the supplied patterns in the doc.
Args:
Expand Down Expand Up @@ -145,7 +141,7 @@ def __len__(self: RegexMatcher) -> int:
return len(self._patterns)

@property
def labels(self: RegexMatcher) -> Tuple[str, ...]:
def labels(self: RegexMatcher) -> tuple[str, ...]:
"""All labels present in the matcher.
Returns:
Expand All @@ -163,7 +159,7 @@ def labels(self: RegexMatcher) -> Tuple[str, ...]:
return tuple(self._patterns.keys())

@property
def patterns(self: RegexMatcher) -> List[Dict[str, Any]]:
def patterns(self: RegexMatcher) -> list[dict[str, Any]]:
"""Get all patterns and kwargs that were added to the matcher.
Returns:
Expand Down Expand Up @@ -203,15 +199,15 @@ def vocab(self: RegexMatcher) -> Vocab:
def add(
self: RegexMatcher,
label: str,
patterns: Sequence[str],
kwargs: Optional[List[Dict[str, Any]]] = None,
patterns: list[str],
kwargs: Optional[list[dict[str, Any]]] = None,
on_match: Optional[
Callable[
[
RegexMatcher,
Doc,
int,
List[Tuple[str, int, int, Tuple[int, int, int]]],
list[tuple[str, int, int, tuple[int, int, int]]],
],
None,
]
Expand Down Expand Up @@ -240,11 +236,11 @@ def add(
TypeError: If kwargs is not a iterable of dictionaries.
Warnings:
UserWarning:
KwargsWarning:
If there are more patterns than kwargs
default regex matching settings will be used
for extra patterns.
UserWarning:
KwargsWarning:
If there are more kwargs dictionaries than patterns,
the extra kwargs will be ignored.
Expand Down
4 changes: 0 additions & 4 deletions src/spaczz/matcher/similaritymatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,6 @@ def __init__(self: SimilarityMatcher, vocab: Vocab, **defaults: Any) -> None:
be used as default matching settings.
These arguments will become the new defaults for matching.
See `SimilaritySearcher` documentation for details.
Warnings:
UserWarning:
If vocab does not contain any word vectors.
"""
super().__init__(vocab=vocab, **defaults)
self.type = "similarity"
Expand Down
23 changes: 10 additions & 13 deletions src/spaczz/matcher/tokenmatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
Any,
Callable,
DefaultDict,
Dict,
Generator,
Iterable,
List,
Optional,
Tuple,
Union,
)

Expand Down Expand Up @@ -71,19 +68,19 @@ def __init__(self: TokenMatcher, vocab: Vocab, **defaults: Any) -> None:
"""
self.defaults = defaults
self.type = "token"
self._callbacks: Dict[
self._callbacks: dict[
str,
Union[
Callable[
[TokenMatcher, Doc, int, List[Tuple[str, int, int, None]]], None
[TokenMatcher, Doc, int, list[tuple[str, int, int, None]]], None
],
None,
],
] = {}
self._patterns: DefaultDict[str, List[List[Dict[str, Any]]]] = defaultdict(list)
self._patterns: DefaultDict[str, list[list[dict[str, Any]]]] = defaultdict(list)
self._searcher = TokenSearcher(vocab=vocab)

def __call__(self: TokenMatcher, doc: Doc) -> List[Tuple[str, int, int, None]]:
def __call__(self: TokenMatcher, doc: Doc) -> list[tuple[str, int, int, None]]:
"""Find all sequences matching the supplied patterns in the doc.
Args:
Expand Down Expand Up @@ -135,7 +132,7 @@ def __len__(self: TokenMatcher) -> int:
return len(self._patterns)

@property
def labels(self: TokenMatcher) -> Tuple[str, ...]:
def labels(self: TokenMatcher) -> tuple[str, ...]:
"""All labels present in the matcher.
Returns:
Expand All @@ -153,7 +150,7 @@ def labels(self: TokenMatcher) -> Tuple[str, ...]:
return tuple(self._patterns.keys())

@property
def patterns(self: TokenMatcher) -> List[Dict[str, Any]]:
def patterns(self: TokenMatcher) -> list[dict[str, Any]]:
"""Get all patterns that were added to the matcher.
Returns:
Expand Down Expand Up @@ -189,9 +186,9 @@ def vocab(self: TokenMatcher) -> Vocab:
def add(
self: TokenMatcher,
label: str,
patterns: List[List[Dict[str, Any]]],
patterns: list[list[dict[str, Any]]],
on_match: Optional[
Callable[[TokenMatcher, Doc, int, List[Tuple[str, int, int, None]]], None]
Callable[[TokenMatcher, Doc, int, list[tuple[str, int, int, None]]], None]
] = None,
) -> None:
"""Add a rule to the matcher, consisting of a label and one or more patterns.
Expand Down Expand Up @@ -322,8 +319,8 @@ def pipe(


def _spacyfy(
matches: List[List[Optional[Tuple[str, str]]]], pattern: List[Dict[str, Any]]
) -> List[List[Dict[str, Any]]]:
matches: list[list[Optional[tuple[str, str]]]], pattern: list[dict[str, Any]]
) -> list[list[dict[str, Any]]]:
"""Turns token searcher matches into spaCy `Matcher` compatible patterns."""
new_patterns = []
if matches:
Expand Down
Loading

0 comments on commit 1b01e4f

Please sign in to comment.