Commit: Cleanup and readme.

gandersen101 committed Feb 24, 2021
1 parent c22a5e8 commit 4635a5b
Showing 12 changed files with 306 additions and 261 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -1,8 +1,19 @@
*v0.4.2 Release Notes:*
- *Fixed a bug where `TokenMatcher` callbacks did nothing.*
- *Fixed a bug where `spaczz_token_defaults` in the `SpaczzRuler` did nothing.*
- *Fixed a bug where defaults would not be added to their respective matchers when loading from bytes/disk in the `SpaczzRuler`.*
- *Fixed some inconsistencies in the `SpaczzRuler`, which will be particularly noticeable with ent_ids. See the "Known Issues" section below for more details.*
- *Small tweaks to spaczz custom attributes.*
- *The available fuzzy matching functions changed in RapidFuzz, and spaczz has been updated accordingly.*
- *Preparing for spaCy v3 updates.*


*v0.4.1 Release Notes:*
- *Spaczz's phrase searching algorithm has been further optimized so both the `FuzzyMatcher` and `SimilarityMatcher` should run considerably faster.*
- *The `FuzzyMatcher` and `SimilarityMatcher` now include a `thresh` parameter that defaults to `100`. When matching, if `flex > 0` and the match ratio is >= `thresh` during the initial scan of the document, no optimization will be attempted. By default, perfect matches don't need to be run through match optimization (a usage sketch follows these notes).*
- *PEP585 code updates.*
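
For context, a minimal sketch of how the new `thresh` parameter might be used. This assumes `thresh` can be passed as a matcher-level default like the other fuzzy settings; the text, label, and pattern are made up:

```python
import spacy

from spaczz.matcher import FuzzyMatcher

nlp = spacy.blank("en")
doc = nlp.make_doc("test doc: Grint Anderson")

# thresh=100 (the default) lets a perfect match in the initial scan
# skip match optimization entirely; lowering thresh forces optimization
# even on near-perfect initial matches.
matcher = FuzzyMatcher(nlp.vocab, thresh=100)
matcher.add("NAME", [nlp.make_doc("Grant Andersen")])

for match_id, start, end, ratio in matcher(doc):
    print(match_id, doc[start:end], ratio)
```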


*v0.4.0 Release Notes:*
- *Spaczz now includes a `TokenMatcher` that provides token pattern support like spaCy's `Matcher`. It provides all the same functionality as spaCy's `Matcher` but adds fuzzy and fuzzy-regex support (a usage sketch follows these notes). However, it adds overhead compared to its spaCy counterpart, so it should only be used as needed for fuzzy matching purposes.*
- *Spaczz's custom attributes have been reworked and now initialize within spaczz's root `__init__`. These are set via spaczz pipeline components (currently just the `SpaczzRuler`). The only downside is that I had to remove the `attr` parameter from the `SpaczzRuler` to enable this.*
@@ -11,9 +22,11 @@
- *`min_r1` for the fuzzy phrase matcher is now `50`; this is still low, but not so low that it filters almost nothing out in the initial document scan.*
- *Bug fixes to phrase searching that could cause index errors in spaCy `Span` objects.*
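
A hedged sketch of the new `TokenMatcher` follows. The pattern syntax mirrors spaCy's `Matcher`, with a `"FUZZY"` predicate wrapping a token attribute value; the text and label here are made up:

```python
import spacy

from spaczz.matcher import TokenMatcher

nlp = spacy.blank("en")
doc = nlp.make_doc("Rdley Scot was the director.")  # intentionally misspelled

matcher = TokenMatcher(nlp.vocab)
# Each pattern is a list of token dicts, as with spaCy's Matcher.
matcher.add(
    "DIRECTOR",
    [[{"TEXT": {"FUZZY": "Ridley"}}, {"TEXT": {"FUZZY": "Scott"}}]],
)

for match in matcher(doc):
    print(match)
```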


*v0.3.1 Release Notes:*
- *Spaczz now includes an experimental `SimilarityMatcher` that attempts to match search terms based on vector similarity. It requires a spaCy model with word vectors (e.g. spaCy's medium and large English models) to function properly. See the documentation below for usage details; a minimal sketch also follows.*
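
A minimal sketch of the experimental `SimilarityMatcher`, assuming a vector-equipped model such as `en_core_web_md` is installed; the text, label, and pattern are illustrative:

```python
import spacy

from spaczz.matcher import SimilarityMatcher

# A model with word vectors is required, e.g. spaCy's medium English model.
nlp = spacy.load("en_core_web_md")
doc = nlp("I like apples.")

matcher = SimilarityMatcher(nlp.vocab)
matcher.add("FRUIT", [nlp("fruit")])

for match in matcher(doc):
    print(match)
```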


*v0.3.0 Release Notes:*
- *The `FuzzyMatcher` and `RegexMatcher` now return fuzzy ratio and fuzzy count details, respectively. The behavior of these two matchers is still the same, except they now return lists of tuples of length 4 (match id, start, end, fuzzy details).*
- *This change could be breaking in instances where these tuples are unpacked in the traditional spaCy fashion (match id, start, end). Simply include the fuzzy details or a placeholder during unpacking to fix (see the sketch after these notes).*
@@ -22,6 +35,7 @@
- *If, in the rare case, the same match is made via a fuzzy pattern and regex pattern, the span will have both extensions set with their respective values.*
- *Fixed a bug where the `attr` parameter in the `SpaczzRuler` did not actually change the name of the custom span attribute.*
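
For example, unpacking now takes one extra field compared to the old 3-tuples; a short sketch (text, label, and pattern are made up):

```python
import spacy

from spaczz.matcher import FuzzyMatcher

nlp = spacy.blank("en")
doc = nlp.make_doc("test doc: Grint Anderson")

matcher = FuzzyMatcher(nlp.vocab)
matcher.add("NAME", [nlp.make_doc("Grant Andersen")])

# Before v0.3.0: match_id, start, end = match
# Now: include the fuzzy details (or a throwaway placeholder like `_`).
for match_id, start, end, details in matcher(doc):
    print(match_id, doc[start:end], details)
```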


*v0.2.0 Release Notes:*
- *Fuzzy matching is now performed with [RapidFuzz](https://github.com/maxbachmann/rapidfuzz) instead of [FuzzyWuzzy](https://github.com/seatgeek/fuzzywuzzy).*
- *RapidFuzz is higher performance with a more liberal license.*
239 changes: 163 additions & 76 deletions README.md

Large diffs are not rendered by default.

8 changes: 1 addition & 7 deletions src/spaczz/matcher/_phrasematcher.py
@@ -18,7 +18,7 @@
from spacy.tokens import Doc
from spacy.vocab import Vocab

from ..exceptions import KwargsWarning, PipeDeprecation
from ..exceptions import KwargsWarning
from ..search import _PhraseSearcher
from ..util import nest_defaultdict

@@ -314,12 +314,6 @@ def pipe(
Yields:
`Doc` objects, in order.
"""
warnings.warn(
"""As of spaCy v3.0 and spaczz v0.5 matcher.poipe methods are deprecated.
If you need to match on a stream of documents, you can use nlp.pipe and
call the matcher on each Doc object.""",
PipeDeprecation,
)
if as_tuples:
for doc, context in stream:
matches = self(doc)
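
The removed warning pointed users at `nlp.pipe` as the replacement for `matcher.pipe`; a minimal sketch of that pattern (matcher, label, and texts are placeholders):

```python
import spacy

from spaczz.matcher import FuzzyMatcher

nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("NAME", [nlp.make_doc("Grant Andersen")])

texts = ["test doc 1: Grint Anderson", "test doc 2: Grant Andersen"]

# Stream Docs with nlp.pipe and call the matcher on each Doc object.
for doc in nlp.pipe(texts):
    matches = matcher(doc)
```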
8 changes: 1 addition & 7 deletions src/spaczz/matcher/regexmatcher.py
@@ -18,7 +18,7 @@
from spacy.tokens import Doc
from spacy.vocab import Vocab

from ..exceptions import KwargsWarning, PipeDeprecation
from ..exceptions import KwargsWarning
from ..regex import RegexConfig
from ..search import RegexSearcher
from ..util import nest_defaultdict
@@ -327,12 +327,6 @@ def pipe(
Yields:
Doc objects, in order.
"""
warnings.warn(
"""As of spaCy v3.0 and spaczz v0.5 matcher.poipe methods are deprecated.
If you need to match on a stream of documents, you can use nlp.pipe and
call the matcher on each Doc object.""",
PipeDeprecation,
)
if as_tuples:
for doc, context in stream:
matches = self(doc)
8 changes: 0 additions & 8 deletions src/spaczz/matcher/tokenmatcher.py
@@ -4,13 +4,11 @@
from collections import defaultdict
from copy import deepcopy
from typing import Any, Callable, Generator, Iterable, List, Optional, Tuple, Type
import warnings

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

from ..exceptions import PipeDeprecation
from ..search import TokenSearcher


@@ -287,12 +285,6 @@ def pipe(
Yields:
`Doc` objects, in order.
"""
warnings.warn(
"""As of spaCy v3.0 and spaczz v0.5 matcher.pipe methods are deprecated.
If you need to match on a stream of documents, you can use nlp.pipe and
call the matcher on each Doc object.""",
PipeDeprecation,
)
if as_tuples:
for doc, context in stream:
matches = self(doc)
13 changes: 3 additions & 10 deletions src/spaczz/pipeline/spaczzruler.py
@@ -750,23 +750,16 @@ def _filter_overlapping_matches(
If more than one match span includes the same tokens,
the first of these match spans in matches is kept.
It also removes non-kept matches from the lookup dict.
Args:
matches: List of match span tuples
(match_id, start_index, end_index).
lookup: Match ratio, count and detail values in
a `defaultdict(dict)`.
Returns:
The filtered list of match span tuples.
Example:
>>> import spacy
>>> from spaczz.pipeline import SpaczzRuler
>>> nlp = spacy.blank("en")
>>> ruler = SpaczzRuler(nlp)
>>> matches = [("TEST", 1, 3), ("TEST", 1, 2)]
>>> ruler._filter_overlapping_matches(matches)
[('TEST', 1, 3)]
The filtered list of match span tuples and the lookup dict.
"""
filtered_matches: list[tuple[str, int, int]] = []
for match in matches:
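
The docstring (and the doctest removed above) describes first-match-wins overlap filtering; a standalone, simplified sketch of the same idea, separate from the actual implementation:

```python
# Simplified sketch: keep a match only if none of its tokens
# were already claimed by an earlier (kept) match.
matches = [("TEST", 1, 3), ("TEST", 1, 2)]
seen: set[int] = set()
filtered = []
for match_id, start, end in matches:
    if not seen.intersection(range(start, end)):
        filtered.append((match_id, start, end))
        seen.update(range(start, end))

print(filtered)  # [('TEST', 1, 3)]
```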
14 changes: 2 additions & 12 deletions src/spaczz/process.py
@@ -1,8 +1,7 @@
"""Module for various doc/text processing classes/functions."""
from __future__ import annotations

from itertools import tee
from typing import Any, Callable, Iterable
from typing import Callable

from rapidfuzz import fuzz
from spacy.tokens import Doc
@@ -17,15 +16,6 @@ def map_chars_to_tokens(doc: Doc) -> dict[int, int]:
return chars_to_tokens


def n_wise(iterable: Iterable[Any], n: int) -> Iterable[Any]:
"""Iterates over an iterables in slices of length n by one step at a time."""
iterables = tee(iterable, n)
for i in range(len(iterables)):
for _ in range(i):
next(iterables[i], None)
return zip(*iterables)


class FuzzyFuncs:
"""Container class housing fuzzy matching functions.
@@ -53,7 +43,7 @@ class FuzzyFuncs:
"""

def __init__(self: FuzzyFuncs, match_type: str = "phrase") -> None:
"""Initializes a fuzzyfuncs container.
"""Initializes a `FuzzyFuncs` container.
Args:
match_type: Whether the fuzzy matching functions
3 changes: 2 additions & 1 deletion src/spaczz/search/tokensearcher.py
@@ -7,7 +7,8 @@
from spacy.tokens import Doc, Token
from spacy.vocab import Vocab

from ..process import FuzzyFuncs, n_wise
from ..process import FuzzyFuncs
from ..util import n_wise


class TokenSearcher:
13 changes: 11 additions & 2 deletions src/spaczz/util.py
@@ -3,9 +3,9 @@

from collections import defaultdict
from functools import partial
from itertools import repeat
from itertools import repeat, tee
from pathlib import Path
from typing import Any, Union
from typing import Any, Iterable, Union


def ensure_path(path: Union[str, Path]) -> Path:
@@ -31,6 +31,15 @@ def nest_defaultdict(default_factory: Any, depth: int = 1) -> defaultdict[Any, Any]:
return result()


def n_wise(iterable: Iterable[Any], n: int) -> Iterable[Any]:
"""Iterates over an iterables in slices of length n by one step at a time."""
iterables = tee(iterable, n)
for i in range(len(iterables)):
for _ in range(i):
next(iterables[i], None)
return zip(*iterables)


def read_from_disk(path: Union[str, Path], readers: Any, exclude: Any) -> Path:
"""Reads a pipeline component from disk."""
path = ensure_path(path)
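
For reference, `n_wise` (moved here from `process.py`) yields overlapping windows of length `n`, advancing one element per step:

```python
from spaczz.util import n_wise

print(list(n_wise("abcd", 2)))  # [('a', 'b'), ('b', 'c'), ('c', 'd')]
```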
79 changes: 35 additions & 44 deletions tests/test_matcher/test_fuzzymatcher.py
@@ -2,7 +2,6 @@
from __future__ import annotations

import pickle
import warnings

import pytest
from spacy.language import Language
@@ -170,61 +169,53 @@ def test_matcher_uses_on_match_callback(matcher: FuzzyMatcher, doc: Doc) -> None

def test_matcher_pipe(nlp: Language) -> None:
"""It returns a stream of Doc objects."""
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
doc_stream = (
nlp.make_doc("test doc 1: Corvold"),
nlp.make_doc("test doc 2: Prosh"),
)
matcher = FuzzyMatcher(nlp.vocab)
output = matcher.pipe(doc_stream)
assert list(output) == list(doc_stream)
doc_stream = (
nlp.make_doc("test doc 1: Corvold"),
nlp.make_doc("test doc 2: Prosh"),
)
matcher = FuzzyMatcher(nlp.vocab)
output = matcher.pipe(doc_stream)
assert list(output) == list(doc_stream)


def test_matcher_pipe_with_context(nlp: Language) -> None:
"""It returns a stream of Doc objects as tuples with context."""
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
doc_stream = (
(nlp.make_doc("test doc 1: Corvold"), "Jund"),
(nlp.make_doc("test doc 2: Prosh"), "Jund"),
)
matcher = FuzzyMatcher(nlp.vocab)
output = matcher.pipe(doc_stream, as_tuples=True)
assert list(output) == list(doc_stream)
doc_stream = (
(nlp.make_doc("test doc 1: Corvold"), "Jund"),
(nlp.make_doc("test doc 2: Prosh"), "Jund"),
)
matcher = FuzzyMatcher(nlp.vocab)
output = matcher.pipe(doc_stream, as_tuples=True)
assert list(output) == list(doc_stream)


def test_matcher_pipe_with_matches(nlp: Language) -> None:
"""It returns a stream of Doc objects and matches as tuples."""
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
doc_stream = (
nlp.make_doc("test doc 1: Corvold"),
nlp.make_doc("test doc 2: Prosh"),
)
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
output = matcher.pipe(doc_stream, return_matches=True)
matches = [entry[1] for entry in output]
assert matches == [[("DRAGON", 4, 5, 86)], [("DRAGON", 4, 5, 91)]]
doc_stream = (
nlp.make_doc("test doc 1: Corvold"),
nlp.make_doc("test doc 2: Prosh"),
)
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
output = matcher.pipe(doc_stream, return_matches=True)
matches = [entry[1] for entry in output]
assert matches == [[("DRAGON", 4, 5, 86)], [("DRAGON", 4, 5, 91)]]


def test_matcher_pipe_with_matches_and_context(nlp: Language) -> None:
"""It returns a stream of Doc objects and matches and context as tuples."""
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
doc_stream = (
(nlp.make_doc("test doc 1: Corvold"), "Jund"),
(nlp.make_doc("test doc 2: Prosh"), "Jund"),
)
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
matches = [(entry[0][1], entry[1]) for entry in output]
assert matches == [
([("DRAGON", 4, 5, 86)], "Jund"),
([("DRAGON", 4, 5, 91)], "Jund"),
]
doc_stream = (
(nlp.make_doc("test doc 1: Corvold"), "Jund"),
(nlp.make_doc("test doc 2: Prosh"), "Jund"),
)
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("DRAGON", [nlp.make_doc("Korvold"), nlp.make_doc("Prossh")])
output = matcher.pipe(doc_stream, return_matches=True, as_tuples=True)
matches = [(entry[0][1], entry[1]) for entry in output]
assert matches == [
([("DRAGON", 4, 5, 86)], "Jund"),
([("DRAGON", 4, 5, 91)], "Jund"),
]


def test_pickling_matcher(matcher: FuzzyMatcher) -> None: