Updating docs and readme

gandersen101 · May 1, 2023 · eea7c84 · eea7c84
1 parent 306118a
commit eea7c84
Show file tree

Hide file tree

Showing 25 changed files with 823 additions and 678 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
+*v0.6.0 Release Notes:*
+- *All matchers now return the matching pattern. This is a breaking change, as matches are now tuples of length 5 instead of 4.*
+- *Regex and token matches now return match ratios.*
+- *Support for `python<=3.11,>=3.7`, along with `rapidfuzz>=1.0.0`.*
+- *Dropped support for spaCy v2. Sorry to do this without a deprecation cycle, but I stepped away from this project for a long time.*
+- *Removed support for `"spaczz_"` prepended optional `SpaczzRuler` init arguments. Also, sorry to do this without a deprecation cycle.*
+- *`Matcher.pipe` methods, which were deprecated, are now removed.*
+- *`spaczz_span` custom attribute, which was deprecated, is now removed.*
+
 *v0.5.4 Release Notes:*
-- BugFix for german Combination words for RegexSearcher.
+- BugFix for German combination words for RegexSearcher.
 - Other minor docs/tooling updates.
 
 

diff --git a/README.md b/README.md
diff --git a/notebooks/README.ipynb b/notebooks/README.ipynb
diff --git a/notebooks/fuzzy_matching_tweaks.ipynb b/notebooks/fuzzy_matching_tweaks.ipynb
@@ -30,8 +30,8 @@
    "id": "killing-refrigerator",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:14.175873Z",
-     "start_time": "2023-04-24T21:37:13.543204Z"
+     "end_time": "2023-05-01T03:07:36.303723Z",
+     "start_time": "2023-05-01T03:07:35.754256Z"
     }
    },
    "outputs": [],
@@ -50,8 +50,8 @@
    "id": "satellite-iraqi",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:14.233985Z",
-     "start_time": "2023-04-24T21:37:14.231401Z"
+     "end_time": "2023-05-01T03:07:36.895321Z",
+     "start_time": "2023-05-01T03:07:36.891744Z"
     }
    },
    "outputs": [],
@@ -73,8 +73,8 @@
    "id": "united-drive",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:15.091156Z",
-     "start_time": "2023-04-24T21:37:15.087229Z"
+     "end_time": "2023-05-01T03:07:38.238639Z",
+     "start_time": "2023-05-01T03:07:38.234477Z"
     }
    },
    "outputs": [],
@@ -105,8 +105,8 @@
    "id": "experimental-yugoslavia",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:16.350023Z",
-     "start_time": "2023-04-24T21:37:16.133813Z"
+     "end_time": "2023-05-01T03:07:40.265853Z",
+     "start_time": "2023-05-01T03:07:40.003313Z"
     }
    },
    "outputs": [],
@@ -130,8 +130,8 @@
    "id": "subtle-marsh",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:17.264280Z",
-     "start_time": "2023-04-24T21:37:17.239135Z"
+     "end_time": "2023-05-01T03:07:41.141043Z",
+     "start_time": "2023-05-01T03:07:41.115500Z"
     }
    },
    "outputs": [
@@ -169,8 +169,8 @@
    "id": "naval-pilot",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:18.658683Z",
-     "start_time": "2023-04-24T21:37:18.631286Z"
+     "end_time": "2023-05-01T03:07:42.574251Z",
+     "start_time": "2023-05-01T03:07:42.546920Z"
     }
    },
    "outputs": [
@@ -199,9 +199,9 @@
    "id": "mineral-cradle",
    "metadata": {},
    "source": [
-    "Uh oh. Why does \"and Ireland\" match to \"Åland Islands\" when \"Ireland\" is in the patterns and provides a 100% match with \"Ireland\" in the text? This happens because as long as the `min_r2` parameter is exceeded in fuzzy matching, spaczz considers this a match and will prioritize longer matches (in tokens) over shorter matches.\n",
+    "Uh oh. Why does \"and Ireland\" match to \"Åland Islands\" when \"Ireland\" is in the patterns and provides a 100% match with \"Ireland\" in the text? This happens because as long as the `min_r` parameter is exceeded in fuzzy matching, spaczz considers this a match and will prioritize longer matches (in tokens) over shorter matches.\n",
     "\n",
-    "By default the fuzzy matcher uses a `min_r2` of `75`. It also lower-cases input by default, which on-average results in higher match ratios. See the results below:"
+    "By default the fuzzy matcher uses a `min_r` of `75`. It also lower-cases input by default, which on average results in higher match ratios. See the results below:"
    ]
   },
   {
@@ -210,8 +210,8 @@
    "id": "basic-wagner",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:20.030933Z",
-     "start_time": "2023-04-24T21:37:20.023237Z"
+     "end_time": "2023-05-01T03:07:43.561268Z",
+     "start_time": "2023-05-01T03:07:43.554379Z"
     }
    },
    "outputs": [
@@ -237,11 +237,11 @@
    "id": "irish-miracle",
    "metadata": {},
    "source": [
-    "This exactly meets the default `min_r2` threshold. Many use-cases will likely require increasing this value, and the optimal value may vary from pattern to pattern. For example, shorter patterns (in characters) may need a higher `min_r2` than longer patterns to provide good matches. A better method for setting a good `min_r2` is a process I would like to provide some automated and/or heuristic-based options for in the future but they do not exist at this time.\n",
+    "This exactly meets the default `min_r` threshold. Many use-cases will likely require increasing this value, and the optimal value may vary from pattern to pattern. For example, shorter patterns (in characters) may need a higher `min_r` than longer patterns to provide good matches. A better method for setting a good `min_r` is a process I would like to provide some automated and/or heuristic-based options for in the future but they do not exist at this time.\n",
     "\n",
     "Why not prioritize higher ratios over longer matches? Because shorter matches will have a distinct advantage. Say in the above string we are searching, \"Northern Ireland\" was misspelled as \"Norten Ireland\"? If we prioritize ratio, then the pattern \"Ireland\" will match with the text \"Ireland\" and leave off \"Norten\", even though from a fuzzy matching standpoint, we would likely want \"Norten Ireland\" to match with \"Northern Ireland\"\n",
     "\n",
-    "So to address this we will often want to tweak `min_r2` either per-pattern or for the entire pipeline. We will increase `min_r2` for the entire pipeline below."
+    "So to address this we will often want to tweak `min_r` either per-pattern or for the entire pipeline. We will increase `min_r` for the entire pipeline below."
    ]
   },
   {
@@ -258,15 +258,15 @@
    "id": "generic-trade",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:21.608955Z",
-     "start_time": "2023-04-24T21:37:21.431271Z"
+     "end_time": "2023-05-01T03:07:45.656738Z",
+     "start_time": "2023-05-01T03:07:45.483210Z"
     }
    },
    "outputs": [],
    "source": [
     "nlp = spacy.blank(\"en\")\n",
     "spaczz_ruler = nlp.add_pipe(\n",
-    "    \"spaczz_ruler\", config={\"fuzzy_defaults\": {\"min_r2\": 85}}\n",
+    "    \"spaczz_ruler\", config={\"fuzzy_defaults\": {\"min_r\": 85}}\n",
     ")  # increase from 75 and applies to each pattern.\n",
     "spaczz_ruler.add_patterns(fuzzy_patterns)"
    ]
@@ -277,8 +277,8 @@
    "id": "helpful-dakota",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:22.346341Z",
-     "start_time": "2023-04-24T21:37:22.318340Z"
+     "end_time": "2023-05-01T03:07:46.501599Z",
+     "start_time": "2023-05-01T03:07:46.475531Z"
     }
    },
    "outputs": [
@@ -324,8 +324,8 @@
    "id": "caroline-gamma",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:24.336949Z",
-     "start_time": "2023-04-24T21:37:24.333692Z"
+     "end_time": "2023-05-01T03:07:48.126735Z",
+     "start_time": "2023-05-01T03:07:48.123353Z"
     }
    },
    "outputs": [],
@@ -356,8 +356,8 @@
    "id": "celtic-insurance",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:25.920416Z",
-     "start_time": "2023-04-24T21:37:25.741332Z"
+     "end_time": "2023-05-01T03:07:49.288399Z",
+     "start_time": "2023-05-01T03:07:49.113802Z"
     }
    },
    "outputs": [],
@@ -373,8 +373,8 @@
    "id": "5429a9c1",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:27.872724Z",
-     "start_time": "2023-04-24T21:37:26.989871Z"
+     "end_time": "2023-05-01T03:07:51.359740Z",
+     "start_time": "2023-05-01T03:07:50.418908Z"
     }
    },
    "outputs": [
@@ -408,9 +408,9 @@
     }
    },
    "source": [
-    "Yep. It looks like the default `min_r2` value of `75` is far to permissive for many of these shorter patterns. As mentioned in example 2, a better method for setting a good `min_r2` is a process I would like to provide some automated and/or heuristic-based options for in the future but they do not exist yet.\n",
+    "Yep. It looks like the default `min_r` value of `75` is far too permissive for many of these shorter patterns.\n",
     "\n",
-    "In this situation we could also increase the `min_r2` for the entire pipeline like we did in example 2, or we could try changing the `min_r2` on a pattern level. Let's try the latter this time.\n",
+    "In this situation we could also increase the `min_r` for the entire pipeline like we did in example 2, or we could try changing the `min_r` on a pattern level. Let's try the latter this time.\n",
     "\n",
     "But first there is one tweak we can make to the entire pipeline (also available on the pattern-level) that might also help: enabling case-sensitivity which is disabled by default. Case sensitive matches will lower the match ratio between potential matches with different casings."
    ]
@@ -429,8 +429,8 @@
    "id": "national-growing",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:29.569233Z",
-     "start_time": "2023-04-24T21:37:29.391212Z"
+     "end_time": "2023-05-01T03:07:54.832300Z",
+     "start_time": "2023-05-01T03:07:54.651476Z"
     }
    },
    "outputs": [],
@@ -448,8 +448,8 @@
    "id": "2f0617f0",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:31.129405Z",
-     "start_time": "2023-04-24T21:37:30.239477Z"
+     "end_time": "2023-05-01T03:07:56.831205Z",
+     "start_time": "2023-05-01T03:07:55.939471Z"
     }
    },
    "outputs": [
@@ -482,7 +482,7 @@
     "\n",
     "**Note**\n",
     "\n",
-    "With short enough patterns (less than 5-6 or so characters long) fuzzy matching becomes less useful. Using the default fuzzy matching settings \"Chad\" matches with \"had\" with a ratio of 75 and there isn't a ratio between that and an 100% match. Setting a `min_r2` of say `95` with these short patterns is effectively setting it to `100`. Therefore, short patterns are probably better used with spaCy's `EntityRuler` for it's far superior speed."
+    "With short enough patterns (less than 5-6 or so characters long) fuzzy matching becomes less useful. Using the default fuzzy matching settings \"Chad\" matches with \"had\" with a ratio of 75 and there isn't a ratio between that and a 100% match. Setting a `min_r` of say `95` with these short patterns is effectively setting it to `100`. Therefore, short patterns are probably better used with spaCy's `EntityRuler` for its far superior speed."
    ]
   },
   {
@@ -491,8 +491,8 @@
    "id": "developed-delta",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:32.476780Z",
-     "start_time": "2023-04-24T21:37:32.470902Z"
+     "end_time": "2023-05-01T03:07:59.617420Z",
+     "start_time": "2023-05-01T03:07:59.611640Z"
     }
    },
    "outputs": [],
@@ -508,9 +508,9 @@
     "        \"id\": pattern[\"name\"],\n",
     "    }\n",
     "    if len(template[\"pattern\"]) < 5:\n",
-    "        template[\"kwargs\"] = {\"min_r2\": 100}  # see note above\n",
+    "        template[\"kwargs\"] = {\"min_r\": 100}  # see note above\n",
     "    elif len(template[\"pattern\"]) >= 5 and len(template[\"pattern\"]) < 8:\n",
-    "        template[\"kwargs\"] = {\"min_r2\": 85}\n",
+    "        template[\"kwargs\"] = {\"min_r\": 85}\n",
     "    fuzzy_patterns.append(template)"
    ]
   },
@@ -528,8 +528,8 @@
    "id": "preceding-swimming",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:33.617444Z",
-     "start_time": "2023-04-24T21:37:33.451866Z"
+     "end_time": "2023-05-01T03:08:02.369759Z",
+     "start_time": "2023-05-01T03:08:02.187639Z"
     }
    },
    "outputs": [],
@@ -555,8 +555,8 @@
    "id": "gross-revision",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-04-24T21:37:35.734831Z",
-     "start_time": "2023-04-24T21:37:34.871305Z"
+     "end_time": "2023-05-01T03:08:05.639539Z",
+     "start_time": "2023-05-01T03:08:04.723697Z"
     }
    },
    "outputs": [
@@ -585,7 +585,7 @@
   "kernelspec": {
    "display_name": "Python [poetry:spaczz]",
    "language": "python",
-   "name": "python.poetry.spaczz"
+   "name": "python-poetry-spaczz"
   },
   "language_info": {
    "codemirror_mode": {
@@ -597,7 +597,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.11.3"
   },
   "toc": {
    "base_numbering": 1,

diff --git a/noxfile.py b/noxfile.py
@@ -106,7 +106,7 @@ def xdoctest(session: Session) -> None:
     args = session.posargs or ["all"]
     session.run("poetry", "install", "--only", "main,xdoctest", external=True)
     session.run("python", "-m", "spacy", "download", "en_core_web_md")
-    session.run("python", "-m", "xdoctest", PACKAGE, *args)
+    session.run("xdoctest", PACKAGE, *args)
 
 
 @nox.session(python=PYTHON)

diff --git a/pyproject.toml b/pyproject.toml
@@ -93,7 +93,7 @@ source = ["src", "*/site-packages"]
 
 [tool.coverage.report]
 show_missing = true
-fail_under = 95
+fail_under = 96
 
 [tool.coverage.run]
 branch = true

diff --git a/src/spaczz/_search/phrasesearcher.py b/src/spaczz/_search/phrasesearcher.py
@@ -248,20 +248,11 @@ def _calc_flex(query: DocLike, flex: FlexType) -> int:
             The new `flex` value.
 
         Raises:
-            ValueError: If `flex` is not `"default"`, `"max"`, `"min"`, or an `int`.
+            TypeError: If `flex` is not `"default"`, `"max"`, `"min"`, or an `int`.
 
         Warnings:
             FlexWarning:
                 If `flex` > `len(query)` or `flex` < `0`.
-
-        Example:
-            >>> import spacy
-            >>> from spaczz._search import PhraseSearcher
-            >>> nlp = spacy.blank("en")
-            >>> searcher = PhraseSearcher(nlp.vocab)
-            >>> query = nlp("Test query")
-            >>> searcher._calc_flex(query, "default")
-            1
         """
         if flex == "default":
             flex = len(query) // 2
@@ -274,7 +265,7 @@ def _calc_flex(query: DocLike, flex: FlexType) -> int:
             if flex > query_len:
                 warnings.warn(
                     f"""`flex` of size `{flex}` is > `len(query)`, `{query_len}`.
-                        Setting flex to `{query_len}` instead.""",
+                        Setting `flex` to `{query_len}` instead.""",
                     FlexWarning,
                     stacklevel=2,
                 )
@@ -288,10 +279,10 @@ def _calc_flex(query: DocLike, flex: FlexType) -> int:
                 )
                 flex = 0
         else:
-            raise ValueError(
+            raise TypeError(
                 (
-                    "`flex` must be the string value of `'default'`,",
-                    "`'max'` or `'min'`, or an `int`.",
+                    "`flex` must be a `FlexLiteral` (`'default'`,",
+                    "`'max'` or `'min'`), or an `int`.",
                 )
             )
         return flex

diff --git a/src/spaczz/_search/searchutil.py b/src/spaczz/_search/searchutil.py
@@ -27,7 +27,7 @@ def filter_overlapping_matches(
         The filtered list of matches.
 
     Example:
-        >>> from spaczz.search.searchutil import filter_overlapping_matches
+        >>> from spaczz._search.searchutil import filter_overlapping_matches
         >>> matches = [(1, 3, 80), (1, 2, 70)]
         >>> filter_overlapping_matches(matches)
         [(1, 3, 80)]
@@ -61,7 +61,7 @@ def parse_regex(
         RegexParseError: If regex compilation produces any errors.
 
     Example:
-        >>> from spaczz.search.util import parse_regex
+        >>> from spaczz._search.searchutil import parse_regex
         >>> pattern = parse_regex("Test")
         >>> isinstance(pattern, re.Pattern)
         True

diff --git a/src/spaczz/_search/tokensearcher.py b/src/spaczz/_search/tokensearcher.py
@@ -54,7 +54,7 @@ def match(
                 {"TEXT": {"FREGEX": "(advair){e<=1}"}}
                 ]
             >>> searcher.match(doc, pattern)
-            [[('TEXT', 'zithramax'), ("", "", 100), ('TEXT', 'advar')]]
+            [[('TEXT', 'zithramax', 89), ('', '', 100), ('TEXT', 'advar', 91)]]
         """
         matches = []
         matches = [
@@ -142,7 +142,7 @@ def regex_compare(
             >>> nlp = spacy.blank("en")
             >>> searcher = TokenSearcher(nlp)
             >>> searcher.regex_compare("sequel", "(sql){i<=3}")
-            True
+            67
         """
         pattern_ = parse_regex(pattern, predef=predef)
         if ignore_case:

diff --git a/src/spaczz/customtypes.py b/src/spaczz/customtypes.py
@@ -17,3 +17,7 @@
 SearchResult = ty.Tuple[int, int, int]
 MatchResult = ty.Tuple[str, int, int, int, str]
 SpaczzType = Literal["fuzzy", "regex", "token", "similarity", "phrase"]
+RulerPattern = ty.Dict[
+    str, ty.Union[str, ty.Dict[str, ty.Any], ty.List[ty.Dict[str, ty.Any]]]
+]
+RulerResult = ty.Tuple[str, int, int, int, str, SpaczzType]  # type: ignore[valid-type]