Updates to Clemsciences's spacy process (#1239)
* Added spaCy process

* Improved spaCy to CLTK wrapper

* Use correct Token attributes

* Use spaCy download function instead of shell command

* Update poetry.lock

* Removed unrelated changes

* Fixed SpacyWrapper, StanzaWrapper and download_all_models.py

* Improved SpacyWrapper

* update dependencies

* demo notebook not working

* Renamed spacy_dep.py to spacy_wrapper.py and lint fixes

* add morphology from spacy to cltk Doc

* downgrade spacy to 3.6.1

* Download spaCy model if the model is absent

* update dev dependencies, improve spacy wrapper

* spacy wrapper working

* fix .get_dependencies()

* make e2e work with LatinCy

* re-add Latin Stops Process

* add spacy dl to build script

* trigger CI rerun

* load spacy directly

* dl spacy model with subprocess

* load model after dl

* load w/ spacy (wrapper seems to err)

* don't check only dl

* bump spacy

* why old spacy on ci?

* deepcopy to copy

* basic lat tests pass

* rewrite bad UD types from Mood to VerbForm

* finish more cleanup for LatinCy release

* add citation printing

* rename latincy proc

---------

Co-authored-by: Clément Besnier <[email protected]>
Co-authored-by: Clément Besnier <[email protected]>
3 people authored Dec 27, 2023
1 parent 0bca52a commit cdc278e
Showing 39 changed files with 2,302 additions and 1,967 deletions.
4 changes: 4 additions & 0 deletions .circleci/config.yml
@@ -37,6 +37,10 @@ jobs:
name: Run quick tests
# This assumes pytest is installed via the install-package step above
command: make testNoInternet
# TODO: Remove this once caching issue figured out
- run:
name: Check spacy version
command: poetry run spacy info
- run:
name: Download dependencies
command: poetry run python scripts/download_all_models.py --languages=lat
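For local debugging, a minimal sketch of the same check in Python (assuming the LatinCy package name ``la_core_web_lg``, as used in scripts/download_all_models.py below):

import spacy

# Sketch: confirm the LatinCy package is installed before running the pipeline.
if spacy.util.is_package("la_core_web_lg"):
    print("LatinCy model found:", spacy.util.get_package_path("la_core_web_lg"))
else:
    print("LatinCy model missing; run scripts/download_all_models.py --languages=lat")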
8 changes: 8 additions & 0 deletions docs/cltk.dependency.rst
@@ -17,6 +17,14 @@ cltk.dependency.processes module
:undoc-members:
:show-inheritance:

cltk.dependency.spacy\_wrapper module
-------------------------------------

.. automodule:: cltk.dependency.spacy_wrapper
:members:
:undoc-members:
:show-inheritance:
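A minimal usage sketch for the wrapper documented here, using only calls that appear elsewhere in this commit (``SpacyWrapper(...)`` and ``.parse(...)``; the sentence is the CLTK's Latin example text):

from cltk.dependency.spacy_wrapper import SpacyWrapper

# Sketch: parse a short Latin sentence with the wrapped LatinCy model.
wrapper = SpacyWrapper(language="lat", interactive=False, silent=True)
spacy_doc = wrapper.parse("Gallia est omnis divisa in partes tres.")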

cltk.dependency.stanza\_wrapper module
--------------------------------------

737 changes: 396 additions & 341 deletions notebooks/CLTK Demonstration.ipynb

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions notebooks/CLTK data types.ipynb
@@ -426,8 +426,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"‎𐤀 CLTK version '1.1.7a5'.\n",
"Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.\n"
"‎𐤀 CLTK version '1.2.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/\n",
"\n",
"Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinSpacyProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.\n",
"\n",
"⸖ ``LatinSpacyProcess`` using LatinCy model by Patrick Burns from https://arxiv.org/abs/2305.04365 . Please cite: https://arxiv.org/abs/2305.04365\n",
"⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/\n",
"⸖ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).\n",
"\n",
"⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.\n"
]
}
],
123 changes: 101 additions & 22 deletions notebooks/Demo of Pipeline for all languages.ipynb

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions notebooks/Make custom Process and add to Pipeline.ipynb
@@ -175,8 +175,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"‎𐤀 CLTK version '1.1.7a5'.\n",
"Pipeline for language 'Old English (ca. 450-1100)' (ISO: 'ang'): `MultilingualTokenizationProcess`, `OldEnglishLemmatizationProcess`, `OldEnglishEmbeddingsProcess`, `StopsProcess`.\n"
"‎𐤀 CLTK version '1.2.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/\n",
"\n",
"Pipeline for language 'Old English (ca. 450-1100)' (ISO: 'ang'): `MultilingualTokenizationProcess`, `OldEnglishLemmatizationProcess`, `OldEnglishEmbeddingsProcess`, `StopsProcess`.\n",
"\n",
"\n",
"⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.\n"
]
}
],
2,812 changes: 1,297 additions & 1,515 deletions poetry.lock

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cltk"
version = "1.1.7"
version = "1.2.0"
description = "The Classical Language Toolkit"
license = "MIT"
authors = ["Kyle P. Johnson <[email protected]>", "Patrick J. Burns <[email protected]>", "John Stewart <[email protected]>", "Todd Cook <[email protected]>", "Clément Besnier <[email protected]>", "William J. B. Mattingly <https://github.com/wjbmattingly>"]
@@ -41,7 +41,7 @@ rapidfuzz = "^3.4.0"
stanza = "^1.6.0"
nltk = "^3.7"
stringcase = "^1.2"
spacy = "^3.6.1"
spacy = "3.7.2"
PyYAML = "^6.0.0"
scikit-learn = "^1.0.2"
# Note: Adding torch like this should not be necessary,
@@ -51,23 +51,23 @@ scikit-learn = "^1.0.2"
torch = ">=2.0.0, !=2.0.1, !=2.1.0"

[tool.poetry.dev-dependencies]
pytest = "^7.1.1"
pytest = "^7.4.3"
nose = "^1.3"
ipython = "^8.2"
pylint = "^2.13.5"
pylint = "^3.0.3"
sphinx = "^4.5.0"
coverage = "^6.3.2"
coverage = "^7.3.4"
pylint-json2html = "^0.4.0"
tox = "^3.24.5"
tox-pyenv = "^1.1"
pytest-cov = "^3.0"
rtd-poetry = "^0.1.0"
sphinx-autodoc-typehints = "^1.17"
pre-commit = "2.18.1"
mypy = "^0.942"
pre-commit = "3.6.0"
mypy = "^1.8.0"
lxml = "^4.9"
black = "^22.3.0"
isort = "^5.10.1"
black = "^23.12.1"
isort = "^5.13.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
44 changes: 38 additions & 6 deletions scripts/download_all_models.py
@@ -7,14 +7,17 @@
"""

import argparse

# import subprocess
import time
from typing import Dict, List

from git import GitCommandError
import spacy

from cltk.core.exceptions import CLTKException, CorpusImportError
from cltk.data.fetch import LANGUAGE_CORPORA as AVAILABLE_CLTK_LANGS
from cltk.data.fetch import FetchCorpus
from cltk.dependency.spacy_wrapper import MAP_LANGS_CLTK_SPACY as AVAIL_SPACY_LANGS
from cltk.dependency.spacy_wrapper import SpacyWrapper
from cltk.dependency.stanza_wrapper import (
MAP_LANGS_CLTK_STANZA as AVAIL_STANZA_LANGS,
) # pylint: disable=syntax-error
@@ -24,15 +27,18 @@
from cltk.embeddings.embeddings import FastTextEmbeddings, Word2VecEmbeddings
from cltk.nlp import iso_to_pipeline

# from git import GitCommandError


T0 = time.time()

PARSER = argparse.ArgumentParser()
PARSER: argparse.ArgumentParser = argparse.ArgumentParser()
PARSER.add_argument(
"--languages", help="What languages to download. Comma separated, no spaces."
)
ARGS = PARSER.parse_args()
SELECTED_LANGS = list() # type: List[str]
ALL_AVAILABLE_LANGS = list(iso_to_pipeline.keys()) # type: List[str]
ARGS: argparse.Namespace = PARSER.parse_args()
SELECTED_LANGS: list[str] = list()
ALL_AVAILABLE_LANGS: list[str] = list(iso_to_pipeline.keys())
if not ARGS.languages:
SELECTED_LANGS = ALL_AVAILABLE_LANGS
else:
@@ -113,6 +119,29 @@ def download_nlpl_model(iso_code: str) -> None:
print(f"Finished downloading NLPL model for '{iso_code}'.")


def download_spacy_models(iso_code: str) -> None:
"""Download language models, from the ``spaCy`` project,
that are supported by the CLTK or in scope.
"""
print(f"Going to download spaCy model for '{iso_code}'.")
if iso_code not in AVAIL_SPACY_LANGS:
raise CLTKException(f"Language '{iso_code}' not available for spaCy.")
if not spacy.util.is_package("la_core_web_lg"):
print("Spacy Latin model not found. Going to download it ...")
spacy_wrapper: SpacyWrapper = SpacyWrapper(
language="lat", interactive=False, silent=False
)
# subprocess.check_call(
# [
# "pip",
# "install",
# "https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
# ]
# )
print("Spacy downloaded?", spacy_wrapper._is_model_present())
print(f"Finished downloading spaCy for '{iso_code}'.")


if __name__ == "__main__":
print(f"Module loaded. Total elapsed time: {time.time() - T0}")
print("*** Downloading a basic set of models ... this will take a while.*** \n")
@@ -130,6 +159,9 @@ def download_nlpl_model(iso_code: str) -> None:
# 4. Check nlpl
if LANG in AVAIL_NLPL_LANGS:
download_nlpl_model(iso_code=LANG)
# 5. Check spaCy
if LANG in AVAIL_SPACY_LANGS:
download_spacy_models(iso_code=LANG)
print(
f"All models fetched for '{LANG}'. Total elapsed time: {time.time() - T0}"
)
4 changes: 2 additions & 2 deletions src/cltk/alphabet/grc/beta_to_unicode.py
@@ -152,8 +152,8 @@ def replace_beta_code(self, text: str) -> str:
if text.isupper():
text = regex.sub(r"(?<!\*)([A-Z]+)", lambda pat: pat.group(1).lower(), text)
text = text.replace("-", "")
for (pattern, repl) in self.reorder_pattern:
for pattern, repl in self.reorder_pattern:
text = pattern.subn(repl, text)[0]
for (pattern, repl) in self.pattern:
for pattern, repl in self.pattern:
text = pattern.subn(repl, text)[0]
return normalize("NFC", text)
4 changes: 2 additions & 2 deletions src/cltk/alphabet/lat.py
@@ -71,7 +71,7 @@ def __init__(self):

def replace(self, text):
"""Do j/v replacement"""
for (pattern, repl) in self.patterns:
for pattern, repl in self.patterns:
text = re.subn(pattern, repl, text)[0]
return text

@@ -106,7 +106,7 @@ def __init__(self):

def replace(self, text):
"""Do character replacement."""
for (pattern, repl) in self.patterns:
for pattern, repl in self.patterns:
text = re.subn(pattern, repl, text)[0]
return text
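A usage sketch for the j/v replacement above (the enclosing class name sits outside this hunk; ``JVReplacer`` is assumed here):

from cltk.alphabet.lat import JVReplacer  # assumed class name

jv = JVReplacer()
print(jv.replace("justum vult Jupiter"))  # expected: "iustum uult Iupiter"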

6 changes: 3 additions & 3 deletions src/cltk/core/data_types.py
@@ -72,9 +72,9 @@ class Word:
lemma: Optional[str] = None
stem: Optional[str] = None
scansion: Optional[str] = None
xpos: Optional[str] = None # treebank-specific POS tag (from stanza)
upos: Optional[str] = None # universal POS tag (from stanza)
dependency_relation: Optional[str] = None # (from stanza)
xpos: Optional[str] = None  # treebank-specific POS tag (from Stanza or spaCy)
upos: Optional[str] = None  # universal POS tag (from Stanza or spaCy)
dependency_relation: Optional[str] = None  # (from Stanza or spaCy)
governor: Optional[int] = None
features: MorphosyntacticFeatureBundle = MorphosyntacticFeatureBundle()
category: MorphosyntacticFeatureBundle = MorphosyntacticFeatureBundle()
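Whichever backend runs, downstream code reads the same ``Word`` attributes; a minimal sketch using values from the ``SpacyProcess`` doctest later in this commit:

from cltk.core.data_types import Word

# Sketch: a Word as either backend would fill it (values from the doctest below).
word = Word(string="Gallia", lemma="Gallia", upos="PROPN",
            xpos="proper_noun", dependency_relation="nsubj")
print(word.upos, word.dependency_relation)  # PROPN nsubj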
121 changes: 120 additions & 1 deletion src/cltk/dependency/processes.py
@@ -2,12 +2,14 @@

from copy import deepcopy
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Literal, Optional, Tuple

import spacy
import stanza
from boltons.cacheutils import cachedproperty

from cltk.core.data_types import Doc, MorphosyntacticFeature, Process, Word
from cltk.dependency.spacy_wrapper import SpacyWrapper
from cltk.dependency.stanza_wrapper import StanzaWrapper
from cltk.dependency.tree import DependencyTree
from cltk.morphology.morphosyntax import (
@@ -145,6 +147,7 @@ class GreekStanzaProcess(StanzaProcess):

language: str = "grc"
description: str = "Default process for Stanza for the Ancient Greek language."
authorship_info: str = "``GreekStanzaProcess`` using Stanza model by Stanford University from https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082"


@dataclass
@@ -217,3 +220,119 @@ class TreeBuilderProcess(Process):
def algorithm(self, doc):
doc.trees = [DependencyTree.to_tree(sentence) for sentence in doc.sentences]
return doc


@dataclass
class SpacyProcess(Process):
"""A ``Process`` type to capture everything, that the ``spaCy`` project can do for a given language.
.. note::
``spacy`` has only partial functionality available for some languages.
>>> from cltk.languages.example_texts import get_example_text
>>> process_spacy = SpacyProcess(language="lat")
>>> isinstance(process_spacy, SpacyProcess)
True
# >>> from spacy.models.common.doc import Document
# >>> output_doc = process_spacy.run(Doc(raw=get_example_text("lat")))
# >>> isinstance(output_doc.spacy_doc, Document)
True
"""

# language: Optional[str] = None

@cachedproperty
def algorithm(self):
return SpacyWrapper.get_nlp(language=self.language)

def run(self, input_doc: Doc) -> Doc:
output_doc = deepcopy(input_doc)
spacy_wrapper = self.algorithm
if output_doc.normalized_text:
input_text = output_doc.normalized_text
else:
input_text = output_doc.raw
spacy_doc = spacy_wrapper.parse(input_text)
cltk_words = self.spacy_to_cltk_word_type(spacy_doc)
output_doc.words = cltk_words
output_doc.spacy_doc = spacy_doc

return output_doc

@staticmethod
def spacy_to_cltk_word_type(spacy_doc: spacy.tokens.doc.Doc):
"""Take an entire ``spacy`` document, extract
each word, and encode it in the way expected by
the CLTK's ``Word`` type.
It works only if there is some sentence boundaries has been set by the loaded model.
See note in code about starting word token index at 1
>>> from cltk.dependency.processes import SpacyProcess
>>> from cltk.languages.example_texts import get_example_text
>>> process_spacy = SpacyProcess(language="lat")
>>> cltk_words = process_spacy.run(Doc(raw=get_example_text("lat"))).words
>>> isinstance(cltk_words, list)
True
>>> isinstance(cltk_words[0], Word)
True
>>> cltk_words[0]
Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=0, string='Gallia', pos=None, lemma='Gallia', stem=None, scansion=None, xpos='proper_noun', upos='PROPN', dependency_relation='nsubj', governor=None, features={}, category={}, stop=False, named_entity=None, syllables=None, phonetic_transcription=None, definition=None)
"""
words_list: List[Word] = []
for sentence_index, sentence in enumerate(spacy_doc.doc.sents):
sent_words: Dict[int, Word] = {}
for spacy_word in sentence:
pos: Optional[MorphosyntacticFeature] = None
if spacy_word.pos_:
pos = from_ud("POS", spacy_word.pos_)
cltk_word = Word(
# Note: In order to match how Stanza orders token output
# (index starting at 1, not 0), we must add an extra 1 to each
index_token=spacy_word.i + 1,
index_char_start=spacy_word.idx,
index_char_stop=spacy_word.idx + len(spacy_word),
index_sentence=sentence_index,
string=spacy_word.text, # same as ``token.text``
pos=pos,
xpos=spacy_word.tag_,
upos=spacy_word.pos_,
lemma=spacy_word.lemma_,
dependency_relation=spacy_word.dep_, # str
stop=spacy_word.is_stop,
# Note: Must increment this, too
governor=spacy_word.head.i + 1, # TODO: Confirm this is the index
)
raw_features: list[tuple[str, str]] = (
    list(spacy_word.morph.to_dict().items()) if spacy_word.morph else []
)
cltk_features = [
from_ud(feature_name, feature_value)
for feature_name, feature_value in raw_features
]
cltk_word.features = MorphosyntacticFeatureBundle(*cltk_features)
cltk_word.category = to_categorial(cltk_word.pos)
cltk_word.spacy_features = spacy_word.morph
sent_words[cltk_word.index_token] = cltk_word
words_list.append(cltk_word)
return words_list


@dataclass
class LatinSpacyProcess(SpacyProcess):
"""Run a Spacy model.
<https://huggingface.co/latincy>_
"""

language: Literal["lat"] = "lat"
description: str = "Process for Spacy for Patrick Burn's Latin model."
authorship_info: str = "``LatinSpacyProcess`` using LatinCy model by Patrick Burns from https://arxiv.org/abs/2305.04365 . Please cite: https://arxiv.org/abs/2305.04365"
