generated from freelawproject/new-project-template
-
-
Notifications
You must be signed in to change notification settings - Fork 33
/
find.py
260 lines (227 loc) · 8.02 KB
/
find.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
from typing import List, Type, cast
from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
joke_cite,
match_on_tokens,
)
from eyecite.models import (
CitationBase,
CitationToken,
FullCaseCitation,
FullCitation,
FullJournalCitation,
FullLawCitation,
IdCitation,
IdToken,
ResourceCitation,
SectionToken,
ShortCaseCitation,
SupraCitation,
SupraToken,
Tokens,
UnknownCitation,
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
def get_citations(
    plain_text: str,
    remove_ambiguous: bool = False,
    tokenizer: Tokenizer = default_tokenizer,
) -> List[CitationBase]:
    """Extract every citation found in a piece of text.

    This is the main entry point of the module. The text is tokenized, and
    each token flagged by the tokenizer is turned into the matching
    `eyecite.models.CitationBase` subclass.

    Args:
        plain_text: The text to parse (e.g., a judicial opinion or other
            legal document). You may wish to pre-process it with
            `eyecite.clean.clean_text` before passing it here.
        remove_ambiguous: Whether to remove citations that might refer to
            more than one reporter and can't be narrowed down by date.
        tokenizer: The Tokenizer instance used to split the text. Defaults
            to `eyecite.tokenizers.default_tokenizer`.

    Returns:
        A list of `eyecite.models.CitationBase` objects, in the order the
        tokenizer reported them. That ordering matters to callers that later
        resolve ShortCaseCitation, SupraCitation and IdCitation references.
    """
    # Special-cased input: hand back the canned value from eyecite.helpers.
    if plain_text == "eyecite":
        return joke_cite

    words, citation_tokens = tokenizer.tokenize(plain_text)

    found: List[CitationBase] = []
    for position, tok in citation_tokens:
        # Exact type comparison (not isinstance), so subclasses of these
        # token types are deliberately not matched.
        kind = type(tok)

        if kind is CitationToken:
            # A reporter, law journal, or law. The token itself says
            # whether it was matched as a short form or a full citation.
            if cast(CitationToken, tok).short:
                found.append(_extract_shortform_citation(words, position))
            else:
                found.append(_extract_full_citation(words, position))
        elif kind is IdToken:
            # "Id." / "Ibid.": refers to the item cited immediately prior,
            # but resolving that reference is left to the user.
            found.append(_extract_id_citation(words, position))
        elif kind is SupraToken:
            # "supra": the antecedent could be any earlier citation, so it
            # is likewise left unresolved here.
            found.append(_extract_supra_citation(words, position))
        elif kind is SectionToken:
            # A section marker that matched nothing else. It is recorded so
            # the list of possible antecedents for id citations stays
            # accurate.
            found.append(UnknownCitation(cast(SectionToken, tok), position))
        # Any other token type is not a citation and is skipped.

    # Optionally drop citations with several reporter candidates where the
    # correct reporter could not be guessed.
    if remove_ambiguous:
        found = disambiguate_reporters(found)

    return found
def _extract_full_citation(
    words: Tokens,
    index: int,
) -> FullCitation:
    """Build a full citation object for the citation token at `index`.

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the CitationToken to convert.

    Returns:
        A FullCaseCitation, FullLawCitation or FullJournalCitation, chosen
        from the sources of the editions the token matched, after
        `add_metadata(words)` has been called on it.

    Raises:
        ValueError: If none of the matched editions come from the
            "reporters", "laws" or "journals" sources.
    """
    token = cast(CitationToken, words[index])

    # The cite may have been matched by regexes from several sources.
    # Exact matches are preferred; variations are only consulted when there
    # are no exact editions.
    matched_editions = token.exact_editions or token.variation_editions
    cite_sources = {edition.reporter.source for edition in matched_editions}

    # Pick the citation class by source, checked in priority order:
    # reporters first, then laws, then journals.
    class_by_source = (
        ("reporters", FullCaseCitation),
        ("laws", FullLawCitation),
        ("journals", FullJournalCitation),
    )
    citation_class: Type[ResourceCitation]
    for source_name, candidate_class in class_by_source:
        if source_name in cite_sources:
            citation_class = candidate_class
            break
    else:
        raise ValueError(f"Unknown cite_sources value {cite_sources}")

    citation = citation_class(
        token,
        index,
        exact_editions=token.exact_editions,
        variation_editions=token.variation_editions,
    )
    citation.add_metadata(words)
    return citation
def _extract_shortform_citation(
    words: Tokens,
    index: int,
) -> ShortCaseCitation:
    """Build a ShortCaseCitation for the citation token at `index`.

    Examples of the short forms handled:
        Shortform 1: Adarand, 515 U.S., at 241
        Shortform 2: 515 U.S., at 241

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the short-form CitationToken.

    Returns:
        A ShortCaseCitation whose metadata carries the guessed antecedent
        (or None), the pin cite and the parenthetical, after
        `guess_edition()` and `guess_court()` have been called on it.
    """
    # Look for the antecedent, starting at the token just before the cite
    # and matching with forward=False.
    antecedent_match = match_on_tokens(
        words,
        index - 1,
        SHORT_CITE_ANTECEDENT_REGEX,
        strings_only=True,
        forward=False,
    )
    antecedent_guess = (
        antecedent_match["antecedent"].strip() if antecedent_match else None
    )

    # The token's captured "page" group is handed to the pin-cite extractor
    # as its prefix.
    cite_token = cast(CitationToken, words[index])
    pin_cite, span_end, parenthetical = extract_pin_cite(
        words, index, prefix=cite_token.groups["page"]
    )

    metadata = {
        "antecedent_guess": antecedent_guess,
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
    }
    short_cite = ShortCaseCitation(
        cite_token,
        index,
        exact_editions=cite_token.exact_editions,
        variation_editions=cite_token.variation_editions,
        span_end=span_end,
        metadata=metadata,
    )

    # Fill in the remaining metadata on the new citation.
    short_cite.guess_edition()
    short_cite.guess_court()
    return short_cite
def _extract_supra_citation(
    words: Tokens,
    index: int,
) -> SupraCitation:
    """Build a SupraCitation for the supra token at `index`.

    The pin cite is taken from around the token, and the antecedent (and
    optional volume) is looked for starting at the token just before it.

    Examples of the supra forms handled:
        Supra 1: Adarand, supra, at 240
        Supra 2: Adarand, 515 supra, at 240
        Supra 3: Adarand, supra, somethingelse
        Supra 4: Adarand, supra. somethingelse

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the SupraToken.

    Returns:
        A SupraCitation whose metadata carries the guessed antecedent, the
        pin cite, the parenthetical and the volume. The antecedent and
        volume are None when no antecedent match is found.
    """
    pin_cite, span_end, parenthetical = extract_pin_cite(words, index)

    antecedent_match = match_on_tokens(
        words,
        index - 1,
        SUPRA_ANTECEDENT_REGEX,
        strings_only=True,
        forward=False,
    )
    if antecedent_match:
        antecedent_guess = antecedent_match["antecedent"]
        volume = antecedent_match["volume"]
    else:
        antecedent_guess = None
        volume = None

    metadata = {
        "antecedent_guess": antecedent_guess,
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
        "volume": volume,
    }
    return SupraCitation(
        cast(SupraToken, words[index]),
        index,
        span_end=span_end,
        metadata=metadata,
    )
def _extract_id_citation(
    words: Tokens,
    index: int,
) -> IdCitation:
    """Build an IdCitation for the id token at `index`.

    Args:
        words: The token list produced by the tokenizer.
        index: Position in `words` of the IdToken.

    Returns:
        An IdCitation whose metadata carries the pin cite and parenthetical
        that `extract_pin_cite` found for this token.
    """
    pin_cite, span_end, parenthetical = extract_pin_cite(words, index)

    id_token = cast(IdToken, words[index])
    metadata = {
        "pin_cite": pin_cite,
        "parenthetical": parenthetical,
    }
    return IdCitation(id_token, index, span_end=span_end, metadata=metadata)