Skip to content

Commit

Permalink
fix(regexes): make PRE_FULL_CITATION pincite optional
Browse files Browse the repository at this point in the history
- add test cases for full case citation with antecedent and no pincite
- fix span calculation on add_pre_citation
  • Loading branch information
grossir committed Feb 13, 2025
1 parent 0a32f08 commit d6c4bd6
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 17 deletions.
2 changes: 1 addition & 1 deletion eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def add_pre_citation(citation: FullCaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.antecedent_guess = m["antecedent"]
match_length = m.span()[1] - m.span()[0]
citation.full_span_start = citation.span()[1] - match_length
citation.full_span_start = citation.span()[0] - match_length


def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
Expand Down
13 changes: 8 additions & 5 deletions eyecite/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,13 +267,16 @@ def short_cite_re(regex):

# These full case citations usually appear after the same full case citation
# has appeared at least once.
# For example "Nobelman at 332, 113 S.Ct. 2106"
# Example with pincite: "Nobelman at 332, 113 S.Ct. 2106"
# Example without pincite: "Johnson, 515 U. S. 304"
PRE_FULL_CITATION_REGEX = rf"""
(?P<antecedent>[A-Z][a-z\-.]+)\ ?,? # single word antecedent
{PIN_CITE_REGEX}
# `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we
# single word antecedent
(?P<antecedent>[A-Z][a-z\-.]+)\ ?,?
# optional pincite
{PIN_CITE_REGEX}?
# `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we
# must also capture them to calculate spans
,?\ ?
,?\ ?
"""


Expand Down
35 changes: 24 additions & 11 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
from unittest import TestCase

from eyecite import clean_text, get_citations
from eyecite.find import (
extract_reference_citations,
)
from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations

# by default tests use a cache for speed
Expand Down Expand Up @@ -964,19 +962,34 @@ def test_reference_filtering(self):
"""Can we filter out ReferenceCitation that overlap other citations?"""
texts = [
# https://www.courtlistener.com/api/rest/v4/opinions/9435339/
# There should be no ReferenceCitations
"""decided <em>Bell Atlantic Corp. </em>v. <em>Twombly, </em>550 U. S. 544 (2007), which discussed ...
apellate court’s core competency. <em>Twombly, </em>550 U. S., at 557. Evaluating...
In <em>Twombly</em>, supra, at 553-554, the Court found it necessary...
Another, in <em>Twombly, supra</em>, at 553-554, the Court found it necessary...
# Test no overlap with supra citations
"""<em>Bell Atlantic Corp. </em>v. <em>Twombly, </em>550 U. S. 544 (2007),
which discussed... apellate court’s core competency.
<em>Twombly, </em>550 U. S., at 557. Evaluating...
In <em>Twombly</em>, supra, at 553-554, the Court found...
Another, in <em>Twombly, supra</em>, at 553-554, the Court found
""",
# From the previous source; test no overlap with single-name
# full case citation
"""
<em>Johnson </em>v. <em>Jones, </em>515 U. S. 304, 309 (1995)
something... with,” <em>Swint </em>v. <em>Chambers County Comm’n,
</em>514 U. S. 35, 51 (1995), and “directly implicated by,”
<em>Hartman, supra, </em>at 257, n. 5, the qualified-immunity
defense.</p>\n<p id=\"b773-6\">Respondent counters that our
holding in <em>Johnson, </em>515 U. S. 304, confirms
""",
""" was not con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan
residence.” <em>Nobelman </em>at 332, 113 S.Ct. 2106. Section 1123(b)(5) codifies the
# https://www.courtlistener.com/opinion/8524158/in-re-cahill/
# Test no overlap with single-name-and-pincite full case citation
""" was not con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>
508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan
residence.” <em>Nobelman </em>at 332, 113 S.Ct. 2106.
Section 1123(b)(5) codifies the
""",
]
for markup_text in texts:
plain_text = clean_text(markup_text, ["html", "all_whitespace"])
citations = get_citations(plain_text)
citations = get_citations(plain_text, markup_text=markup_text)
self.assertFalse(
any(
[isinstance(cite, ReferenceCitation) for cite in citations]
Expand Down

0 comments on commit d6c4bd6

Please sign in to comment.