From d6c4bd6e082a34951d8f4bbe421395233f28373d Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Thu, 13 Feb 2025 10:58:55 -0500
Subject: [PATCH] fix(regexes): make PRE_FULL_CITATION pincite optional
- add test cases for full case citation with antecedent and no pincite
- fix span calculation on add_pre_citation
---
eyecite/helpers.py | 2 +-
eyecite/regexes.py | 13 ++++++++-----
tests/test_FindTest.py | 35 ++++++++++++++++++++++++-----------
3 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/eyecite/helpers.py b/eyecite/helpers.py
index b1b91b78..f0445e48 100644
--- a/eyecite/helpers.py
+++ b/eyecite/helpers.py
@@ -185,7 +185,7 @@ def add_pre_citation(citation: FullCaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.antecedent_guess = m["antecedent"]
match_length = m.span()[1] - m.span()[0]
- citation.full_span_start = citation.span()[1] - match_length
+ citation.full_span_start = citation.span()[0] - match_length
def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
diff --git a/eyecite/regexes.py b/eyecite/regexes.py
index b394b422..c8dbe352 100644
--- a/eyecite/regexes.py
+++ b/eyecite/regexes.py
@@ -267,13 +267,16 @@ def short_cite_re(regex):
# These full case citations usually appear after the same full case citation
# has appeared at least once.
-# For example "Nobelman at 332, 113 S.Ct. 2106"
+# Example with pincite: "Nobelman at 332, 113 S.Ct. 2106"
+# Example without pincite: "Johnson, 515 U. S. 304"
PRE_FULL_CITATION_REGEX = rf"""
- (?P<antecedent>[A-Z][a-z\-.]+)\ ?,? # single word antecedent
- {PIN_CITE_REGEX}
- # `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we
+ # single word antecedent
+ (?P<antecedent>[A-Z][a-z\-.]+)\ ?,?
+ # optional pincite
+ {PIN_CITE_REGEX}?
+ # `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we
# must also capture them to calculate spans
- ,?\ ?
+ ,?\ ?
"""
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 68745e66..9e85d915 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -4,9 +4,7 @@
from unittest import TestCase
from eyecite import clean_text, get_citations
-from eyecite.find import (
- extract_reference_citations,
-)
+from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations
# by default tests use a cache for speed
@@ -964,19 +962,34 @@ def test_reference_filtering(self):
"""Can we filter out ReferenceCitation that overlap other citations?"""
texts = [
# https://www.courtlistener.com/api/rest/v4/opinions/9435339/
- # There should be no ReferenceCitations
- """decided Bell Atlantic Corp. v. Twombly, 550 U. S. 544 (2007), which discussed ...
- apellate court’s core competency. Twombly, 550 U. S., at 557. Evaluating...
- In Twombly, supra, at 553-554, the Court found it necessary...
- Another, in Twombly, supra, at 553-554, the Court found it necessary...
+ # Test no overlap with supra citations
+ """Bell Atlantic Corp. v. Twombly, 550 U. S. 544 (2007),
+ which discussed... apellate court’s core competency.
+ Twombly, 550 U. S., at 557. Evaluating...
+ In Twombly, supra, at 553-554, the Court found...
+ Another, in Twombly, supra, at 553-554, the Court found
+ """,
+ # From the previous source; test no overlap with single-name
+ # full case citation
+ """
+ Johnson v. Jones, 515 U. S. 304, 309 (1995)
+ something... with,” Swint v. Chambers County Comm’n,
+ 514 U. S. 35, 51 (1995), and “directly implicated by,”
+ Hartman, supra, at 257, n. 5, the qualified-immunity
+ defense.
\nRespondent counters that our
+ holding in Johnson, 515 U. S. 304, confirms
""",
- """ was not con-firmable. Nobelman v. Am. Sav. Bank, 508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan
- residence.” Nobelman at 332, 113 S.Ct. 2106. Section 1123(b)(5) codifies the
+ # https://www.courtlistener.com/opinion/8524158/in-re-cahill/
+ # Test no overlap with single-name-and-pincite full case citation
+ """ was not con-firmable. Nobelman v. Am. Sav. Bank,
+ 508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan
+ residence.” Nobelman at 332, 113 S.Ct. 2106.
+ Section 1123(b)(5) codifies the
""",
]
for markup_text in texts:
plain_text = clean_text(markup_text, ["html", "all_whitespace"])
- citations = get_citations(plain_text)
+ citations = get_citations(plain_text, markup_text=markup_text)
self.assertFalse(
any(
[isinstance(cite, ReferenceCitation) for cite in citations]