From d6c4bd6e082a34951d8f4bbe421395233f28373d Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Thu, 13 Feb 2025 10:58:55 -0500 Subject: [PATCH] fix(regexes): make PRE_FULL_CITATION pincite optional - add test cases for full case citation with antecedent and no pincite - fix span calculation on add_pre_citation --- eyecite/helpers.py | 2 +- eyecite/regexes.py | 13 ++++++++----- tests/test_FindTest.py | 35 ++++++++++++++++++++++++----------- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/eyecite/helpers.py b/eyecite/helpers.py index b1b91b78..f0445e48 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -185,7 +185,7 @@ def add_pre_citation(citation: FullCaseCitation, words: Tokens) -> None: citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None citation.metadata.antecedent_guess = m["antecedent"] match_length = m.span()[1] - m.span()[0] - citation.full_span_start = citation.span()[1] - match_length + citation.full_span_start = citation.span()[0] - match_length def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None: diff --git a/eyecite/regexes.py b/eyecite/regexes.py index b394b422..c8dbe352 100644 --- a/eyecite/regexes.py +++ b/eyecite/regexes.py @@ -267,13 +267,16 @@ def short_cite_re(regex): # These full case citations usually appear after the same full case citation # has appeared at least once. -# For example "Nobelman at 332, 113 S.Ct. 2106" +# Example with pincite: "Nobelman at 332, 113 S.Ct. 2106" +# Example without pincite: "Johnson, 515 U. S. 304" PRE_FULL_CITATION_REGEX = rf""" - (?P<antecedent>[A-Z][a-z\-.]+)\ ?,? # single word antecedent - {PIN_CITE_REGEX} - # `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we + # single word antecedent + (?P<antecedent>[A-Z][a-z\-.]+)\ ?,? + # optional pincite + {PIN_CITE_REGEX}? + # `PIN_CITE_REGEX` uses a positive lookahead for end characters, but we # must also capture them to calculate spans - ,?\ ? + ,?\ ? 
""" diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 68745e66..9e85d915 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -4,9 +4,7 @@ from unittest import TestCase from eyecite import clean_text, get_citations -from eyecite.find import ( - extract_reference_citations, -) +from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations # by default tests use a cache for speed @@ -964,19 +962,34 @@ def test_reference_filtering(self): """Can we filter out ReferenceCitation that overlap other citations?""" texts = [ # https://www.courtlistener.com/api/rest/v4/opinions/9435339/ - # There should be no ReferenceCitations - """decided Bell Atlantic Corp. v. Twombly, 550 U. S. 544 (2007), which discussed ... - apellate court’s core competency. Twombly, 550 U. S., at 557. Evaluating... - In Twombly, supra, at 553-554, the Court found it necessary... - Another, in Twombly, supra, at 553-554, the Court found it necessary... + # Test no overlap with supra citations + """Bell Atlantic Corp. v. Twombly, 550 U. S. 544 (2007), + which discussed... apellate court’s core competency. + Twombly, 550 U. S., at 557. Evaluating... + In Twombly, supra, at 553-554, the Court found... + Another, in Twombly, supra, at 553-554, the Court found + """, + # From the previous source; test no overlap with single-name + # full case citation + """ + Johnson v. Jones, 515 U. S. 304, 309 (1995) + something... with,” Swint v. Chambers County Comm’n, + 514 U. S. 35, 51 (1995), and “directly implicated by,” + Hartman, supra, at 257, n. 5, the qualified-immunity + defense.

\n

Respondent counters that our + holding in Johnson, 515 U. S. 304, confirms """, - """ was not con-firmable. Nobelman v. Am. Sav. Bank, 508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan - residence.” Nobelman at 332, 113 S.Ct. 2106. Section 1123(b)(5) codifies the + # https://www.courtlistener.com/opinion/8524158/in-re-cahill/ + # Test no overlap with single-name-and-pincite full case citation + """ was not con-firmable. Nobelman v. Am. Sav. Bank, + 508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That plan + residence.” Nobelman at 332, 113 S.Ct. 2106. + Section 1123(b)(5) codifies the """, ] for markup_text in texts: plain_text = clean_text(markup_text, ["html", "all_whitespace"]) - citations = get_citations(plain_text) + citations = get_citations(plain_text, markup_text=markup_text) self.assertFalse( any( [isinstance(cite, ReferenceCitation) for cite in citations]