diff --git a/cl/citations/description_score.py b/cl/citations/description_score.py index f369cd619e..6db4d8af95 100644 --- a/cl/citations/description_score.py +++ b/cl/citations/description_score.py @@ -4,11 +4,11 @@ from cl.search.models import OpinionCluster _GERUND = re.compile(r"(?:\S+ing)", re.IGNORECASE) -_GERUND_THAT = re.compile(rf"{_GERUND} that", re.IGNORECASE) +_GERUND_THAT = re.compile(rf"{_GERUND.pattern} that", re.IGNORECASE) _HOLDING = re.compile( r"(?:holding|deciding|ruling|recognizing|concluding)", re.IGNORECASE ) -_HOLDING_THAT = re.compile(rf"{_HOLDING} that", re.IGNORECASE) +_HOLDING_THAT = re.compile(rf"{_HOLDING.pattern} that", re.IGNORECASE) # Observation of thousands of parentheticals seems to indicate that the # most useful ones are in the neighborhood of 20 words long. diff --git a/cl/citations/filter_parentheticals.py b/cl/citations/filter_parentheticals.py index 3e33f98101..d26765e276 100644 --- a/cl/citations/filter_parentheticals.py +++ b/cl/citations/filter_parentheticals.py @@ -1,6 +1,6 @@ import re -_MODIFIABLE = r"(omissions?|quotations?|quotes?|headings?|(quotations? )?marks?|ellips.s|cites?|citations?|emphas.s|italics?|footnotes?|alterations?|punctuation|modifications?|brackets?|bracketed material|formatting)" +_MODIFIABLE = r"(omissions?|quotations?|quotes?|headings?|(quotations? )?marks|ellips.s|cites?|citations?|emphas.s|italics?|footnotes?|alterations?|punctuation|modifications?|brackets?|bracketed material|formatting)" _MODIFABLE_TYPE = r"(internal|former|latter|first|second|third|fourth|fifth|last|some|further|certain|numbered|other|transcript)" _FULL_MODIFIABLE = f"(({_MODIFABLE_TYPE} )?{_MODIFIABLE})" _QUOTE_MODIFICATION = r"(added|provided|removed|adopted|(in )?(the )original|omitted|included|deleted|eliminated|altered|modified|supplied|ours|mine|changed|(in|by) \S+|by \S+ court)" @@ -20,16 +20,19 @@ r".n banc", # en banc or in banc # Scalia, J., dissenting; Roberts, C.J., concurring in the judgment, concurring in part, and dissenting in part f"{_JUDGE_NAME}( {_FULL_OPINION_DESCRIPTOR})?([ ,]+(and )?{_FULL_OPINION_DESCRIPTOR})*", + f"{_JUDGE_NAME}.{{1,75}}", # concurring in result f"({_DOCUMENT_TYPES} )?{_FULL_OPINION_DESCRIPTOR}", # opinion of Breyer, J.; opinion of Scalia and Alito, J.J. f"{_DOCUMENT_TYPES} of {_JUDGE_NAME}", # plurality opinion, supplemental order f"{_OPINION_TYPES}( {_DOCUMENT_TYPES})?( {_OPINION_TYPE_MODIFICATION})?", + rf"({_DOCUMENT_TYPES} )?opinion.*", r"dictum|dicta", r"on rehearing|denying cert(iorari)?", r"simplified|cleaned up|as amended", r"same|similar|contra", + r"standard of review", r"(and )?cases cited therein", # No. 12-345 r"No. \d+.?\d+", @@ -45,6 +48,8 @@ f"{_FULL_MODIFIABLE} and {_FULL_MODIFIABLE} {_QUOTE_MODIFICATION}", f"{_FULL_MODIFIABLE} {_QUOTE_MODIFICATION}[;,] ?{_FULL_MODIFIABLE} {_QUOTE_MODIFICATION}", f"({_MODIFABLE_TYPE} )?{_MODIFIABLE}, {_MODIFIABLE}, and {_MODIFIABLE} {_QUOTE_MODIFICATION}", + # Match any short parenthetical that looks like a modification (e.g. "citations and internal marks omitted, emphasis added") + rf"(?=.*{_MODIFIABLE}.*).{{1,75}}", # citing Gonzales v. Raich, 123 U.S. 456 (2019). A tad over-inclusive but very helpful f"{_REFERENTIAL} .*", # 2nd Cir. 2019, Third Circuit 1993 @@ -55,6 +60,8 @@ r".{1,10} (Circuit|Cir.)", # hereinafter, "Griffin II" r"here(in)?after(,)? .+", + # Imbalanced parentheses (for when eyecite cuts off the parenthetical too soon) e.g. "holding Section 4(a" + r"^.{1,35}\([^\)]{1,35}$", # Single-word parentheticals, e.g., 'TILA' r"\S*", ] diff --git a/cl/citations/fixtures/opinions_matching_citations.json b/cl/citations/fixtures/opinions_matching_citations.json index 12a1e724fb..1c9a2be370 100644 --- a/cl/citations/fixtures/opinions_matching_citations.json +++ b/cl/citations/fixtures/opinions_matching_citations.json @@ -262,6 +262,17 @@ "model": "search.citation", "pk": 6 }, + { + "fields": { + "volume": 2, + "reporter": "S.Ct.", + "page": "2", + "type": 1, + "cluster": 4 + }, + "model": "search.citation", + "pk": 20 + }, { "fields": { "volume": 2, @@ -401,7 +412,7 @@ "date_modified": "2015-08-15T14:10:56.801Z", "extracted_by_ocr": false, "author": 2, - "plain_text": "my plain text secret word for queries", + "plain_text": "my plain text secret word for queries. Foo v. Bar, 1 U.S. 1, 4, 2 S.Ct. 2, 5 (2000) (holding something happened)", "html": "", "download_url": null, "cluster": 8, diff --git a/cl/citations/tasks.py b/cl/citations/tasks.py index 3c71237612..c6295c980c 100644 --- a/cl/citations/tasks.py +++ b/cl/citations/tasks.py @@ -163,12 +163,19 @@ def find_citations_and_parentheticals_for_opinion_by_pks( parentheticals = [] for _opinion, _citations in citation_resolutions.items(): + # Currently, eyecite has a bug where parallel citations are + # detected individually. We avoid creating duplicate parentheticals + # because of that by keeping track of what we've seen so far. + parenthetical_texts = set() for _cit in _citations: # If the citation has a descriptive parenthetical, clean # it up and store it as a Parenthetical if ( - par_text := _cit.metadata.parenthetical - ) and is_parenthetical_descriptive(par_text): + (par_text := _cit.metadata.parenthetical) + and par_text not in parenthetical_texts + and is_parenthetical_descriptive(par_text) + ): + parenthetical_texts.add(par_text) clean = clean_parenthetical_text(par_text) parentheticals.append( Parenthetical( diff --git a/cl/citations/tests.py b/cl/citations/tests.py index fea8bb0ab7..783ee4cfd3 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -627,6 +627,18 @@ def test_opinionscited_creation(self) -> None: num_parentheticals, ) + def test_no_duplicate_parentheticals_from_parallel_cites(self) -> None: + remove_citations_from_imported_fixtures() + citing = Opinion.objects.get(pk=11) + cited = Opinion.objects.get(pk=7) + find_citations_and_parentheticals_for_opinion_by_pks.delay([11]) + self.assertEqual( + Parenthetical.objects.filter( + describing_opinion=citing, described_opinion=cited + ).count(), + 1, + ) + class CitationFeedTest(IndexedSolrTestCase): def _tree_has_content(self, content, expected_count): @@ -807,6 +819,7 @@ def test_is_not_descriptive(self): "internal citations and quotations omitted", "citations and internal ellipses omitted", "quotation marks omitted; ellipses ours", + "headings and internal quotations omitted, emphasis and citations altered", "plurality opinion", "opinion of Breyer, J.", "opinion of Mister Justice Black", @@ -826,9 +839,12 @@ def test_is_not_descriptive(self): "Sotomayor, J., statement respecting denial of certiorari", "Roberts, C.J., concurring in part and dissenting in part", "Friendly, J., concurring in the judgment, concurring in part, and dissenting in part", + "Scalia, J., specially concurring in the judgment on this issue", "en banc", "per curiam", "same", + "standard of review", + "opinion of O'Connor, J., respecting the granting of an injunction", "no", "n. 3", "No. 12-345", @@ -853,6 +869,7 @@ def test_is_descriptive(self): "accountant who gave lay opinion testimony might have qualified as expert", "where plaintif's complaint alleges facts which, if proven, would entitle plaintiff to relief under the Eighth Amendment, dismissal of complaint was inappropriate", "ruling that there is nothing either legal or illegal, only thinking makes it so", + "testing that the mere presence of the word quotation doesn't get a parenthetical filtered out if it's long enough", "First Amendment", "mislabeled product", '"Look on my Works, ye Mighty, and despair"', diff --git a/cl/opinion_page/templates/view_opinion_summaries.html b/cl/opinion_page/templates/view_opinion_summaries.html index 6222c9d480..1d0385cee9 100644 --- a/cl/opinion_page/templates/view_opinion_summaries.html +++ b/cl/opinion_page/templates/view_opinion_summaries.html @@ -36,7 +36,9 @@
We looked through our complete collection of opinions and identified the following parenthetical summaries that describe this case:
+ {% if summaries_count > 0 %} +We looked through our complete collection of opinions and identified the following parenthetical summaries that describe this case:
+ {% endif %}