From 6afd2c2c665e720832d17845084d09733887ebff Mon Sep 17 00:00:00 2001 From: Ebru Yucesar Date: Tue, 25 Jun 2024 17:27:09 -0400 Subject: [PATCH] update the way case texts are aggregated --- web/main/legal_document_sources.py | 45 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/web/main/legal_document_sources.py b/web/main/legal_document_sources.py index 895667826..8b7564899 100644 --- a/web/main/legal_document_sources.py +++ b/web/main/legal_document_sources.py @@ -581,28 +581,28 @@ def pull(legal_doc_source, id): resp.raise_for_status() cluster = resp.json() cluster["html_info"] = {"source": "court listener"} - cluster["sub_opinions"].sort(key=lambda x: int(x.split("/")[-2])) - - if cluster["filepath_json_harvard"]: - harvard_xml_data = "" - for sub_opinion in cluster["sub_opinions"]: - opinion = CourtListener.get_opinion_body(sub_opinion) - if opinion["xml_harvard"]: - opinion_xml = opinion["xml_harvard"].replace( - '', "" - ) - harvard_xml_data += f"{opinion_xml}\n" - case_html = CourtListener.prepare_case_html(cluster, harvard_xml_data) - cluster["html_info"]["source_field"] = "xml_harvard" - else: - opinion = CourtListener.get_opinion_body(cluster["sub_opinions"][0]) - if opinion["html"]: - case_html = opinion["html"] - cluster["html_info"]["source_field"] = "html" - else: - case_html = opinion["plain_text"] - cluster["html_info"]["source_field"] = "plain_text" + cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2])) + sub_opinion_jsons = [] + for opinion in cluster["sub_opinions"]: + sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion)) + + text_source = "" + for content_type in ("xml_harvard", "html", "plain_text"): + case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons) + if case_text: + case_text = case_text.replace('', "") + text_source = content_type + break + + if not case_text: + msg = f"Case text not found for cluster {id}" + raise Exception(msg) + + if text_source == "xml_harvard": + case_text = CourtListener.prepare_case_html(cluster, case_text) + + cluster["html_info"]["source_field"] = text_source additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0] except requests.exceptions.HTTPError as e: @@ -629,9 +629,10 @@ def pull(legal_doc_source, id): publication_date=parser.parse(cluster.get("date_modified")), updated_date=datetime.now(), source_ref=str(id), - content=case_html, + content=case_text, metadata=cluster, ) + return case @staticmethod