Skip to content

Commit

Permalink
Merge pull request #45 from andredelft/feature/better-perseus-parsing
Browse files Browse the repository at this point in the history
Better parsing of Perseus XML
  • Loading branch information
andredelft authored Jan 25, 2024
2 parents f514022 + b74cd67 commit be9f5ab
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 28 deletions.
26 changes: 5 additions & 21 deletions heidegger_index/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import requests
from bs4 import BeautifulSoup
from pyCTS import CTS_URN

from django.db import models
from django.conf import settings
Expand All @@ -9,6 +7,7 @@
from django.utils.safestring import mark_safe

from heidegger_index.constants import LemmaType, RefType, MetadataType
from heidegger_index.passage import get_perseus_passage
from heidegger_index.utils import (
gen_sort_key,
slugify,
Expand Down Expand Up @@ -140,25 +139,10 @@ def save(self, *args, **kwargs):
super().save(*args, **kwargs)

def load_work_text(self):
# NOTE(review): this is a rendered diff hunk without +/- markers and without
# indentation; it appears to show the removed body of the method followed
# directly by the added body. Confirm against the actual commit.
#
# Presumably the removed implementation: it only acts when no Perseus text
# is stored yet, a URN is set and the type is "w"; it then requests the
# passage from the Scaife CTS API and keeps the string of the last child of
# the first <p> element in the response.
if not self.perseus_content and self.urn and self.type == "w":
lemma_urn = CTS_URN(self.urn)
if not lemma_urn.passage_component:
self.perseus_content = None
else:
p_link = (
"https://scaife-cts.perseus.org/api/cts?request=GetPassage&urn="
+ self.urn
)
p_response = requests.get(p_link)

try:
# TODO: URN-based parsing, or strip bibl & label contents
parsed_xml = BeautifulSoup(p_response.text, "html.parser")
perseus_content = parsed_xml.p.contents[-1].string
except AttributeError:
pass
else:
self.perseus_content = perseus_content
# Presumably the added implementation: the same three preconditions written
# as a guard clause, with fetching and parsing delegated to
# get_perseus_passage (imported from heidegger_index.passage).
if self.perseus_content or not self.urn or self.type != "w":
return

self.perseus_content = get_perseus_passage(self.urn)

@property
def display(self):
Expand Down
32 changes: 32 additions & 0 deletions heidegger_index/passage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
from pyCTS import CTS_URN


STRIP_TAGS = ["bibl", "label", "note"]


def strip_tei_xml(xml: str):
    """Return the plain text of a TEI document without editorial elements.

    The markup is parsed as XML and every element whose tag is listed in
    STRIP_TAGS is removed, together with its contents, before the text is
    extracted.

    Returns the whitespace-stripped text of the <TEI> element, or None when
    the document contains no <TEI> element.
    """
    tei_root = BeautifulSoup(xml, "xml").TEI

    if tei_root:
        # Drop the unwanted elements in place so that their text does not
        # end up in the extracted passage.
        for unwanted in tei_root.find_all(STRIP_TAGS):
            unwanted.decompose()

        return tei_root.text.strip()

    return None


def get_perseus_passage(urn: str):
    """Fetch the plain text of a CTS passage from the Perseus Scaife API.

    Args:
        urn: A CTS URN. It must include a passage component; URNs that only
            identify a work are not looked up.

    Returns:
        The passage text as produced by strip_tei_xml, or None when the URN
        has no passage component, the request fails (network error, timeout
        or HTTP error status), or the response holds no TEI content.
    """
    if not CTS_URN(urn).passage_component:
        return None

    api_url = f"https://scaife-cts.perseus.org/api/cts?request=GetPassage&urn={urn}"

    try:
        # Without a timeout a stalled connection would block the caller
        # indefinitely.
        r = requests.get(api_url, timeout=10)
        # requests.get() does not raise for 4xx/5xx responses by itself;
        # raise_for_status() turns them into requests.HTTPError.
        r.raise_for_status()
    except requests.RequestException:
        # RequestException is the base class of HTTPError, ConnectionError
        # and Timeout, so every request failure is treated as "no passage".
        return None

    return strip_tei_xml(r.text)
6 changes: 6 additions & 0 deletions heidegger_index/templates/components/_perseus_content.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{% load i18n %}

{# Quotes the first 100 words of lemma.perseus_content and links to the passage on Perseus (Scaife). #}
<blockquote>
{{ lemma.perseus_content|truncatewords:100 }}
({% translate "Bron" %}: <a href="https://scaife.perseus.org/reader/{{ lemma.urn | safe }}" target="_blank" rel="noopener noreferrer"><cite>Perseus Digital Library</cite></a>)
</blockquote>
12 changes: 6 additions & 6 deletions heidegger_index/templates/lemma_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,8 @@ <h1 class="lemma-detail__title {{ lemma.icon }}">{{ lemma.display }}</h1>
</div>
{% endif %}

{% if lemma.type == "w" and lemma.perseus_content %}
<blockquote>
{{ lemma.perseus_content }}
({% translate "Bron" %}:
<a href="https://scaife.perseus.org/reader/{{ lemma.urn | safe }}" title="Read full text of {{ lemma.value }} on Perseus" target="_blank"><cite>Perseus Digital Library</cite></a>)
</blockquote>
{% if lemma.perseus_content %}
{% include "components/_perseus_content.html" %}
{% endif %}

{% if lemma.description %}
Expand All @@ -35,6 +31,10 @@ <h1 class="lemma-detail__title {{ lemma.icon }}">{{ lemma.display }}</h1>

{% for child in children %}
<h2 id="{{ child.slug }}">{{ child }}</h2>
{% if child.perseus_content %}
{% include "components/_perseus_content.html" with lemma=child %}
{% endif %}

{% if child.description %}
{{ child.description | safe }}
{% endif %}
Expand Down
3 changes: 2 additions & 1 deletion requirements/production.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ beautifulsoup4==4.11.1
aenum==3.*
django-compressor==4.*
django-libsass==0.*
python-decouple==3.*
python-decouple==3.*
lxml==5.*

0 comments on commit be9f5ab

Please sign in to comment.