Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Translation Alignments: Port translation alignment endpoints from Flask app to ATLAS #4

Merged
merged 37 commits into from
Apr 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
2028e6b
improve admin
jacobwegner Jul 5, 2019
ca306b4
first pass at a VersionAlignment model
jacobwegner Jul 5, 2019
7ef593a
first pass at alignment chunk
jacobwegner Jul 5, 2019
14b661b
update schema for referencing alignments from Line
jacobwegner Jul 5, 2019
f0df5b6
add alignment chunks to schema; add sample queries
jacobwegner Jul 5, 2019
aad9d8a
update fixture
jacobwegner Jul 5, 2019
0eb39b4
fix fixture
jacobwegner Jul 5, 2019
b7c63c0
port alignment chunk loading from the Flask app
jacobwegner Jul 8, 2019
42cecdc
first pass at implementing a reference-based filter
jacobwegner Jul 9, 2019
1b75b71
ensure distinct
jacobwegner Jul 9, 2019
cd1c64a
quick pass at reference filter for lines endpoint
jacobwegner Jul 12, 2019
d13a172
upgrade to latest graphene-django
jacobwegner Aug 20, 2019
c37b1d6
Merge branch 'master' into feature/translation-alignments
jacobwegner Nov 14, 2019
da55a61
update for SQLite
jacobwegner Nov 14, 2019
0cd2604
update db prep script
jacobwegner Nov 14, 2019
345b87e
drop contains relation in favor of a computed property
jacobwegner Nov 14, 2019
14ea562
switch to bulk create
jacobwegner Nov 14, 2019
6c448c9
set batch_size
jacobwegner Nov 14, 2019
6ee0bd2
factor out common functionality as LineReferenceFilterMixin
jacobwegner Nov 14, 2019
fef8bc9
update README for database changes and ref filter
jacobwegner Nov 15, 2019
e906234
update comments
jacobwegner Nov 15, 2019
fa8c408
return alignment chunks that start within ref
jacobwegner Nov 30, 2019
b1b1ac9
add the ability to filter alignment chunks via idx
jacobwegner Dec 4, 2019
57b49b0
Merge branch 'develop' into feature/translation-alignments
jacobwegner Mar 16, 2020
7cc4efc
reset migrations
jacobwegner Mar 16, 2020
3e80aec
fix importer
jacobwegner Mar 16, 2020
6e1a8b7
fix alignments importer
jacobwegner Mar 16, 2020
b554f8a
restore alignment chunk filtering
jacobwegner Mar 16, 2020
c5874b3
Merge branch 'develop' into feature/translation-alignments
jacobwegner Mar 19, 2020
e62a5f6
fix failing tests
jacobwegner Mar 19, 2020
e48a816
ignore .coverage
jacobwegner Mar 19, 2020
3460898
fix up migrations
jacobwegner Mar 19, 2020
2c9e604
fix AlignmentChunk.contains
jacobwegner Mar 19, 2020
bf36d5d
prefer TextAlignment and TextAlignmentChunk
jacobwegner Mar 19, 2020
aaafcf6
don't assume text part chunk kind
jacobwegner Mar 19, 2020
8439315
filter chunks that contain a reference
jacobwegner Mar 23, 2020
69d2846
Merge branch 'develop' into feature/translation-alignments
jacobwegner Apr 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ Pipfile
Pipfile.lock
.venv/
.hypothesis/
.coverage
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,49 @@ level of `Version` nodes, maintaining the tree structure in the final payload.
}
```

## Text Alignments

### Sample Queries

Get text alignment chunks for a given reference:
```
{
textAlignmentChunks(reference: "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:1.8") {
edges {
cursor
node {
id
citation
items
alignment {
name
}
}
}
}
}
```

Get a version annotated with text alignment chunks:
```
{
versions (urn:"urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:") {
edges {
node {
metadata,
textAlignmentChunks (first:2){
edges {
node {
citation
}
}
}
}
}
}
}
```

## Tests

Invoke tests via:
Expand Down
20 changes: 20 additions & 0 deletions data/alignments/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"alignments": [
{
"version_urn": "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2",
"content_path": "tlg0012.tlg001.perseus-grc2_aligned_to_tlg0012.tlg001.perseus-eng3.csv",
"metadata": {
"aligned_version_urn": "urn:cts:greekLit:tlg0012.tlg001.perseus-eng3",
"name": "Iliad (English)"
}
},
{
"version_urn": "urn:cts:greekLit:tlg0012.tlg002.perseus-grc2",
"content_path": "tlg0012.tlg002.perseus-grc2_aligned_to_tlg0012.tlg002.perseus-eng3.csv",
"metadata": {
"aligned_version_urn": "urn:cts:greekLit:tlg0012.tlg002.perseus-eng3",
"name": "Odyssey (English)"
}
}
]
}

Large diffs are not rendered by default.

Large diffs are not rendered by default.

26 changes: 24 additions & 2 deletions readhomer_atlas/library/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,33 @@
from treebeard.admin import TreeAdmin
from treebeard.forms import movenodeform_factory

from .models import Node
from .models import Node, TextAlignment, TextAlignmentChunk


@admin.register(Node)
class NodeAdmin(TreeAdmin):
    """Admin for `Node`, built on treebeard's `TreeAdmin`."""

    # Form produced by treebeard's `movenodeform_factory` for the `Node` model.
    form = movenodeform_factory(Node)


admin.site.register(Node, NodeAdmin)
@admin.register(TextAlignment)
class TextAlignmentAdmin(admin.ModelAdmin):
    """Admin configuration for `TextAlignment` records."""

    # Columns shown in the change list.
    list_display = ("id", "name", "slug", "metadata", "version")
    # Sidebar filter: narrow alignments by the version they belong to.
    list_filter = ("version",)
    search_fields = ("name", "slug")
    # Have the admin form pre-fill `slug` from `name`.
    prepopulated_fields = {"slug": ["name"]}


@admin.register(TextAlignmentChunk)
class TextAlignmentChunkAdmin(admin.ModelAdmin):
    """Admin configuration for `TextAlignmentChunk` records."""

    # Columns shown in the change list.
    list_display = (
        "id",
        "citation",
        "metadata",
        "idx",
        "version",
        "alignment",
        "start",
        "end",
    )
    list_filter = ("version", "alignment")
    # Edit these relations through a raw id input instead of a select box.
    # NOTE(review): presumably because they point at the (large) Node table --
    # confirm against the model definitions.
    raw_id_fields = ("start", "end", "version")
4 changes: 4 additions & 0 deletions readhomer_atlas/library/importers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from . import alignments, versions


__all__ = ["alignments", "versions"]
189 changes: 189 additions & 0 deletions readhomer_atlas/library/importers/alignments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import csv
import json
import os
import re

from django.conf import settings

from ..models import Node, TextAlignment, TextAlignmentChunk


# Directory containing the alignment CSV files and their metadata manifest.
ALIGNMENTS_DATA_PATH = os.path.join(settings.PROJECT_ROOT, "data", "alignments")
# JSON manifest listing the alignments to import.
ALIGNMENTS_METADATA_PATH = os.path.join(ALIGNMENTS_DATA_PATH, "metadata.json")

# Markers appended to each content tuple, describing how a referenced line
# relates to the milestones that mention it:
# - UNKNOWN: no determination was made
# - NEW: the reference occurs in exactly one milestone
# - CONTINUES: this milestone is the first of several mentioning the reference
# - CONTINUATION: the reference was first mentioned by an earlier milestone
LINE_KIND_UNKNOWN = None
LINE_KIND_NEW = "new"
LINE_KIND_CONTINUES = "continues"
LINE_KIND_CONTINUATION = "continuation"

# A bare `<digits>.<digits>` reference, e.g. `1.8`.
CITATION_REFERENCE_REGEX = re.compile(r"\d+\.\d+")
# The same reference wrapped in square brackets, as it appears as a heading
# inside the content column, e.g. `[1.8]`.
CONTENT_REFERENCE_REGEX = re.compile(r"\[\d+\.\d+\]")
# A bracketed integer, e.g. `[3]`; treated as a footnote marker and removed.
CONTENT_FOOTNOTE_REGEX = re.compile(r"\[\d+\]")


def get_citation(greek_content):
    """
    Build a citation string from the references found in `greek_content`.

    The `citation` column in the CSV contains inaccuracies, so the citation
    is derived from the content instead: the first and last references are
    joined with a hyphen, or the single reference is returned on its own
    when both are the same.

    Raises `IndexError` when the content contains no reference at all.
    """
    found = CITATION_REFERENCE_REGEX.findall(greek_content)
    first, last = found[0], found[-1]
    return first if first == last else f"{first}-{last}"


def transform_greek_content(greek_content):
    """
    Split a milestone's Greek content into per-reference segments.

    Footnote markers are removed, then the content is cut at every reference
    heading (e.g. `[1.8]`).  Any text preceding the first heading is dropped.

    Returns a list of `(reference_label, text)` tuples in document order,
    where `reference_label` is the heading without its brackets and `text`
    is the space-joined tokens up to the next heading.

    Raises `ValueError` if a heading found in the content is not a
    standalone, space-delimited token.
    """
    greek_content = CONTENT_FOOTNOTE_REGEX.sub("", greek_content)
    references = CONTENT_REFERENCE_REGEX.findall(greek_content)
    tokens = [v for v in greek_content.split(" ") if v]

    # Locate every heading, resuming each search just after the previous
    # hit.  Searching from the start of the list every time would resolve a
    # repeated heading to its first occurrence (producing an empty segment
    # and a crash) and would rescan the token list for each reference.
    heading_positions = []
    search_from = 0
    for reference in references:
        ref_idx = tokens.index(reference, search_from)
        heading_positions.append(ref_idx)
        search_from = ref_idx + 1

    # Each segment runs up to the next heading; the last one runs to the end.
    segment_ends = heading_positions[1:] + [len(tokens)]

    content = []
    for ref_idx, next_idx in zip(heading_positions, segment_ends):
        ref_label = CITATION_REFERENCE_REGEX.findall(tokens[ref_idx])[0]
        content.append((ref_label, " ".join(tokens[ref_idx + 1:next_idx])))
    return content


def map_leaves_to_milestones(leaves_to_milestones_lu, milestone_id, pos, leaves):
    """
    Record, for every reference in `leaves`, that the milestone at position
    `pos` mentions it.

    `leaves_to_milestones_lu` is updated in place and maps each reference to
    the list of milestone positions seen so far.  `leaves` is an iterable of
    `(reference, text)` pairs; only the reference is used.  `milestone_id`
    is accepted but not used.
    """
    for reference, _text in leaves:
        positions = leaves_to_milestones_lu.setdefault(reference, [])
        positions.append(pos)


def get_alignment_milestones(path):
    """
    Parse an alignment CSV into milestones and a reference index.

    Each data row is read as `milestone_id, citation, greek_content,
    english_content` (the citation column is ignored; see below).

    Returns a dict with two keys:

    - `"ALIGNMENTS"`: a list of milestone dicts with `id`, `citation`,
      `greek_content` (a list of `(reference, text, line_kind)` tuples) and
      `english_content` (a single `(citation, text, LINE_KIND_UNKNOWN)`
      tuple in a list).
    - `"LEAVES_TO_MILESTONES"`: a dict mapping each reference to the
      positions of the milestones that mention it.
    """
    leaves_to_milestones = {}
    alignments = []

    # `newline=""` is what the csv module expects of file objects it reads.
    # NOTE(review): the data files are assumed to be UTF-8 encoded; the
    # encoding is spelled out so the result does not depend on the locale.
    with open(path, encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        # discard header row (an empty file simply yields no milestones)
        next(reader, None)
        for pos, row in enumerate(reader):
            milestone_id = row[0]
            greek_content = row[2]
            # @@@ Can't use the citation field (row[1]) as-is due to
            # discrepancies in the source material
            citation = get_citation(greek_content)
            # @@@ `map_leaves_to_milestones` could return text extracted from
            # leaves in a generator, but since the greek_content contains
            # partial references, we end up needing to just extract it from
            # the CSV data
            cleaned_greek_content = transform_greek_content(greek_content)

            # Map leaves to milestones
            map_leaves_to_milestones(
                leaves_to_milestones, milestone_id, pos, cleaned_greek_content
            )

            alignments.append(
                {
                    "id": milestone_id,
                    "citation": citation,
                    "greek_content": cleaned_greek_content,
                    "english_content": [
                        # @@@ continuation
                        (citation, row[3], LINE_KIND_UNKNOWN)
                    ],
                }
            )

    # Second pass: now that every milestone mentioning a reference is known,
    # tag each Greek segment with how it relates to the other milestones.
    for milestone_idx, alignment in enumerate(alignments):
        greek_segments = alignment["greek_content"]
        for pos, entry in enumerate(greek_segments):
            offsets = leaves_to_milestones[entry[0]]
            if offsets == [milestone_idx]:
                line_kind = LINE_KIND_NEW
            elif offsets[0] == milestone_idx:
                line_kind = LINE_KIND_CONTINUES
            else:
                line_kind = LINE_KIND_CONTINUATION
            greek_segments[pos] = entry + (line_kind,)

    return {"ALIGNMENTS": alignments, "LEAVES_TO_MILESTONES": leaves_to_milestones}


def _alignment_chunk_obj(version, line_lookup, alignment, milestone, milestone_idx):
    """
    Build an unsaved `TextAlignmentChunk` for `milestone`.

    The milestone's citation is either a single `<a>.<b>` reference or a
    `<start>-<end>` range; its endpoints are resolved through `line_lookup`
    and assigned to the chunk's `start` and `end`.

    Returns the chunk, or `None` (after printing a notice) when either
    endpoint is missing from `line_lookup`.
    """
    chunk_obj = TextAlignmentChunk(
        citation=milestone["citation"],
        idx=milestone_idx,
        version=version,
        alignment=alignment,
        items=[milestone["greek_content"], milestone["english_content"]],
    )

    # Anything other than exactly "<start>-<end>" is treated as one reference.
    pieces = milestone["citation"].split("-")
    if len(pieces) == 2:
        start_ref, end_ref = pieces
    else:
        start_ref = end_ref = milestone["citation"]

    # Round-trip both parts through int before rebuilding the lookup keys.
    start_chapter, start_verse = map(int, start_ref.split("."))
    end_chapter, end_verse = map(int, end_ref.split("."))

    try:
        start_line = line_lookup[f"{start_chapter}.{start_verse}"]
        end_line = line_lookup[f"{end_chapter}.{end_verse}"]
    except KeyError:
        # greek version mismatch; could be others
        print(
            f"Skipping milestone due to line(s) not found. [alignment.name={alignment.name} citation={milestone['citation']} milestone_id={milestone['id']}]"
        )
        return None

    chunk_obj.start = start_line
    chunk_obj.end = end_line
    return chunk_obj


def _build_line_lookup(version):
    """
    Map `ref` to node for the descendants of `version` whose `kind` equals
    the last entry of the version's `citation_scheme` metadata.
    """
    leaf_kind = version.metadata["citation_scheme"][-1]
    leaves = version.get_descendants().filter(kind=leaf_kind)
    return {line.ref: line for line in leaves}


def _import_alignment(data):
    """
    Import one alignment described by a manifest entry.

    `data` supplies `content_path` (CSV file name, relative to the
    alignments data directory), `version_urn` and `metadata` (which must
    include `name`).  The matching `TextAlignment` is created or updated,
    then one `TextAlignmentChunk` per resolvable milestone is bulk-created.
    """
    full_content_path = os.path.join(ALIGNMENTS_DATA_PATH, data["content_path"])
    milestones = get_alignment_milestones(full_content_path)

    # the version urns in data need a trailing colon
    version = Node.objects.get(urn=f'{data["version_urn"]}:')
    alignment, _ = TextAlignment.objects.update_or_create(
        version=version,
        name=data["metadata"]["name"],
        defaults={"metadata": data["metadata"]},
    )

    line_lookup = _build_line_lookup(version)
    candidates = (
        _alignment_chunk_obj(version, line_lookup, alignment, milestone, milestone_idx)
        for milestone_idx, milestone in enumerate(milestones["ALIGNMENTS"])
    )
    # Milestones whose lines could not be resolved come back as None.
    to_create = [chunk for chunk in candidates if chunk]

    created = len(TextAlignmentChunk.objects.bulk_create(to_create, batch_size=500))
    assert created == len(to_create)


def import_alignments(reset=False):
    """
    Import every alignment listed in the alignments metadata manifest.

    When `reset` is true, all existing `TextAlignment` rows are deleted
    before importing.
    """
    if reset:
        TextAlignment.objects.all().delete()

    # Read the manifest inside a context manager so the file handle is
    # closed as soon as parsing finishes instead of being left to the GC.
    with open(ALIGNMENTS_METADATA_PATH, encoding="utf-8") as f:
        alignments_metadata = json.load(f)

    for alignment_data in alignments_metadata["alignments"]:
        _import_alignment(alignment_data)
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from readhomer_atlas import constants

from .models import Node
from .urn import URN
from ..models import Node
from ..urn import URN


LIBRARY_DATA_PATH = os.path.join(settings.PROJECT_ROOT, "data", "library")
Expand Down Expand Up @@ -268,8 +268,9 @@ def get_first_value_for_language(values, lang):
return next(iter(filter(lambda x: x["lang"] == lang, values)), None).get("value")


def import_versions():
Node.objects.filter(kind="nid").delete()
def import_versions(reset=True):
if reset:
Node.objects.filter(kind="nid").delete()

library = resolve_library()

Expand Down
5 changes: 4 additions & 1 deletion readhomer_atlas/library/management/commands/prepare_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,7 @@ def handle(self, *args, **options):
call_command("migrate")

self.stdout.write("--[Loading versions]--")
importers.import_versions()
importers.versions.import_versions(reset=True)

self.stdout.write("--[Loading alignments]--")
importers.alignments.import_alignments(reset=True)
Loading