Skip to content

Commit

Permalink
Merge pull request #14 from OpenPecha/feat/read-alignment
Browse files Browse the repository at this point in the history
Feat/read alignment
  • Loading branch information
10zinten authored Jul 30, 2024
2 parents c17805d + 4b82b93 commit 965d155
Show file tree
Hide file tree
Showing 29 changed files with 1,870 additions and 201 deletions.
14 changes: 9 additions & 5 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,24 @@ jobs:

steps:
- uses: actions/checkout@v3

- name: Set up Python 3.8
uses: actions/setup-python@v3
with:
python-version: "3.8"

- name: Install dependencies
run: |
pip install -U pip
pip install .
pip install .[dev]
- name: Test with pytest
env:
GITHUB_TOKEN: "dummy value"
run: PYTHONPATH=src pytest

- name: Test Coverage
run: PYTHONPATH=src pytest --cov project_name
env:
GITHUB_TOKEN: "dummy value"
run: PYTHONPATH=src pytest --cov openpecha
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ classifiers = [

dependencies = [
"pydantic >= 2.7.4",
"stam == 0.8.2",

"stam == 0.8.4",
"PyGithub >= 2.3.0",
]

[project.optional-dependencies]
Expand Down
119 changes: 102 additions & 17 deletions src/openpecha/alignment/__init__.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,121 @@
from typing import List, Tuple
import json
from pathlib import Path
from typing import Dict, List, Tuple


# NOTE(review): empty placeholder class (no attributes or methods). In this
# diff view it is immediately followed by an import of `AlignmentMetaData`
# from `openpecha.alignment.metadata`, which appears to supersede it --
# confirm which of the two names callers are expected to use.
class AlignmentMetadata:
pass
from openpecha.alignment.metadata import AlignmentMetaData
from openpecha.config import ALIGNMENT_PATH, _mkdir
from openpecha.github_utils import (
clone_repo,
create_github_repo,
upload_folder_to_github,
)
from openpecha.ids import get_uuid
from openpecha.pecha import Pecha


class Alignment:
    """Segment-level alignment between pechas, together with its metadata.

    Attributes:
        id_: Alignment id, taken from ``metadata.id_``.
        base_path: Directory associated with this alignment on disk.
        metadata: The ``AlignmentMetaData`` describing each aligned pecha.
        segment_pairs: Mapping of pair id to ``{pecha_id: annotation_id}``.
        pechas: Mapping of pecha id to ``Pecha``; ``None`` until supplied
            by the caller or populated by :meth:`load_pechas`.
    """

    def __init__(
        self,
        metadata: AlignmentMetaData,
        base_path: Path = None,
        segment_pairs: Dict[str, Dict[str, str]] = None,
        pechas: Dict[str, Pecha] = None,
    ):
        """Create an alignment.

        Args:
            metadata: Alignment metadata; its ``id_`` becomes the alignment id.
            base_path: Directory of the alignment. When omitted, the result
                of ``_mkdir(ALIGNMENT_PATH / id_)`` is used.
            segment_pairs: Mapping of pair id to ``{pecha_id: annotation_id}``.
            pechas: Already loaded pechas keyed by pecha id.
        """
        self.id_ = metadata.id_
        # Only touch the default location when the caller gave no path.
        self.base_path = (
            base_path if base_path else _mkdir(ALIGNMENT_PATH / self.id_)
        )
        self.metadata = metadata
        self.segment_pairs = segment_pairs
        self.pechas = pechas

    @classmethod
    def from_path(cls, path: Path, pechas: Dict[str, Pecha] = None):
        """Load an alignment from a directory.

        The directory must contain ``metadata.json`` (with the keys ``id_``
        and ``segments_metadata``) and ``alignment.json`` (the segment pairs).

        Args:
            path: Alignment directory; a ``str`` is accepted and converted.
            pechas: Optional already loaded pechas keyed by pecha id.

        Raises:
            FileNotFoundError: If either JSON file is missing.
            KeyError: If ``metadata.json`` lacks an expected key.
        """
        # Accept plain strings as well; the "/" joins below need a Path.
        path = Path(path)

        with open(path / "metadata.json", encoding="utf-8") as f:
            raw_metadata = json.load(f)
        metadata = AlignmentMetaData.from_dict(
            metadata=raw_metadata["segments_metadata"],
            alignment_id=raw_metadata["id_"],
        )

        with open(path / "alignment.json", encoding="utf-8") as fp:
            segment_pairs = json.load(fp)

        return cls(
            base_path=path,
            metadata=metadata,
            segment_pairs=segment_pairs,
            pechas=pechas,
        )

    @classmethod
    def from_id(cls, alignment_id: str, output_path: Path = ALIGNMENT_PATH):
        """Clone the alignment repo named ``alignment_id`` and load it.

        Args:
            alignment_id: Id passed to ``clone_repo``.
            output_path: Directory passed to ``clone_repo`` as the clone
                destination.
        """
        alignment_path = clone_repo(alignment_id, output_path)
        # Use ``cls`` (not ``Alignment``) so subclasses get their own type back.
        return cls.from_path(alignment_path)

    @classmethod
    def from_segment_pairs(
        cls,
        segment_pairs: List[Tuple[Tuple[str, str], Tuple[str, str]]],
        metadata: AlignmentMetaData,
    ):
        """Build an alignment from raw ``((id, ann_id), (id, ann_id))`` pairs.

        Each pair is stored under a fresh id from ``get_uuid()`` as
        ``{source_id: source_ann_id, target_id: target_ann_id}``.
        """
        transformed_segment_pairs = {
            get_uuid(): {source_id: source_ann_id, target_id: target_ann_id}
            for (source_id, source_ann_id), (target_id, target_ann_id) in segment_pairs
        }
        return cls(metadata=metadata, segment_pairs=transformed_segment_pairs)

    def load_pechas(self):
        """Return the pechas of this alignment, loading them on first use.

        Pechas are fetched with ``Pecha.from_id`` for every key of
        ``metadata.segments_metadata``, but only when ``self.pechas`` is
        still ``None``; an existing mapping is returned untouched.
        """
        if self.pechas is None:
            self.pechas = {
                pecha_id: Pecha.from_id(pecha_id)
                for pecha_id in self.metadata.segments_metadata
            }
        return self.pechas

    def write(self, output_path: Path):
        """Write ``metadata.json`` and ``alignment.json`` under ``output_path/<id_>``."""
        alignment_path = _mkdir(output_path / self.id_)

        # Metadata, serialised through AlignmentMetaData.to_dict().
        with open(alignment_path / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(self.metadata.to_dict(), f, indent=2)

        # Alignment annotations (the segment pairs mapping).
        with open(alignment_path / "alignment.json", "w", encoding="utf-8") as fp:
            json.dump(self.segment_pairs, fp, indent=2)

    def get_segment_pairs(self):
        """Yield ``{pair_id: resolved_pair}`` for every stored segment pair.

        This is a generator: with no segment pairs it simply yields nothing.
        Each value comes from :meth:`get_segment_pair` and is therefore
        ``None`` when no pechas are loaded.
        """
        if not self.segment_pairs:
            return

        for id_ in self.segment_pairs:
            yield {id_: self.get_segment_pair(id_)}

    def get_segment_pair(self, id_: str):
        """Resolve one segment pair to ``{pecha_id: str(annotation)}``.

        Returns:
            The resolved mapping, or ``None`` when segment pairs or pechas
            are missing/empty.

        Raises:
            KeyError: If ``id_`` is not a known pair id.
        """
        if not self.segment_pairs or not self.pechas:
            return None

        segment_pair = {}
        for pecha_id, ann_id in self.segment_pairs[id_].items():
            # The layer type recorded in the metadata selects which
            # annotation store of the pecha holds this annotation.
            ann_type = self.metadata.segments_metadata[pecha_id].type
            pecha_ann_store = self.pechas[pecha_id].get_annotation_store(ann_type)
            ann = pecha_ann_store.annotation(ann_id)
            segment_pair[pecha_id] = str(ann)
        return segment_pair

    def upload_update_with_github(self):
        """Create the GitHub repo for this alignment and upload ``base_path``.

        The folder is uploaded only when ``create_github_repo`` returns a
        truthy value.
        """
        # NOTE(review): nothing happens when create_github_repo() returns a
        # falsy value, so an "update existing repo" path is not implemented
        # here -- confirm whether that is intended.
        repo_created = create_github_repo(self.id_)
        if repo_created:
            upload_folder_to_github(self.id_, self.base_path)
81 changes: 81 additions & 0 deletions src/openpecha/alignment/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from enum import Enum
from typing import Dict

from pydantic import BaseModel, model_validator

from openpecha.ids import get_alignment_id
from openpecha.pecha.layer import LayerEnum


class AlignmentRelationEnum(Enum):
    """Role of a segment within an alignment; values are the serialised strings."""

    source = "Source"
    target = "Target"


class LanguageEnum(Enum):
    """Languages a segment can be written in; values are the serialised strings."""

    tibetan = "Tibetan"
    english = "English"
    chinese = "Chinese"
    sanskrit = "Sanskrit"


class SegmentMetaData(BaseModel):
    """Metadata describing one side (one pecha) of an alignment."""

    # Layer type of the segment annotations.
    type: LayerEnum
    # Whether this side is the source or the target of the alignment.
    relation: AlignmentRelationEnum
    # Language of the segment text.
    lang: LanguageEnum
    # NOTE(review): presumably identifies the base text the layer applies to -- confirm.
    base: str
    # NOTE(review): presumably identifies the layer holding the segments -- confirm.
    layer: str

    def to_dict(self) -> Dict:
        """Return a plain dict of the fields, with enum members replaced by their values."""
        return {
            "type": self.type.value,
            "relation": self.relation.value,
            "lang": self.lang.value,
            "base": self.base,
            "layer": self.layer,
        }


class AlignmentMetaData(BaseModel):
    """Metadata of an alignment: its id plus per-pecha segment metadata."""

    # Alignment id; filled in by ``set_id`` when missing or None.
    id_: str
    # Segment metadata keyed by the id of the pecha it describes.
    segments_metadata: Dict[str, SegmentMetaData]

    @classmethod
    def from_dict(cls, metadata: Dict, alignment_id: str = None) -> "AlignmentMetaData":
        """Build the metadata from a ``{segment_source_id: {...}}`` mapping.

        Args:
            metadata: Mapping whose values carry the keys ``type``,
                ``relation``, ``lang``, ``base`` and ``layer``.
            alignment_id: Id of the alignment; when ``None`` a new one is
                generated by the ``set_id`` validator.

        Raises:
            KeyError: If a segment entry lacks one of the expected keys.
            ValueError: If ``type``, ``relation`` or ``lang`` is not a valid
                value of its enum.
        """
        segments_metadata: Dict[str, SegmentMetaData] = {}
        for segment_source_id, segment_metadata in metadata.items():
            # Convert explicitly so that an unknown value fails right here
            # with the offending enum's own error.
            segments_metadata[segment_source_id] = SegmentMetaData(
                type=LayerEnum(segment_metadata["type"]),
                relation=AlignmentRelationEnum(segment_metadata["relation"]),
                lang=LanguageEnum(segment_metadata["lang"]),
                base=segment_metadata["base"],
                layer=segment_metadata["layer"],
            )

        return cls(id_=alignment_id, segments_metadata=segments_metadata)

    @model_validator(mode="before")
    @classmethod
    def set_id(cls, values):
        """Supply a generated alignment id when ``id_`` is absent or ``None``."""
        # A "before" validator receives the raw input, which is not
        # guaranteed to be a dict; anything else is passed through untouched.
        if isinstance(values, dict) and values.get("id_") is None:
            # Build a new dict instead of mutating the caller's input.
            values = {**values, "id_": get_alignment_id()}
        return values

    def to_dict(self) -> Dict:
        """Return a plain dict of the metadata, with enum members replaced by their values."""
        return {
            "id_": self.id_,
            "segments_metadata": {
                segment_id: segment.to_dict()
                for segment_id, segment in self.segments_metadata.items()
            },
        }
Loading

0 comments on commit 965d155

Please sign in to comment.