generated from OpenPecha/python-project-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #14 from OpenPecha/feat/read-alignment
Feat/read alignment
- Loading branch information
Showing
29 changed files
with
1,870 additions
and
201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,36 +1,121 @@ | ||
from typing import List, Tuple | ||
import json | ||
from pathlib import Path | ||
from typing import Dict, List, Tuple | ||
|
||
|
||
class AlignmentMetadata: | ||
pass | ||
from openpecha.alignment.metadata import AlignmentMetaData | ||
from openpecha.config import ALIGNMENT_PATH, _mkdir | ||
from openpecha.github_utils import ( | ||
clone_repo, | ||
create_github_repo, | ||
upload_folder_to_github, | ||
) | ||
from openpecha.ids import get_uuid | ||
from openpecha.pecha import Pecha | ||
|
||
|
||
class Alignment:
    """Segment-level alignment between pechas, together with its metadata.

    Attributes:
        id_: Alignment id, taken from ``metadata.id_``.
        base_path: Directory associated with this alignment on disk.
        metadata: The ``AlignmentMetaData`` describing each aligned pecha.
        segment_pairs: Mapping ``pair_id -> {pecha_id: annotation_id}``.
        pechas: Mapping ``pecha_id -> Pecha``; ``None`` until supplied or
            loaded with :meth:`load_pechas`.
    """

    def __init__(
        self,
        metadata: AlignmentMetaData,
        base_path: Path = None,
        segment_pairs: Dict[str, Dict[str, str]] = None,
        pechas: Dict[str, Pecha] = None,
    ):
        """Create an alignment.

        Args:
            metadata: Alignment metadata; its ``id_`` becomes the alignment id.
            base_path: Directory of the alignment. When falsy, the directory
                ``ALIGNMENT_PATH / id_`` is created via ``_mkdir`` and used.
            segment_pairs: Mapping ``pair_id -> {pecha_id: annotation_id}``.
            pechas: Already-loaded pechas keyed by pecha id.
        """
        self.id_ = metadata.id_
        self.base_path = base_path if base_path else _mkdir(ALIGNMENT_PATH / self.id_)
        self.metadata = metadata
        self.segment_pairs = segment_pairs
        self.pechas = pechas

    @classmethod
    def from_path(cls, path: Path, pechas: Dict[str, Pecha] = None):
        """Load an alignment from a directory.

        The directory must contain ``metadata.json`` (with the keys ``id_``
        and ``segments_metadata``) and ``alignment.json`` (the segment pairs).

        Args:
            path: Alignment directory; a ``str`` is accepted as well.
            pechas: Optional already-loaded pechas keyed by pecha id.

        Returns:
            A new instance whose ``base_path`` is ``path``.
        """
        # Coerce so that callers passing a plain string do not fail on "/".
        path = Path(path)

        with open(path / "metadata.json", encoding="utf-8") as f:
            raw_metadata = json.load(f)
        metadata = AlignmentMetaData.from_dict(
            metadata=raw_metadata["segments_metadata"],
            alignment_id=raw_metadata["id_"],
        )

        with open(path / "alignment.json", encoding="utf-8") as fp:
            segment_pairs = json.load(fp)

        return cls(
            base_path=path,
            metadata=metadata,
            segment_pairs=segment_pairs,
            pechas=pechas,
        )

    @classmethod
    def from_id(cls, alignment_id: str, output_path: Path = ALIGNMENT_PATH):
        """Clone the alignment repository and load the alignment from it.

        Args:
            alignment_id: Id passed to ``clone_repo``.
            output_path: Destination passed to ``clone_repo``.

        Returns:
            The alignment loaded from the path returned by ``clone_repo``.
        """
        # NOTE(review): assumes clone_repo returns the cloned directory path.
        alignment_path = clone_repo(alignment_id, output_path)
        # Use cls (not the concrete class) so subclasses get their own type.
        return cls.from_path(alignment_path)

    @classmethod
    def from_segment_pairs(
        cls,
        segment_pairs: List[Tuple[Tuple[str, str], Tuple[str, str]]],
        metadata: AlignmentMetaData,
    ):
        """Build an alignment from raw pairs, assigning a uuid to each pair.

        Args:
            segment_pairs: Items of the form
                ``((source_id, source_ann_id), (target_id, target_ann_id))``.
            metadata: Alignment metadata.

        Returns:
            A new instance whose ``segment_pairs`` maps a fresh ``get_uuid()``
            key to ``{source_id: source_ann_id, target_id: target_ann_id}``.
        """
        transformed_segment_pairs = {
            get_uuid(): {source_id: source_ann_id, target_id: target_ann_id}
            for (source_id, source_ann_id), (target_id, target_ann_id) in segment_pairs
        }
        return cls(metadata=metadata, segment_pairs=transformed_segment_pairs)

    def load_pechas(self):
        """Return the pechas of this alignment, loading them on first use.

        When ``self.pechas`` is ``None``, every key of
        ``metadata.segments_metadata`` is loaded with ``Pecha.from_id`` and
        the result is cached on the instance. Pechas that were already
        supplied are returned as-is without loading anything.
        """
        if self.pechas is None:
            self.pechas = {
                pecha_id: Pecha.from_id(pecha_id)
                for pecha_id in self.metadata.segments_metadata
            }
        return self.pechas

    def write(self, output_path: Path):
        """Write ``metadata.json`` and ``alignment.json`` to disk.

        Both files go into ``output_path / id_``, which is created through
        ``_mkdir``. ``self.base_path`` is not changed by this method.

        Args:
            output_path: Parent directory; a ``str`` is accepted as well.
        """
        alignment_path = _mkdir(Path(output_path) / self.id_)

        # Write metadata.
        metadata_path = alignment_path / "metadata.json"
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(self.metadata.to_dict(), f, indent=2)

        # Write alignment annotations.
        ann_path = alignment_path / "alignment.json"
        with open(ann_path, "w", encoding="utf-8") as fp:
            json.dump(self.segment_pairs, fp, indent=2)

    def get_segment_pairs(self):
        """Iterate over all segment pairs.

        This is a generator: it yields one ``{pair_id: segment_pair}`` dict
        per entry of ``self.segment_pairs``, where ``segment_pair`` is the
        result of :meth:`get_segment_pair` (``None`` when no pechas are
        loaded). Nothing is yielded when there are no segment pairs.
        """
        if not self.segment_pairs:
            return

        for id_ in self.segment_pairs:
            yield {id_: self.get_segment_pair(id_)}

    def get_segment_pair(self, id_: str):
        """Resolve one segment pair to the string form of its annotations.

        Args:
            id_: Key of the pair in ``self.segment_pairs``.

        Returns:
            ``{pecha_id: str(annotation)}`` for every pecha in the pair, or
            ``None`` when ``segment_pairs`` or ``pechas`` is empty or unset.

        Raises:
            KeyError: If ``id_`` is not a key of ``self.segment_pairs``.
        """
        if not self.segment_pairs or not self.pechas:
            return None

        segment_pair = {}
        for pecha_id, ann_id in self.segment_pairs[id_].items():
            # The layer type recorded in the metadata selects which
            # annotation store of the pecha holds the annotation.
            ann_type = self.metadata.segments_metadata[pecha_id].type
            pecha_ann_store = self.pechas[pecha_id].get_annotation_store(ann_type)
            ann = pecha_ann_store.annotation(ann_id)
            # NOTE(review): presumably str(ann) is the annotated text; confirm.
            segment_pair[pecha_id] = str(ann)
        return segment_pair

    def upload_update_with_github(self):
        """Create the GitHub repository and upload ``base_path`` to it.

        The folder is uploaded only when ``create_github_repo`` returns a
        truthy value.

        NOTE(review): updating an already existing repository is not
        implemented here; in that case this method currently does nothing.
        NOTE(review): ``base_path`` is uploaded, which may differ from the
        directory used by ``write``; confirm callers write there first.
        """
        repo_created = create_github_repo(self.id_)
        if repo_created:
            upload_folder_to_github(self.id_, self.base_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from enum import Enum | ||
from typing import Dict | ||
|
||
from pydantic import BaseModel, model_validator | ||
|
||
from openpecha.ids import get_alignment_id | ||
from openpecha.pecha.layer import LayerEnum | ||
|
||
|
||
class AlignmentRelationEnum(Enum):
    """Role a segment's pecha plays in an alignment."""

    source = "Source"
    target = "Target"
|
||
|
||
class LanguageEnum(Enum):
    """Languages a segment can be written in."""

    tibetan = "Tibetan"
    english = "English"
    chinese = "Chinese"
    sanskrit = "Sanskrit"
|
||
|
||
class SegmentMetaData(BaseModel):
    """Metadata of one side of an alignment."""

    # Layer type of the aligned annotations.
    type: LayerEnum
    # Whether this side is the source or the target.
    relation: AlignmentRelationEnum
    # Language of the segment text.
    lang: LanguageEnum
    # NOTE(review): presumably the base text and layer identifiers; confirm.
    base: str
    layer: str

    def to_dict(self) -> Dict:
        """Return a plain dict with every enum field replaced by its value."""
        return {
            "type": self.type.value,
            "relation": self.relation.value,
            "lang": self.lang.value,
            "base": self.base,
            "layer": self.layer,
        }
|
||
|
||
class AlignmentMetaData(BaseModel):
    """Metadata of an alignment: its id and the metadata of each segment."""

    # Alignment id; generated by the ``set_id`` validator when missing or None.
    id_: str
    # Per-segment metadata keyed by segment source id.
    segments_metadata: Dict[str, SegmentMetaData]

    @classmethod
    def from_dict(cls, metadata: Dict, alignment_id: str = None) -> "AlignmentMetaData":
        """Build the metadata from plain dictionaries.

        Args:
            metadata: Mapping ``segment_source_id -> dict`` where each dict has
                the keys ``type``, ``relation``, ``lang``, ``base`` and
                ``layer``; the first three hold enum *values*.
            alignment_id: Id of the alignment. When ``None`` a new id is
                generated by the ``set_id`` validator.

        Raises:
            KeyError: If a segment dict lacks one of the required keys.
            ValueError: If an enum value is not a valid member value.
        """
        segments_metadata: Dict[str, SegmentMetaData] = {}
        for segment_source_id, segment_metadata in metadata.items():
            # Local names avoid shadowing the builtin ``type``.
            layer_type = LayerEnum(segment_metadata["type"])
            relation = AlignmentRelationEnum(segment_metadata["relation"])
            lang = LanguageEnum(segment_metadata["lang"])

            segments_metadata[segment_source_id] = SegmentMetaData(
                type=layer_type,
                relation=relation,
                lang=lang,
                base=segment_metadata["base"],
                layer=segment_metadata["layer"],
            )

        return cls(id_=alignment_id, segments_metadata=segments_metadata)

    @model_validator(mode="before")
    @classmethod
    def set_id(cls, values):
        """Fill in ``id_`` with a fresh alignment id when missing or ``None``.

        Runs before field validation. Non-dict input is passed through
        untouched, and the caller's dict is copied rather than mutated.
        """
        if isinstance(values, dict) and values.get("id_") is None:
            values = {**values, "id_": get_alignment_id()}
        return values

    def to_dict(self) -> Dict:
        """Return a plain dict; each segment is converted with its ``to_dict``."""
        return {
            "id_": self.id_,
            "segments_metadata": {
                segment_id: segment.to_dict()
                for segment_id, segment in self.segments_metadata.items()
            },
        }
Oops, something went wrong.