-
Notifications
You must be signed in to change notification settings - Fork 0
/
alih-kata-tanpa-penyahtaksa.py
137 lines (108 loc) · 5.66 KB
/
alih-kata-tanpa-penyahtaksa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# This code was mostly generated by ChatGPT (I can't code; I just stitched it together part by part).
# I don't understand much of it myself, but README.md explains what the code does.
import pysubs2
import os
import re
# Folder scanned for the subtitle files (.ass / .ssa / .srt) to convert.
folder_masuk = "Masuk"
# Folder the converted subtitle files are saved into.
folder_keluar = "Keluar"
# Folder scanned for the .tsv dictionary files.
folder_kamus = "Kamus"
def muat_katan(file_path):
    """Load a Rumi-to-Jawi word list from a tab-separated file.

    Every non-blank line must hold exactly two tab-separated fields: the Rumi
    word and its Jawi spelling.  Rumi words are lower-cased before being used
    as keys; when the same key occurs on several lines, the Jawi spellings
    are collected in file order.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A dict mapping each lower-cased Rumi word to the list of its Jawi
        spellings.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields.
    """
    data = {}
    # Decode explicitly instead of relying on the platform default, which is
    # not UTF-8 everywhere.  "utf-8-sig" also drops a leading byte-order mark
    # so it cannot end up glued to the first key.
    with open(file_path, encoding="utf-8-sig") as file:
        for nombor, line in enumerate(file, start=1):
            baris = line.strip()
            if not baris:
                # Blank lines carry no entry; skip them instead of failing
                # on the unpacking below.
                continue
            medan = baris.split("\t")
            if len(medan) != 2:
                raise ValueError(
                    f"{file_path}, line {nombor}: expected 2 tab-separated "
                    f"fields, found {len(medan)}"
                )
            rumi, jawi = medan
            data.setdefault(rumi.lower(), []).append(jawi)
    return data
def alih_kata(padanan, translations, context):
    """Return the replacement text for a single regex match.

    The matched text is lower-cased and looked up in ``translations``.  When
    the key exists and its list is non-empty, the first entry is returned;
    otherwise the matched text is returned unchanged.

    Args:
        padanan: Regex match object; only ``group(0)`` is read.
        translations: Mapping of lower-cased words to lists of replacements.
        context: Accepted for the callers' benefit but not used here.

    Returns:
        The first replacement for the word, or the matched text itself.
    """
    asal = padanan.group(0)
    kunci = asal.lower()
    if kunci not in translations:
        return asal
    calon = translations[kunci]
    return calon[0] if len(calon) > 0 else asal
def alih_ayat(ayat, translations, padanan_tanda):
    """Convert the words and punctuation of one line of text.

    Steps, in order:
      1. Each ``\\N``, ``\\n`` or ``\\h`` that is not preceded by another
         backslash is wrapped as `` {...} `` (braces plus a space on each
         side).
      2. Every ``{...}`` group and every run of ASCII letters is passed to
         ``alih_kata`` and replaced by what it returns.
      3. Each ``?``, ``,``, ``;`` and ``.`` is replaced through
         ``padanan_tanda`` (left as-is when it has no entry).
      4. The braces added in step 1 are removed again; the padding spaces
         from step 1 stay in the text.

    Args:
        ayat: The text to convert.
        translations: Word mapping handed to ``alih_kata``.
        padanan_tanda: Mapping of punctuation marks to their replacements.

    Returns:
        The converted text.
    """
    def gantikan_tanda(padanan):
        # Punctuation without an entry in the table is kept unchanged.
        simbol = padanan.group(0)
        return padanan_tanda[simbol] if simbol in padanan_tanda else simbol

    def gantikan_katan(padanan):
        # The original, unmodified line is passed along as the context.
        return alih_kata(padanan, translations, ayat)

    hasil = re.sub(r'(?<!\\)(\\[Nnh])', r' {\1} ', ayat)
    hasil = re.sub(r'(?<!{){[^}]+}|[A-Za-z]+', gantikan_katan, hasil)
    hasil = re.sub(r'[?,;,.]', gantikan_tanda, hasil)
    pasangan_pembalut = (
        ('{\\N}', '\\N'),
        ('{\\n}', '\\n'),
        ('{\\h}', '\\h'),
    )
    for terbungkus, terbuka in pasangan_pembalut:
        hasil = hasil.replace(terbungkus, terbuka)
    return hasil
def baiki_ejaan(teks):
    """Join a stand-alone 'د' or 'ک' to the text that follows it.

    A 'د' or 'ک' that starts at a word boundary and is followed by one
    whitespace character has that whitespace removed, which repairs words
    such as 'د سکوله'.

    Args:
        teks: The text to repair.

    Returns:
        The text with those gaps closed.
    """
    # The passes run one after the other ('د' first, then 'ک'); the second
    # pass sees the output of the first.
    for corak, ganti in ((r'\bد\s', 'د'), (r'\bک\s', 'ک')):
        teks = re.sub(corak, ganti, teks)
    return teks
def alih_kata_sarikata(file_path, translations, padanan_tanda, padanan_tanggaman_akhiran, padanan_tanggaman_awalan):
    """Convert the text of every non-comment event in one subtitle file.

    The file is loaded with ``pysubs2.load``, each event whose ``is_comment``
    is false gets its text rewritten, and the result is saved into
    ``folder_keluar`` under the original base name prefixed with
    "[Dialih Kata]".

    Args:
        file_path: Path of the subtitle file to load.
        translations: Word mapping used by ``alih_ayat`` / ``alih_kata``
            (looked up with lower-cased words).
        padanan_tanda: Punctuation mapping handed to ``alih_ayat``.
        padanan_tanggaman_akhiran: Mapping of word endings to the text that
            is appended in their place; tried in iteration order.
        padanan_tanggaman_awalan: Mapping of word beginnings to the text that
            is prepended in their place; tried in iteration order.

    Returns:
        A dict mapping an event index to the list of Latin-letter words in
        that event that were neither in ``translations`` nor converted by the
        suffix/prefix fallback.
    """
    # NOTE(review): the nesting below was reconstructed from the statement
    # order; confirm it against the original file.
    sarikata = pysubs2.load(file_path)
    tidak_teralih = {}
    # Placeholder -> original "{...}" text.  Created once per file, so it
    # keeps growing across events and every event re-applies all of it.
    extracted_content = {}
    for i, dialog in enumerate(sarikata.events):
        if dialog.is_comment:
            continue
        teks_dialog = dialog.text
        # Swap every "{...}" group for a "__<n>__" placeholder so the word
        # conversion below does not see its contents.
        extracted = re.findall(r'{(.*?)}', teks_dialog)
        for extract in extracted:
            placeholder = f'__{len(extracted_content)}__'
            extracted_content[placeholder] = '{' + extract + '}'
            teks_dialog = teks_dialog.replace('{' + extract + '}', placeholder)
        dialog_terjemah = baiki_ejaan(alih_ayat(teks_dialog, translations, padanan_tanda))
        # Every converted line is prefixed with the "{\fe-1}" override tag.
        dialog_terjemah = "{\\fe-1}" + dialog_terjemah
        # Put the original "{...}" groups back in place of the placeholders.
        for placeholder, content in extracted_content.items():
            dialog_terjemah = dialog_terjemah.replace(placeholder, content)
        dialog.text = dialog_terjemah
        # Collect runs of ASCII letters from the placeholder version of the
        # line.  The lookarounds appear intended to skip letters that belong
        # to \N / \n / \h or touch a brace or backslash -- TODO confirm.
        katan_rumi_jawi = re.findall(r'(?<!{)(?<!\\)(?:\\\\)*(?<!\\[Nnh])\b[A-Za-z]+(?<!\\)(?!})(?<!\\[Nnh])', teks_dialog)
        for katan in katan_rumi_jawi:
            # Fallback for words missing from the dictionary; it only runs
            # when the placeholder version of the line has no "{...}" left.
            if katan.lower() not in translations and not re.search(r'(?<!\\){[^}]+}', teks_dialog):
                # NOTE(review): the word was just found to be absent from
                # translations, so this call hands back the matched text.
                katan_terjemah = alih_kata(re.search(katan, teks_dialog), translations, teks_dialog)
                if katan_terjemah == katan:
                    # Suffix fallback: convert the stem, then append the
                    # mapped suffix.  A stem that is not in translations is
                    # returned unchanged by alih_kata, giving Latin stem +
                    # mapped suffix.
                    for tanggaman_akhiran in padanan_tanggaman_akhiran:
                        if katan.lower().endswith(tanggaman_akhiran):
                            katan_terjemah = alih_kata(re.search(katan[:-len(tanggaman_akhiran)], teks_dialog), translations, teks_dialog) + padanan_tanggaman_akhiran[tanggaman_akhiran]
                            break
                if katan_terjemah == katan:
                    # Prefix fallback, tried only if the suffix step left the
                    # word unchanged.
                    for tanggaman_awalan in padanan_tanggaman_awalan:
                        if katan.lower().startswith(tanggaman_awalan):
                            katan_terjemah = padanan_tanggaman_awalan[tanggaman_awalan] + alih_kata(re.search(katan[len(tanggaman_awalan):], teks_dialog), translations, teks_dialog)
                            break
                if katan_terjemah != katan:
                    # NOTE(review): plain substring replace on the line after
                    # the "{...}" groups were restored, so a short word can
                    # also be replaced inside other words or inside a tag.
                    dialog_terjemah = dialog_terjemah.replace(katan, katan_terjemah)
                else:
                    # Nothing worked: record the word under this event index.
                    tidak_teralih.setdefault(i, []).append(katan)
        sarikata.events[i].text = dialog_terjemah
    laluan_sarikata = os.path.join(folder_keluar, "[Dialih Kata]" + os.path.basename(file_path))
    sarikata.save(laluan_sarikata)
    return tidak_teralih
# Build the combined Rumi -> Jawi dictionary from every .tsv file in the
# dictionary folder.  os.listdir() returns names in arbitrary order, so the
# names are sorted to make it predictable which file wins when the same Rumi
# word appears in more than one file (the later file replaces the entry).
katan_kamus = {}
for file in sorted(os.listdir(folder_kamus)):
    if file.endswith(".tsv"):
        file_path = os.path.join(folder_kamus, file)
        katan_kamus.update(muat_katan(file_path))

# Punctuation replacements applied to every converted line.
padanan_tanda = {"?": "؟", ";": "⁏", ",": "⹁", ".": "."}
# Word endings and beginnings used as a fallback for words that are not in
# the dictionary; they are tried in the order written here.
padanan_tanggaman_akhiran = {"lah": "له", "kah": "که", "nya": "ڽ", "kan": "کن", "i": "ي", "ku": "کو", "mu": "مو"}
padanan_tanggaman_awalan = {"ber": "بر", "mem": "مم", "meng": "مڠ", "se": "س", "tak": "تق", "per": "ڤر"}

# The converted files are saved into folder_keluar; create it up front so
# saving does not fail when the folder is missing.
os.makedirs(folder_keluar, exist_ok=True)

# Convert every subtitle file in the input folder and remember, per file,
# the words that could not be converted.
files_untranslated = {}
for fail in sorted(os.listdir(folder_masuk)):
    if not fail.endswith((".ass", ".ssa", ".srt")):
        continue
    print(f"Mengalih kata: {fail}\n---")
    laluan_fail = os.path.join(folder_masuk, fail)
    tidak_teralih = alih_kata_sarikata(laluan_fail, katan_kamus, padanan_tanda, padanan_tanggaman_akhiran, padanan_tanggaman_awalan)
    if tidak_teralih:
        files_untranslated[fail] = tidak_teralih

# Report the unconverted words, grouped by file and by 1-based event number.
for fail, tidak_teralih in files_untranslated.items():
    print(f"Perkataan yang tidak teralih dalam \"{fail}\":")
    for dialog_index, katan_gagal in tidak_teralih.items():
        print(f"Baris {dialog_index + 1}: {katan_gagal}")
    print()
print(
    "Pengalih tulisan ini tidaklah sempurna, sentuhan manusia tetap juga diperlukan."
)