Skip to content

Commit

Permalink
change
Browse files Browse the repository at this point in the history
  • Loading branch information
BrianBu47 committed Jun 18, 2024
1 parent 533bee0 commit 7619477
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 0 deletions.
95 changes: 95 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from flask import Flask, request, send_file
import main
from datetime import datetime
import json
import redis

app = Flask(__name__)

'''
configuration phase
'''
# load config
with open("config.json", "r") as f:
    configfile = json.load(f)

# model pipeline config: builds the global translation pipeline in `main`
main.pipeinit(configfile["Model"])

# redis config — caching is best-effort: if the connection fails we run
# without a cache rather than refusing to start.
redis_enabled = False
r = None
cache_prefix = None
try:
    r = redis.Redis(host=configfile["Redis"]["address"],
                    port=configfile["Redis"]["port"],
                    password=configfile["Redis"]["password"],
                    decode_responses=True
                    )
    cache_prefix = configfile["Redis"]["prefix"]
    # round-trip probe to verify the connection actually works
    r.set('foo', 'bar')
    r.get('foo')
    # BUG FIX: this flag was never set to True, so the cache branch in
    # auto_translate() was permanently dead code.
    redis_enabled = True
    print("redis initiated")
except Exception as e:  # narrow from bare `except:`; log the reason
    print("redis not connected", e)

# application port config (default 9900 when config leaves it blank)
port = "9900"
if configfile["Application"]["port"] != "":
    port = configfile["Application"]["port"]
# application from and to are not applicable here

# basic translation helper
def translate_one(data):
    """Translate a single string by delegating to main.translate_batch."""
    return main.translate_batch([data])[0]
'''
controllers
'''
# GET endpoint consumed by the XUnity auto-translator client.
@app.route("/translate", methods=["GET"])
def auto_translate():
    """Translate the `text` query parameter, serving from the Redis cache
    when available and populating it on a miss.

    Returns the translated string as the response body.
    """
    data = request.args.get('text')
    print(data)
    # BUG FIX: request.args.get returns None when the param is absent,
    # which previously crashed on `cache_prefix + data`.
    if data is None:
        return "missing 'text' parameter", 400
    # Check Redis cache first when a connection was established at startup.
    if redis_enabled:
        rd = r.get(cache_prefix + data)
        if rd is not None:
            print("cache hit", data, "=>", rd)
            return rd
        print("cache miss")
        start = datetime.now()  # renamed from `time` to avoid confusion with the stdlib module
        result = translate_one(data)
        print("time:", datetime.now() - start)
        if r.set(cache_prefix + data, result):
            print("cache in", data, "=>", result)
        return result
    # no cache available: translate directly
    start = datetime.now()
    result = translate_one(data)
    print("time:", datetime.now() - start)
    print(result)
    return result
# serves the static JSON-upload interface page
@app.route("/translate/upload", methods=["GET"])
def show_upload():
    """Return the contents of interface.html as the response body."""
    with open("interface.html", "r") as page:
        html = page.read()
    return html
# this translates a posted JSON object of {source: anything} -> {source: translation}
# NOTE(review): synchronous by design — acceptable at current single-machine capacity.
@app.route("/translate/json", methods=["POST"])
def json_translate():
    """Translate every key of the posted JSON object and send back a JSON
    file mapping each original key to its translation.
    """
    keys = list(request.json.keys())
    # renamed local from `r` to avoid shadowing the module-level Redis client
    translations = main.translate_batch(keys)
    result = dict(zip(keys, translations))
    # BUG FIX: the timestamped filename was computed but never used in the
    # file path, so every request clobbered the same file.
    filename = datetime.now().strftime("%y%m%d%H%M%S")
    path = f"tmp/{filename}.json"
    with open(path, "w") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    return send_file(path)
'''
start up application
'''
# Bind on all interfaces at the configured port.
# NOTE(review): this is Flask's development server — front it with a real
# WSGI server (gunicorn/uwsgi) before exposing it publicly.
if __name__=='__main__':
    app.run(host="0.0.0.0", port=port)
35 changes: 35 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"Application":{
"port":"9900",
"from":"",
"to":""
},
"Redis":{
"address":"",
"password":"",
"port":"",
"prefix":""
},
"DB":"",
"Mysql":{
"address":"",
"user":"",
"password":"",
"port":"",
"db":""
},
"Sqlite":{
"path":"",
"filename":"",
"sizelimit":""
},
"Model":{
"model_path":"iryneko571/mt5-translation-ja_zh-game-small",
"repetition_penalty":1.4,
"batch_size":64,
"max_length":256
},
"Translation":{
"lang":"<-ja2zh->"
}
}
60 changes: 60 additions & 0 deletions interface.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Upload and Send JSON File</title>
</head>
<body>

<h2>Upload a JSON File</h2>

<!-- picker posts the chosen JSON file to /translate/json -->
<input type="file" id="fileInput" accept=".json">
<p id="statusMessage"></p>
<p id="fileid" style="display:none"></p>
<!-- hidden anchor used to trigger the client-side download of the result -->
<a id="downloadAnchorElem" style="display:none"></a>

<script>
// Read the selected file as text and forward its contents to the server.
document.getElementById('fileInput').addEventListener('change', function(event) {
    const file = event.target.files[0];

    // NOTE(review): file.type relies on the browser/OS MIME mapping for
    // .json files — confirm it is populated on all target platforms.
    if (file && file.type === "application/json") {
        const reader = new FileReader();

        reader.onload = function(e) {
            const content = e.target.result;
            sendJsonToServer(content);
        };

        reader.readAsText(file);
    } else {
        alert("Please upload a valid JSON file.");
    }
});

// POST the raw JSON body to the translation endpoint, then offer the
// translated JSON back to the user as a downloadable file.
function sendJsonToServer(jsonContent) {
    fetch('/translate/json', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: jsonContent
    })
    .then(response => response.json())
    .then(data => {
        document.getElementById('statusMessage').textContent = "File successfully sent!";
        //console.log("Response from server:", data);
        //document.getElementById('fileid').textContent = data;
        // encode the response as a data: URL and click the hidden anchor
        // to start the download without any server round-trip
        var dataStr = "data:text/json;charset=utf-8," + encodeURIComponent(JSON.stringify(data,null,2));
        var dlAnchorElem = document.getElementById('downloadAnchorElem');
        dlAnchorElem.setAttribute("href", dataStr );
        dlAnchorElem.setAttribute("download", "scene.json");
        dlAnchorElem.click();
    })
    .catch(error => {
        document.getElementById('statusMessage').textContent = "Error sending file.";
        console.error("Error:", error);
    });
}
</script>

</body>
</html>
106 changes: 106 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import re
import copy
from datetime import datetime
from tqdm.auto import tqdm
from datasets import Dataset

#model_path="iryneko571/mt5-translation-ja_zh-game-small"
'''
setup pipeline for translation
'''
# module-level translation pipeline; populated once by pipeinit()
pipe = None

def pipeinit(config):
    """Build the global translation pipeline from the "Model" config dict.

    Expects keys: model_path, repetition_penalty, batch_size, max_length.
    """
    global pipe
    start = datetime.now()
    print(start, "initiate model")
    pipe = pipeline(
        "translation",
        model=config["model_path"],
        tokenizer=config["model_path"],
        repetition_penalty=config["repetition_penalty"],  # cheap way to avoid repetition
        batch_size=config["batch_size"],  # a reference value — don't set it too high
        max_length=config["max_length"],
    )
    finished = datetime.now()
    print(finished, "init time", finished - start)

'''
text preprocess and post process
switch between different types of breaks
will add different stuff such as \t later
'''
# preprocess translatables: normalize every newline flavor to the literal
# two-character "\n" the model was trained on, remembering which flavor
# each string used so postprocess() can restore it.
def preprocess(batch):
    """Return a list of (normalized_text, tag) pairs for *batch*.

    Tags: "rn" = had CRLF, "nn" = already literal backslash-n,
    "n" = had bare LF, "s" = safe (no newlines).
    """
    tagged = []
    for text in batch:
        # order matters: a CRLF string also contains "\n"
        if "\r\n" in text:
            tagged.append((text.replace("\r\n", "\\n"), "rn"))
        elif "\\n" in text:
            tagged.append((text, "nn"))
        elif "\n" in text:
            tagged.append((text.replace("\n", "\\n"), "n"))
        else:
            tagged.append((text, "s"))
    return tagged

# restore each translated string to the newline flavor recorded by preprocess()
def postprocess(samples):
    """Map (translated_text, tag) pairs back to plain strings.

    Unknown tags produce None in the output (and an error message), matching
    the tags emitted by preprocess().
    """
    restored = []
    for text, tag in samples:
        if tag == "rn":
            restored.append(text.replace("\\n", "\r\n"))
        elif tag == "n":
            restored.append(text.replace("\\n", "\n"))
        elif tag in ("nn", "s"):
            restored.append(text)
        else:
            print(f"error determine the type of {text}")
            restored.append(None)
    return restored

'''
initial batch translation
will combine the batch and do the translation
'''
def liststream(list):
    """Lazily yield the indices 0..len-1 of the given sequence.

    NOTE(review): the parameter shadows the builtin `list`; kept as-is
    because the name is visible to callers.
    """
    yield from range(len(list))
def translate_batch(batch, lang='<-ja2zh->'):  # batch is an array of strings
    """Translate a list of strings through the global pipeline.

    Each input is newline-normalized by preprocess(), prefixed with the
    language tag, translated in a batched pass, then restored to its
    original newline flavor by postprocess().
    """
    # preprocess: (normalized_text, newline_tag) per input
    samples = preprocess(batch)
    # BUG FIX: the original built the prompt from the RAW batch[i], which
    # silently discarded preprocess()'s newline normalization (the unpacked
    # `t` was never used). Feed the normalized text to the model instead.
    trans_list = [f'{lang} {text}' for text, _tag in samples]
    # now translate
    global pipe
    datalist = Dataset.from_dict({"text": trans_list})
    translated = []
    for out in tqdm(pipe(KeyDataset(datalist, "text")), total=len(datalist)):
        # each `out` is a list of result dicts for one batch element
        for o in out:
            translated.append(o)
    # pair each translation with the newline tag of the matching input
    resultlist = [(translated[i]['translation_text'], samples[i][1])
                  for i in range(len(translated))]
    # postprocess back to the caller's newline convention
    return postprocess(resultlist)

0 comments on commit 7619477

Please sign in to comment.