# Reconstructed from a whitespace-collapsed patch. Original patch header:
# diff --git a/app.py b/app.py  (new file mode 100644, index 0000000..d300544)
from flask import Flask, request, send_file
import main
from datetime import datetime
import json
import redis

app = Flask(__name__)
'''
configuration phase
'''
# load config
with open("config.json", "r", encoding="utf-8") as f:
    configfile = json.load(f)

# model pipeline config
main.pipeinit(configfile["Model"])

# redis config
# The cache is optional: if the connection probe fails for any reason the
# service keeps running without it (redis_enabled stays False).
redis_enabled = False
r = None
cache_prefix = None
try:
    redis_config = configfile["Redis"]
    r = redis.Redis(
        host=redis_config["address"],
        port=redis_config["port"],
        password=redis_config["password"],
        decode_responses=True,
    )
    cache_prefix = redis_config["prefix"]
    # Probe the connection without writing throw-away keys into the cache.
    r.ping()
    # Only switch the cache on once the probe succeeded; previously this flag
    # was never set, so the cache branch of /translate could not run.
    redis_enabled = True
    print("redis initiated")
except Exception as err:  # best-effort start-up: report the reason and go on
    print("redis not connected", err)

# application port config
port = "9900"
configured_port = configfile["Application"]["port"]
if configured_port != "":
    port = configured_port
# application from and to are not applicable here


# basic Translation
def translate_one(data):
    """Translate one string through main.translate_batch and return the result."""
    return main.translate_batch([data])[0]


'''
controllers
'''


# this works for the xunity autotranslator, the normal bunch
@app.route("/translate", methods=["GET"])
def auto_translate():
    """Translate the ``text`` query parameter and return the translation.

    When the redis cache is enabled, the cache is consulted first under the
    key ``cache_prefix + text`` and a freshly computed translation is stored
    under the same key. A request without ``text`` is answered with HTTP 400.
    """
    data = request.args.get('text')
    print(data)
    if data is None:
        # Without this guard the handler would fail on string operations below.
        return "missing required query parameter: text", 400

    # Check Redis config
    if redis_enabled:
        cache_key = cache_prefix + data
        cached = r.get(cache_key)
        if cached is not None:
            print("cache hit", data, "=>", cached)
            return cached
        print("cache miss")

    started = datetime.now()
    result = translate_one(data)
    print("time:", datetime.now() - started)

    if redis_enabled:
        if r.set(cache_key, result):
            print("cache in", data, "=>", result)
    else:
        print(result)
    return result


# this opens the upload interface
@app.route("/translate/upload", methods=["GET"])
def show_upload():
    """Return the contents of interface.html as the response body."""
    with open("interface.html", "r", encoding="utf-8") as f:
        return f.read()
# this translates json
# NOTE: this handler is not async, which should be a problem under load, but
# capacity wise there are not enough machines to do better right now.
@app.route("/translate/json", methods=["POST"])
def json_translate():
    """Translate every key of the posted JSON object and send back a file.

    The request body must be a JSON object. Its keys are translated as one
    batch; the response is a JSON file mapping each original key to its
    translation. A body that is not a JSON object is answered with HTTP 400.
    """
    import os  # local import: only this handler needs filesystem helpers

    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return "request body must be a JSON object", 400

    keys = list(payload)
    translations = main.translate_batch(keys)
    result = dict(zip(keys, translations))

    # Use one absolute path for both writing and sending, so the two steps
    # cannot disagree about what a relative "tmp/..." refers to.
    out_dir = os.path.abspath("tmp")
    os.makedirs(out_dir, exist_ok=True)
    filename = datetime.now().strftime("%y%m%d%H%M%S")
    out_path = os.path.join(out_dir, f"{filename}.json")
    # ensure_ascii=False writes non-ASCII text as-is, so the file encoding must
    # be explicit instead of depending on the platform default.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    return send_file(out_path)


'''
start up application
'''
if __name__ == '__main__':
    # The configured port is stored as a string; convert it once here.
    app.run(host="0.0.0.0", port=int(port))

# ---------------------------------------------------------------------------
# Remaining patch content that shared this line (not Python), preserved as
# comments:
#
# \ No newline at end of file
# diff --git a/config.json b/config.json
# new file mode 100644
# index 0000000..f2952d4
# --- /dev/null
# +++ b/config.json
# @@ -0,0 +1,35 @@
# +{
# +  "Application":{
# +    "port":"9900",
# +    "from":"",
# +    "to":""
# +  },
# +  "Redis":{
# +    "address":"",
# +    "password":"",
# +    "port":"",
# +    "prefix":""
# +  },
# +  "DB":"",
# +  "Mysql":{
# +    "address":"",
# +    "user":"",
# +    "password":"",
# +    "port":"",
# +    "db":""
# +  },
# +  "Sqlite":{
# +    "path":"",
# +    "filename":"",
# +    "sizelimit":""
# +  },
# +  "Model":{
# +    "model_path":"iryneko571/mt5-translation-ja_zh-game-small",
# +    "repetition_penalty":1.4,
# +    "batch_size":64,
# +    "max_length":256
# +  },
# +  "Translation":{
# +    "lang":"<-ja2zh->"
# +  }
# +}
# \ No newline at end of file
# diff --git a/interface.html b/interface.html
# new file mode 100644
# index 0000000..66fc061
# --- /dev/null
# +++ b/interface.html
# @@ -0,0 +1,60 @@
# (HTML markup was stripped in this copy; surviving title text:
#  "Upload and Send JSON File")

Upload a JSON File

+ + +

# (tail of interface.html: markup was stripped in this copy)
# \ No newline at end of file
# Original patch header:
# diff --git a/main.py b/main.py  (new file mode 100644, index 0000000..cd9a7ee)
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import re
import copy
from datetime import datetime
from tqdm.auto import tqdm
from datasets import Dataset

#model_path="iryneko571/mt5-translation-ja_zh-game-small"
'''
setup pipeline for translation
'''
pipe = None


def pipeinit(config):
    """Create the module-level translation pipeline from *config*.

    *config* must provide the keys ``model_path``, ``repetition_penalty``,
    ``batch_size`` and ``max_length``. The start time and the elapsed
    initialisation time are printed.
    """
    global pipe
    then = datetime.now()
    print(then, "initiate model")
    pipe = pipeline(
        "translation",
        model=config["model_path"],
        tokenizer=config["model_path"],
        repetition_penalty=config["repetition_penalty"],  # just avoid repeating in a cheap way
        batch_size=config["batch_size"],  # just a reference don't set it too high
        max_length=config["max_length"],
    )
    now = datetime.now()
    print(now, "init time", now - then)


'''
text preprocess and post process
switch between different types of breaks
will add different stuff such as \\t later
'''


# preprocess translatables
def preprocess(batch):
    """Normalise line breaks and remember which style each string used.

    Returns a list of ``(text, tag)`` tuples, one per input string:

    * ``"rn"`` - contained ``\\r\\n``; each one replaced by a literal
      backslash-n pair
    * ``"nn"`` - already contained a literal backslash-n pair; unchanged
    * ``"n"``  - contained a newline character; each one replaced by a
      literal backslash-n pair
    * ``"s"``  - none of the above ("safe"); unchanged

    The checks run in that order and the first match decides the tag.
    """
    samples = []
    for text in batch:
        if "\r\n" in text:
            samples.append((text.replace("\r\n", "\\n"), "rn"))  # rn for old type
        elif "\\n" in text:
            samples.append((text, "nn"))  # nn for two slash
        elif "\n" in text:
            samples.append((text.replace("\n", "\\n"), "n"))  # n for one slash
        else:
            samples.append((text, "s"))  # s for safe
    return samples


# process translated back to original format
def postprocess(samples):
    """Restore the original line-break style of each ``(text, tag)`` tuple.

    Tags are the ones produced by ``preprocess``. A tuple with an unknown tag
    is reported with a printed message and its text is passed through
    unchanged, so the returned list never contains ``None`` placeholders.
    """
    batch = []
    for text, tag in samples:
        if tag == "rn":
            batch.append(text.replace("\\n", "\r\n"))
        elif tag == "n":
            batch.append(text.replace("\\n", "\n"))
        elif tag in ("nn", "s"):
            batch.append(text)
        else:
            print(f"error determine the type of {text}")
            batch.append(text)
    return batch


# ---- batch translation: combine the batch and run the translation ----
def liststream(list):
    """Yield the index of every element of *list*: 0, 1, ..., len(list) - 1."""
    # NOTE: the parameter name shadows the builtin; it is kept unchanged so
    # that existing keyword callers keep working.
    yield from range(len(list))


def translate_batch(batch, lang='<-ja2zh->'):  # batch is an array of string
    """Translate a list of strings and return the translations in order.

    Each string is run through ``preprocess``, prefixed with *lang*,
    translated by the module-level ``pipe``, and passed through
    ``postprocess`` together with the line-break tag recorded for it.

    Args:
        batch: list of strings to translate.
        lang: prefix placed in front of every string sent to the pipeline.

    Returns:
        A list of translated strings; an empty list for an empty *batch*.

    Raises:
        RuntimeError: if the pipeline has not been created yet.
    """
    if not batch:
        return []
    if pipe is None:
        raise RuntimeError(
            "translation pipeline is not initialised; call pipeinit() first"
        )

    # preprocess
    samples = preprocess(batch)
    # format translist: send the *preprocessed* text, otherwise the line-break
    # normalisation never reaches the model and postprocess has nothing to undo
    trans_list = [f'{lang} {text}' for text, _ in samples]

    # now translate
    datalist = Dataset.from_dict({"text": trans_list})
    translated = []
    for out in tqdm(pipe(KeyDataset(datalist, "text")), total=len(datalist)):
        translated.extend(out)

    # format result: pair every translation with the tag of its source string
    resultlist = [
        (item['translation_text'], tag)
        for item, (_, tag) in zip(translated, samples)
    ]
    # postprocess and return results
    return postprocess(resultlist)