Skip to content

Commit

Permalink
change
Browse files Browse the repository at this point in the history
  • Loading branch information
BrianBu47 committed Jun 18, 2024
1 parent 533bee0 commit 7619477
Show file tree
Hide file tree
Showing 4 changed files with 296 additions and 0 deletions.
95 changes: 95 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from flask import Flask, request, send_file
import main
from datetime import datetime
import json
import redis

app = Flask(__name__)

'''
configuration phase
'''
# load config
with open("config.json", "r") as f:
    configfile = json.load(f)

# model pipeline config: builds the global translation pipeline in `main`
main.pipeinit(configfile["Model"])

# redis config — caching is best-effort: if the connection fails we run
# without a cache rather than refusing to start.
redis_enabled = False
r = None
cache_prefix = None
try:
    r = redis.Redis(host=configfile["Redis"]["address"],
                    port=configfile["Redis"]["port"],
                    password=configfile["Redis"]["password"],
                    decode_responses=True
                    )
    cache_prefix = configfile["Redis"]["prefix"]
    # round-trip probe to verify the connection actually works
    r.set('foo', 'bar')
    r.get('foo')
    # BUG FIX: this flag was never set to True, so the cache branch in
    # auto_translate() was permanently dead code.
    redis_enabled = True
    print("redis initiated")
except Exception as e:  # narrow from bare `except:`; log the reason
    print("redis not connected", e)

# application port config (default 9900 when config leaves it blank)
port = "9900"
if configfile["Application"]["port"] != "":
    port = configfile["Application"]["port"]
# application from and to are not applicable here

# basic translation helper
def translate_one(data):
    """Translate a single string by delegating to main.translate_batch."""
    return main.translate_batch([data])[0]
'''
controllers
'''
# GET endpoint consumed by the XUnity auto-translator client.
@app.route("/translate", methods=["GET"])
def auto_translate():
    """Translate the `text` query parameter, serving from the Redis cache
    when available and populating it on a miss.

    Returns the translated string as the response body.
    """
    data = request.args.get('text')
    print(data)
    # BUG FIX: request.args.get returns None when the param is absent,
    # which previously crashed on `cache_prefix + data`.
    if data is None:
        return "missing 'text' parameter", 400
    # Check Redis cache first when a connection was established at startup.
    if redis_enabled:
        rd = r.get(cache_prefix + data)
        if rd is not None:
            print("cache hit", data, "=>", rd)
            return rd
        print("cache miss")
        start = datetime.now()  # renamed from `time` to avoid confusion with the stdlib module
        result = translate_one(data)
        print("time:", datetime.now() - start)
        if r.set(cache_prefix + data, result):
            print("cache in", data, "=>", result)
        return result
    # no cache available: translate directly
    start = datetime.now()
    result = translate_one(data)
    print("time:", datetime.now() - start)
    print(result)
    return result
# serves the static JSON-upload interface page
@app.route("/translate/upload", methods=["GET"])
def show_upload():
    """Return the contents of interface.html as the response body."""
    with open("interface.html", "r") as page:
        html = page.read()
    return html
# this translates a posted JSON object of {source: anything} -> {source: translation}
# NOTE(review): synchronous by design — acceptable at current single-machine capacity.
@app.route("/translate/json", methods=["POST"])
def json_translate():
    """Translate every key of the posted JSON object and send back a JSON
    file mapping each original key to its translation.
    """
    keys = list(request.json.keys())
    # renamed local from `r` to avoid shadowing the module-level Redis client
    translations = main.translate_batch(keys)
    result = dict(zip(keys, translations))
    # BUG FIX: the timestamped filename was computed but never used in the
    # file path, so every request clobbered the same file.
    filename = datetime.now().strftime("%y%m%d%H%M%S")
    path = f"tmp/{filename}.json"
    with open(path, "w") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    return send_file(path)
'''
start up application
'''
# Bind on all interfaces at the configured port.
# NOTE(review): this is Flask's development server — front it with a real
# WSGI server (gunicorn/uwsgi) before exposing it publicly.
if __name__=='__main__':
    app.run(host="0.0.0.0", port=port)
35 changes: 35 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"Application":{
"port":"9900",
"from":"",
"to":""
},
"Redis":{
"address":"",
"password":"",
"port":"",
"prefix":""
},
"DB":"",
"Mysql":{
"address":"",
"user":"",
"password":"",
"port":"",
"db":""
},
"Sqlite":{
"path":"",
"filename":"",
"sizelimit":""
},
"Model":{
"model_path":"iryneko571/mt5-translation-ja_zh-game-small",
"repetition_penalty":1.4,
"batch_size":64,
"max_length":256
},
"Translation":{
"lang":"<-ja2zh->"
}
}
60 changes: 60 additions & 0 deletions interface.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Upload and Send JSON File</title>
</head>
<body>

<h2>Upload a JSON File</h2>

<!-- picker posts the chosen JSON file to /translate/json -->
<input type="file" id="fileInput" accept=".json">
<p id="statusMessage"></p>
<p id="fileid" style="display:none"></p>
<!-- hidden anchor used to trigger the client-side download of the result -->
<a id="downloadAnchorElem" style="display:none"></a>

<script>
// Read the selected file as text and forward its contents to the server.
document.getElementById('fileInput').addEventListener('change', function(event) {
    const file = event.target.files[0];

    // NOTE(review): file.type relies on the browser/OS MIME mapping for
    // .json files — confirm it is populated on all target platforms.
    if (file && file.type === "application/json") {
        const reader = new FileReader();

        reader.onload = function(e) {
            const content = e.target.result;
            sendJsonToServer(content);
        };

        reader.readAsText(file);
    } else {
        alert("Please upload a valid JSON file.");
    }
});

// POST the raw JSON body to the translation endpoint, then offer the
// translated JSON back to the user as a downloadable file.
function sendJsonToServer(jsonContent) {
    fetch('/translate/json', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: jsonContent
    })
    .then(response => response.json())
    .then(data => {
        document.getElementById('statusMessage').textContent = "File successfully sent!";
        //console.log("Response from server:", data);
        //document.getElementById('fileid').textContent = data;
        // encode the response as a data: URL and click the hidden anchor
        // to start the download without any server round-trip
        var dataStr = "data:text/json;charset=utf-8," + encodeURIComponent(JSON.stringify(data,null,2));
        var dlAnchorElem = document.getElementById('downloadAnchorElem');
        dlAnchorElem.setAttribute("href", dataStr );
        dlAnchorElem.setAttribute("download", "scene.json");
        dlAnchorElem.click();
    })
    .catch(error => {
        document.getElementById('statusMessage').textContent = "Error sending file.";
        console.error("Error:", error);
    });
}
</script>

</body>
</html>
106 changes: 106 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import re
import copy
from datetime import datetime
from tqdm.auto import tqdm
from datasets import Dataset

#model_path="iryneko571/mt5-translation-ja_zh-game-small"
'''
setup pipeline for translation
'''
# module-level translation pipeline; populated once by pipeinit()
pipe = None

def pipeinit(config):
    """Build the global translation pipeline from the "Model" config dict.

    Expects keys: model_path, repetition_penalty, batch_size, max_length.
    """
    global pipe
    start = datetime.now()
    print(start, "initiate model")
    pipe = pipeline(
        "translation",
        model=config["model_path"],
        tokenizer=config["model_path"],
        repetition_penalty=config["repetition_penalty"],  # cheap way to avoid repetition
        batch_size=config["batch_size"],  # a reference value — don't set it too high
        max_length=config["max_length"],
    )
    finished = datetime.now()
    print(finished, "init time", finished - start)

'''
text preprocess and post process
switch between different types of breaks
will add different stuff such as \t later
'''
# preprocess translatables: normalize every newline flavor to the literal
# two-character "\n" the model was trained on, remembering which flavor
# each string used so postprocess() can restore it.
def preprocess(batch):
    """Return a list of (normalized_text, tag) pairs for *batch*.

    Tags: "rn" = had CRLF, "nn" = already literal backslash-n,
    "n" = had bare LF, "s" = safe (no newlines).
    """
    tagged = []
    for text in batch:
        # order matters: a CRLF string also contains "\n"
        if "\r\n" in text:
            tagged.append((text.replace("\r\n", "\\n"), "rn"))
        elif "\\n" in text:
            tagged.append((text, "nn"))
        elif "\n" in text:
            tagged.append((text.replace("\n", "\\n"), "n"))
        else:
            tagged.append((text, "s"))
    return tagged

# restore each translated string to the newline flavor recorded by preprocess()
def postprocess(samples):
    """Map (translated_text, tag) pairs back to plain strings.

    Unknown tags produce None in the output (and an error message), matching
    the tags emitted by preprocess().
    """
    restored = []
    for text, tag in samples:
        if tag == "rn":
            restored.append(text.replace("\\n", "\r\n"))
        elif tag == "n":
            restored.append(text.replace("\\n", "\n"))
        elif tag in ("nn", "s"):
            restored.append(text)
        else:
            print(f"error determine the type of {text}")
            restored.append(None)
    return restored

'''
initial batch translation
will combine the batch and do the translation
'''
def liststream(list):
    """Lazily yield the indices 0..len-1 of the given sequence.

    NOTE(review): the parameter shadows the builtin `list`; kept as-is
    because the name is visible to callers.
    """
    yield from range(len(list))
def translate_batch(batch, lang='<-ja2zh->'):  # batch is an array of strings
    """Translate a list of strings through the global pipeline.

    Each input is newline-normalized by preprocess(), prefixed with the
    language tag, translated in a batched pass, then restored to its
    original newline flavor by postprocess().
    """
    # preprocess: (normalized_text, newline_tag) per input
    samples = preprocess(batch)
    # BUG FIX: the original built the prompt from the RAW batch[i], which
    # silently discarded preprocess()'s newline normalization (the unpacked
    # `t` was never used). Feed the normalized text to the model instead.
    trans_list = [f'{lang} {text}' for text, _tag in samples]
    # now translate
    global pipe
    datalist = Dataset.from_dict({"text": trans_list})
    translated = []
    for out in tqdm(pipe(KeyDataset(datalist, "text")), total=len(datalist)):
        # each `out` is a list of result dicts for one batch element
        for o in out:
            translated.append(o)
    # pair each translation with the newline tag of the matching input
    resultlist = [(translated[i]['translation_text'], samples[i][1])
                  for i in range(len(translated))]
    # postprocess back to the caller's newline convention
    return postprocess(resultlist)

0 comments on commit 7619477

Please sign in to comment.