From a5d48db960dfccb138c23432732110a6f62810fc Mon Sep 17 00:00:00 2001
From: BaiBlanc <1458491606@qq.com>
Date: Mon, 31 Aug 2020 01:06:29 +0200
Subject: [PATCH] Fix pipeline bugs: template path, part-r-00000 download, data_s.sparql location

---
 gsoc/zheyuan/pipeline/batch_paraphrase.py    | 10 ++++++++--
 gsoc/zheyuan/pipeline/pipeline.sh            |  9 +++++++--
 gsoc/zheyuan/utility/GloVe/glove_finetune.py |  2 +-
 gsoc/zheyuan/utility/vocab_creator.py        |  2 +-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py
index 0afe08a..d1c12a5 100644
--- a/gsoc/zheyuan/pipeline/batch_paraphrase.py
+++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py
@@ -1,4 +1,5 @@
 import argparse
+import os
 import tensorflow as tf
 tf.compat.v1.enable_eager_execution()
 from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
@@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir):
     folder_path = get_pretrained_model(const.URL)
     set_seed(42)
     tokenizer, device, model = prepare_model(folder_path)
-    with open(templates_path, "r") as lines:
-        with open(templates_path + "_paraphrased", "w") as w:
+    templates_file = os.path.realpath(templates_path)  # canonical path; named so it does not shadow builtin dir()
+    with open(templates_file, "r") as lines:
+        with open(templates_file + "_paraphrased", "w") as w:
             for line in lines:
                 prop = line.strip("\n").split(seperator)
                 question = prop[3]
@@ -22,6 +24,7 @@ def batch_paraphrase(templates_path, model_dir):
                 paraphrased = pick_final_sentence(question, paraphrased_candidates)
                 advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
                 w.write(line)
+                print("Original", line)
                 # for i, candidate in enumerate(paraphrased_candidates):
                 #     new_prop = prop[:-1]
                 #     new_prop[3] = candidate
@@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir):
                 new_prop.append("Paraphrased \n")
                 new_line = seperator.join(new_prop)
                 w.write(new_line)
+                print("Paraphrase", new_line)
+
                 new_prop = prop[:-1]
                 new_prop[3] = advanced
                 new_prop.append("Paraphrased advanced\n")
                 new_line = seperator.join(new_prop)
                 w.write(new_line)
+                print("Advanced", new_line)
 
 
 if __name__=="__main__":
diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh
index d5b5e5b..051de28 100755
--- a/gsoc/zheyuan/pipeline/pipeline.sh
+++ b/gsoc/zheyuan/pipeline/pipeline.sh
@@ -58,8 +58,14 @@ fi
 
 
     # 1. Generate templates
-    python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' --project_name $1 --depth 1 --multi True
+    partr="../utility/part-r-00000"
 
+    if [ ! -f "$partr" ]; then  # part-r-00000 is a regular file, so test with -f (not -d)
+      wget -O "$partr.gz" https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz  # save next to the path the guard checks
+      gzip -d "$partr.gz"
+    fi
+    python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
+#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
     # 2. Batch Paraphrasing
     # 2.1 Download BERT-Classifier
 
@@ -132,4 +138,3 @@ fi
     cd ..
 
 
-fi
\ No newline at end of file
diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
index 14a2629..54da0c3 100644
--- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py
+++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
@@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
             sentence = sentence.strip("\n")
             sentence = "<s> " + sentence + " </s>"
             for word in sentence.split():
-                word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
+                word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
     print(len(word_en), word_en[:20])
 
     vocab_en = list(set(word_en) - set(["<s>", "</s>"]))
diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py
index 4755f5a..2519853 100644
--- a/gsoc/zheyuan/utility/vocab_creator.py
+++ b/gsoc/zheyuan/utility/vocab_creator.py
@@ -39,7 +39,7 @@ def sparql_vocab(project_path):
 
 def add_s_tokens(path):
     with open(path+"/data.sparql", "r") as lines:
-        with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
+        with open(path+"/../../GloVe/data_s.sparql", "w") as w:
             for line in lines:
                 new_line = "<s> " + line.strip() + " </s>\n"
                 w.write(new_line)