From a5d48db960dfccb138c23432732110a6f62810fc Mon Sep 17 00:00:00 2001
From: BaiBlanc <1458491606@qq.com>
Date: Mon, 31 Aug 2020 01:06:29 +0200
Subject: [PATCH] Fix paraphrase output paths, trim the template label list, and correct GloVe vocab file paths
---
gsoc/zheyuan/pipeline/batch_paraphrase.py | 10 ++++++++--
gsoc/zheyuan/pipeline/pipeline.sh | 9 +++++++--
gsoc/zheyuan/utility/GloVe/glove_finetune.py | 2 +-
gsoc/zheyuan/utility/vocab_creator.py | 2 +-
4 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py
index 0afe08a..d1c12a5 100644
--- a/gsoc/zheyuan/pipeline/batch_paraphrase.py
+++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py
@@ -1,4 +1,5 @@
import argparse
+import os
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
@@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir):
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
- with open(templates_path, "r") as lines:
- with open(templates_path + "_paraphrased", "w") as w:
+ resolved_path = os.path.realpath(templates_path)
+ with open(resolved_path, "r") as lines:
+ with open(resolved_path + "_paraphrased", "w") as w:
for line in lines:
prop = line.strip("\n").split(seperator)
question = prop[3]
@@ -22,6 +24,7 @@ def batch_paraphrase(templates_path, model_dir):
paraphrased = pick_final_sentence(question, paraphrased_candidates)
advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
w.write(line)
+ print("Original", line)
# for i, candidate in enumerate(paraphrased_candidates):
# new_prop = prop[:-1]
# new_prop[3] = candidate
@@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir):
new_prop.append("Paraphrased \n")
new_line = seperator.join(new_prop)
w.write(new_line)
+ print("Paraphrase", new_line)
+
new_prop = prop[:-1]
new_prop[3] = advanced
new_prop.append("Paraphrased advanced\n")
new_line = seperator.join(new_prop)
w.write(new_line)
+ print("Advanced", new_line)
if __name__=="__main__":
diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh
index d5b5e5b..051de28 100755
--- a/gsoc/zheyuan/pipeline/pipeline.sh
+++ b/gsoc/zheyuan/pipeline/pipeline.sh
@@ -58,8 +58,14 @@ fi
# 1. Generate templates
- python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' --project_name $1 --depth 1 --multi True
+ partr="../utility/part-r-00000"
+ if [ ! -f "$partr" ]; then
+ wget -P ../utility https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz
+ gzip -d ../utility/part-r-00000.gz
+ fi
+ python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
+#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
# 2. Batch Paraphrasing
# 2.1 Download BERT-Classifier
@@ -132,4 +138,3 @@ fi
cd ..
-fi
\ No newline at end of file
diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
index 14a2629..54da0c3 100644
--- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py
+++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py
@@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
sentence = sentence.strip("\n")
sentence = " " + sentence + " "
for word in sentence.split():
- word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
+ word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
print(len(word_en), word_en[:20])
vocab_en = list(set(word_en) - set(["", ""]))
diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py
index 4755f5a..2519853 100644
--- a/gsoc/zheyuan/utility/vocab_creator.py
+++ b/gsoc/zheyuan/utility/vocab_creator.py
@@ -39,7 +39,7 @@ def sparql_vocab(project_path):
def add_s_tokens(path):
with open(path+"/data.sparql", "r") as lines:
- with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
+ with open(path+"/../../GloVe/data_s.sparql", "w") as w:
for line in lines:
new_line = " " + line.strip() + " \n"
w.write(new_line)