From a5d48db960dfccb138c23432732110a6f62810fc Mon Sep 17 00:00:00 2001 From: BaiBlanc <1458491606@qq.com> Date: Mon, 31 Aug 2020 01:06:29 +0200 Subject: [PATCH] Fixing bugs --- gsoc/zheyuan/pipeline/batch_paraphrase.py | 10 ++++++++-- gsoc/zheyuan/pipeline/pipeline.sh | 9 +++++++-- gsoc/zheyuan/utility/GloVe/glove_finetune.py | 2 +- gsoc/zheyuan/utility/vocab_creator.py | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/gsoc/zheyuan/pipeline/batch_paraphrase.py b/gsoc/zheyuan/pipeline/batch_paraphrase.py index 0afe08a..d1c12a5 100644 --- a/gsoc/zheyuan/pipeline/batch_paraphrase.py +++ b/gsoc/zheyuan/pipeline/batch_paraphrase.py @@ -1,4 +1,5 @@ import argparse +import os import tensorflow as tf tf.compat.v1.enable_eager_execution() from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced @@ -13,8 +14,9 @@ def batch_paraphrase(templates_path, model_dir): folder_path = get_pretrained_model(const.URL) set_seed(42) tokenizer, device, model = prepare_model(folder_path) - with open(templates_path, "r") as lines: - with open(templates_path + "_paraphrased", "w") as w: + dir = os.path.realpath(templates_path) + with open(dir, "r") as lines: + with open(dir + "_paraphrased", "w") as w: for line in lines: prop = line.strip("\n").split(seperator) question = prop[3] @@ -22,6 +24,7 @@ def batch_paraphrase(templates_path, model_dir): paraphrased = pick_final_sentence(question, paraphrased_candidates) advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir) w.write(line) + print("Original", line) # for i, candidate in enumerate(paraphrased_candidates): # new_prop = prop[:-1] # new_prop[3] = candidate @@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir): new_prop.append("Paraphrased \n") new_line = seperator.join(new_prop) w.write(new_line) + print("Paraphrase", new_line) + new_prop = prop[:-1] new_prop[3] = advanced new_prop.append("Paraphrased advanced\n") new_line = seperator.join(new_prop) w.write(new_line) + print("Advanced", new_line) if __name__=="__main__": diff --git a/gsoc/zheyuan/pipeline/pipeline.sh b/gsoc/zheyuan/pipeline/pipeline.sh index d5b5e5b..051de28 100755 --- a/gsoc/zheyuan/pipeline/pipeline.sh +++ b/gsoc/zheyuan/pipeline/pipeline.sh @@ -58,8 +58,14 @@ fi # 1. Generate templates - python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' --project_name $1 --depth 1 --multi True + partr="../utility/part-r-00000" + if [ ! -d $partr ]; then + wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz + gzip -d part-r-00000.gz + fi + python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True +#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']' # 2. Batch Paraphrasing # 2.1 Download BERT-Classifier @@ -132,4 +138,3 @@ fi cd .. -fi \ No newline at end of file diff --git a/gsoc/zheyuan/utility/GloVe/glove_finetune.py b/gsoc/zheyuan/utility/GloVe/glove_finetune.py index 14a2629..54da0c3 100644 --- a/gsoc/zheyuan/utility/GloVe/glove_finetune.py +++ b/gsoc/zheyuan/utility/GloVe/glove_finetune.py @@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300): sentence = sentence.strip("\n") sentence = " " + sentence + " " for word in sentence.split(): - word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", "")) + word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?")) print(len(word_en), word_en[:20]) vocab_en = list(set(word_en) - set(["", ""])) diff --git a/gsoc/zheyuan/utility/vocab_creator.py b/gsoc/zheyuan/utility/vocab_creator.py index 4755f5a..2519853 100644 --- a/gsoc/zheyuan/utility/vocab_creator.py +++ b/gsoc/zheyuan/utility/vocab_creator.py @@ -39,7 +39,7 @@ def sparql_vocab(project_path): def add_s_tokens(path): with open(path+"/data.sparql", "r") as lines: - with open("./GloVe/GloVe-master/data_s.sparql", "w") as w: + with open(path+"/../../GloVe/data_s.sparql", "w") as w: for line in lines: new_line = " " + line.strip() + " \n" w.write(new_line)