Skip to content

Commit

Permalink
Merge pull request #36 from BaiBlanc/master
Browse files Browse the repository at this point in the history
Bugs Fixed and optional parameters added
  • Loading branch information
BaiBlanc authored Aug 31, 2020
2 parents d795771 + a5d48db commit 669e427
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 19 deletions.
10 changes: 8 additions & 2 deletions generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
'dbo:Athlete': ['dbo:LacrossePlayer'],
'dbo:SportsTeam': ['dboBasketballTeam']
}
EXAMPLES_PER_TEMPLATE = 600

# EXAMPLES_PER_TEMPLATE = 600

def extract_bindings(data, template):
matches = list()
Expand Down Expand Up @@ -316,12 +316,18 @@ def normalize(ontology_class):
metavar='templateFile', help='templates', required=True)
requiredNamed.add_argument(
'--output', dest='output', metavar='outputDirectory', help='dataset directory', required=True)
requiredNamed.add_argument(
'--examples', dest='examples', metavar='examples per template', help='dataset directory', required=False)
args = parser.parse_args()

template_file = args.templates
output_dir = args.output
use_resources_dump = args.continue_generation

examples = args.examples
if examples:
EXAMPLES_PER_TEMPLATE = int(examples)
else:
EXAMPLES_PER_TEMPLATE = 600
# print use_resources_dump => False

time = datetime.datetime.today()
Expand Down
Empty file removed gsoc/zheyuan/README.md
Empty file.
7 changes: 4 additions & 3 deletions gsoc/zheyuan/pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
To run the complete pipeline, please use the command:

```bash
./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer]
./pipeline.sh [$1 Project's name] [$2 Integer] [$3 Integer] [$4 Integer] [$5 Integer]
```
$1 -- The project's name -- String -- Required
$2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
$3 -- Number of units in the LSTM cells -- Integer -- Optional, 512 by default

$4 -- Training steps -- Integer -- Optional, 60000 by default
$5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default
Examples

```bash
./pipeline.sh Project1
```
```bash
./pipeline.sh Project2 300 512
./pipeline.sh Project2 300 512 60000 600
```


Expand Down
10 changes: 8 additions & 2 deletions gsoc/zheyuan/pipeline/batch_paraphrase.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import os
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
from paraphrase_questions import paraphrase_questions,get_pretrained_model,prepare_model,set_seed,pick_final_sentence, pick_final_sentence_advanced
Expand All @@ -13,15 +14,17 @@ def batch_paraphrase(templates_path, model_dir):
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
with open(templates_path, "r") as lines:
with open(templates_path + "_paraphrased", "w") as w:
dir = os.path.realpath(templates_path)
with open(dir, "r") as lines:
with open(dir + "_paraphrased", "w") as w:
for line in lines:
prop = line.strip("\n").split(seperator)
question = prop[3]
paraphrased_candidates = paraphrase_questions(tokenizer, device, model, question)
paraphrased = pick_final_sentence(question, paraphrased_candidates)
advanced = pick_final_sentence_advanced(device, question, paraphrased_candidates, model_dir)
w.write(line)
print("Original", line)
# for i, candidate in enumerate(paraphrased_candidates):
# new_prop = prop[:-1]
# new_prop[3] = candidate
Expand All @@ -35,11 +38,14 @@ def batch_paraphrase(templates_path, model_dir):
new_prop.append("Paraphrased \n")
new_line = seperator.join(new_prop)
w.write(new_line)
print("Paraphrase", new_line)

new_prop = prop[:-1]
new_prop[3] = advanced
new_prop.append("Paraphrased advanced\n")
new_line = seperator.join(new_prop)
w.write(new_line)
print("Advanced", new_line)


if __name__=="__main__":
Expand Down
45 changes: 37 additions & 8 deletions gsoc/zheyuan/pipeline/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
# $1 -- The project's name -- String -- Required
# $2 -- Dimension of the GloVe embeddings -- Integer [50|100|200|300] -- Optional, 300 by default
# $3 -- Number of units in the LSTM cells -- Integer -- Optional, 512 by default
# $4 -- Training steps -- Integer -- Optional, 60000 by default
# $5 -- EXAMPLES_PER_TEMPLATE -- Integer -- Optional, 600 by default

if [ ! -n "$1" ] ;then
echo "you have not input a project name!"
else
echo "The project name will be set to $1"
fi
if [ ! -n "$2" ] ;then
dimension=300
elif [[ ! $2 =~ ^[0-9]*$ ]]; then
Expand All @@ -27,18 +30,42 @@ else
fi
if [ ! -n "$3" ] ;then
num_units=512
elif [[ ! $2 =~ ^[0-9]*$ ]]; then
elif [[ ! $3 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=512 recommended ] to the third parameter to set the number of units of LSTM cells"

else
num_units=$3
echo "The number of units of LSTM cells is set to $num_units"
fi
if [ ! -n "$4" ] ;then
training_steps=60000
elif [[ ! $4 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=60000 recommended ] to the fourth parameter to set the number of training steps for Learner"

else
training_steps=$4
echo "The number of training steps for Learner is set to $training_steps"
fi
if [ ! -n "$5" ] ;then
examples_per_template=600
elif [[ ! $5 =~ ^[0-9]*$ ]]; then
echo "Please enter an integer [ >=600 recommended ] to the fifth parameter to set the number of examples per template"

else
examples_per_template=$5
echo "The number of examples per template is set to $examples_per_template"
fi


# 1. Generate templates
python multi_generate_templates.py --label '['Colour', 'Organisation', 'Person', 'Software', 'Artwork', 'Place', 'Work', 'Bird']' --project_name $1 --depth 1 --multi True
partr="../utility/part-r-00000"

if [ ! -d $partr ]; then
wget https://s3.amazonaws.com/subjectiveEye/0.9/subjectiveEye3D/part-r-00000.gz
gzip -d part-r-00000.gz
fi
python multi_generate_templates.py --label '['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'AnatomicalStructure', 'Device', 'TimePeriod', 'Activity']' --project_name $1 --depth 1 --multi True
#'['Agent', 'Place', 'Work', 'Species', 'TopicalConcept', 'MeanOfTransportation', 'Event', 'Algorithm', 'Altitude', 'AnatomicalStructure', 'Area', 'Award', 'Biomolecule', 'Blazon', 'Browser', 'ChartsPlacements', 'ChemicalSubstance', 'Cipher', 'Colour', 'Currency', 'Demographics', 'Depth', 'Device', 'Diploma', 'Disease', 'ElectionDiagram', 'ElectricalSubstation', 'EthnicGroup', 'FileSystem', 'Flag', 'Food', 'GeneLocation', 'GrossDomesticProduct', 'Holiday', 'Identifier', 'Language', 'List', 'Media', 'MedicalSpecialty', 'Medicine', 'Name', 'PersonFunction', 'Population', 'Protocol', 'PublicService', 'Relationship', 'PersonFunction', 'SportsSeason', 'Spreadsheet', 'StarCluster', 'Statistic', 'Tank', 'TimePeriod', 'UnitOfWork', 'Unknown']'
# 2. Batch Paraphrasing
# 2.1 Download BERT-Classifier

Expand All @@ -58,7 +85,7 @@ fi
cd ../../../ # [neural-qa]/gsoc/

mkdir ./data/$1
python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1
python generator.py --templates ./gsoc/zheyuan/pipeline/$1/basic_sentence_and_template_generator_paraphrased --output ./data/$1 --examples $examples_per_template
# 3.2 Generate vocab (simple tokenizing and normalization)
cd ./gsoc/zheyuan/utility # [neural-qa]/gsoc/zheyuan/utility
python vocab_creator.py --path ../../../data/$1
Expand All @@ -69,7 +96,7 @@ fi
if [ ! -d ./GloVe/glove.6B ]; then
curl --output ./GloVe/glove.6B.zip http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

unzip ./GloVe/glove.6B.zip -d ./Glove/glove.6B
unzip ./GloVe/glove.6B.zip -d ./GloVe/glove.6B

else
ls ./GloVe/glove.6b
Expand All @@ -79,14 +106,17 @@ fi
cd ./GloVe
python glove_finetune.py --path ../../../../data/$1
cd ../../../../GloVe
if [ "$(uname)"=="Darwin" ]; then
if [ "$(uname)" == "Darwin" ]; then
# Mac OS X
echo "This is a Mac OSX environment"
sed -i "" "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "" "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "" "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
sed -i "" "s/VOCAB_MIN_COUNT=.*/VOCAB_MIN_COUNT=1/" demo.sh
elif [ "$(expr substr $(uname -s) 1 5)"=="Linux" ]; then
elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then

# GNU/Linux
echo "This is a Linux environment"
sed -i "s/CORPUS=.*/CORPUS=data_s.sparql/" demo.sh
sed -i "s/SAVE_FILE=.*/SAVE_FILE=embed/" demo.sh
sed -i "s/VECTOR_SIZE=.*/VECTOR_SIZE=$dimension/" demo.sh
Expand All @@ -104,8 +134,7 @@ fi
cd ../../
# 4.2 Training with embedding
cd nmt
python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=60000 --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
python -m nmt.nmt --src=en --tgt=sparql --embed_prefix=../data/$1/embed --vocab_prefix=../data/$1/vocab --dev_prefix=../data/$1/dev --test_prefix=../data/$1/test --train_prefix=../data/$1/train --out_dir=../data/$1"_"$dimension"d_model" --num_train_steps=$training_steps --steps_per_stats=100 --num_layers=2 --num_units=$num_units --dropout=0.2 --metrics=bleu,accuracy
cd ..


fi
4 changes: 2 additions & 2 deletions gsoc/zheyuan/utility/GloVe/glove_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
sentence = sentence.strip("\n")
sentence = "<s> " + sentence + " </s>"
for word in sentence.split():
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?").replace("i̇", ""))
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))
print(len(word_en), word_en[:20])

vocab_en = list(set(word_en) - set(["<s>", "</s>"]))
Expand All @@ -67,7 +67,7 @@ def finetune_glove(project_path, glove_path="glove.6B.300d.txt", dimension=300):
finetune_glove = batch_finetune(finetune_glove, word_split, dimension)
start = end
end = start + stride
finetune_glove = batch_finetune(finetune_glove, word_en[start:])
finetune_glove = batch_finetune(finetune_glove, word_en[start:], dimension)
unk = calculate_unknown(finetune_glove, dimension)
finetune_glove["<UNK>"] = unk
with open(project_path+"/embed.en", "w") as w:
Expand Down
7 changes: 5 additions & 2 deletions gsoc/zheyuan/utility/vocab_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ def english_vocab(project_path):
word_en.append(word.strip(":").strip("\"").strip("»").strip("+").strip("?"))

vocab_en = list(set(word_en))
vocab_en.remove("")
try:
vocab_en.remove("")
except:
print("There is no \'\' in vocab_en")
with open(project_path+"/vocab.en", "w") as w:
for vocab in vocab_en:

Expand All @@ -36,7 +39,7 @@ def sparql_vocab(project_path):

def add_s_tokens(path):
with open(path+"/data.sparql", "r") as lines:
with open("./GloVe/GloVe-master/data_s.sparql", "w") as w:
with open(path+"/../../GloVe/data_s.sparql", "w") as w:
for line in lines:
new_line = "<s> " + line.strip() + " </s>\n"
w.write(new_line)
Expand Down

0 comments on commit 669e427

Please sign in to comment.