-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.py
156 lines (135 loc) · 5.08 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
import xml.etree.ElementTree as ET
# --- Input data ---------------------------------------------------------------
# The folder that contains the dataset directories to read in.
DATASETS_FOLDER = "datasets"
# The name of the file that contains the data to read in.
FORMATTED_DATA_FILENAME = "true_data.json"
EVALUATION_RESULTS_FILENAME = "evaluation.json"

# Dataset names; the folders in DATASETS_FOLDER should be named like the
# entries here.
DATASETS = [
    "ETPC",
    "SAv2",
    "TURL",
    "MPC",
    "QQP",
    "ParaNMT",
    "APH",
    "APT",
    "PAWSWiki",
    "ParaSCI",
    "MSCOCO",
    "SaR",
]
# The two lists below split DATASETS into two disjoint groups; together they
# contain every entry of DATASETS exactly once.
MACHINE_PARAPHRASED_DATASETS = [
    "SAv2",
    "MPC",
    "ParaNMT",
    "APT",
    "PAWSWiki",
]
HUMAN_PARAPHRASED_DATASETS = [
    "ETPC",
    "TURL",
    "QQP",
    "APH",
    "ParaSCI",
    "MSCOCO",
    "SaR",
]

# --- Output layout ------------------------------------------------------------
# The directory to output the formatted files in.
OUT_DIR = "output"
FIGURES_FOLDER = "figures"
EMBEDDINGS_FOLDER = "embeddings"
EXAMPLES_FOLDER = "examples"
DETECTION_FOLDER = "detection"
GRIDSEARCH_FOLDER = "gridsearch"
EVALUATION_FOLDER = "evaluation"
CORRELATIONS_FOLDER = "correlations"
MODELS_FOLDER = "models"
# Methods: identifiers of the individual detection methods.
FUZZY = "fuzzy"
SEM_GLOVE, SEM_BERT, SEM_T5 = "semantic_glove", "semantic_bert", "semantic_t5"
NGRAM3, NGRAM4, NGRAM5 = "3gram", "4gram", "5gram"
FASTTEXT = "fasttext"
TFIDF_COSINE = "tfidf_cosine"

# NGRAM4 and NGRAM5 are defined above but are not part of this list.
DETECTION_METHODS = [
    FUZZY,
    NGRAM3,
    SEM_BERT,
    SEM_T5,
    TFIDF_COSINE,
    FASTTEXT,
    SEM_GLOVE,
]

# The suffix to append to each method name to get the prediction probability.
PRED_SUF = "_pred"

# Methods (Class Prediction): each name is the method identifier + PRED_SUF.
FUZZY_PRED = f"{FUZZY}{PRED_SUF}"
SEM_GLOVE_PRED = f"{SEM_GLOVE}{PRED_SUF}"
SEM_BERT_PRED = f"{SEM_BERT}{PRED_SUF}"
SEM_T5_PRED = f"{SEM_T5}{PRED_SUF}"
NGRAM3_PRED = f"{NGRAM3}{PRED_SUF}"
FASTTEXT_PRED = f"{FASTTEXT}{PRED_SUF}"
TFIDF_COSINE_PRED = f"{TFIDF_COSINE}{PRED_SUF}"

# Note: the order here differs from the order of DETECTION_METHODS.
METHOD_PRED_CLASSES = [
    FUZZY_PRED,
    SEM_GLOVE_PRED,
    SEM_BERT_PRED,
    SEM_T5_PRED,
    NGRAM3_PRED,
    FASTTEXT_PRED,
    TFIDF_COSINE_PRED,
]
# Size and amount limits.
FIGURE_SIZE = 2000

# How many pairs to parse per dataset (sampled randomly from bigger datasets).
MAX_DATASET_INPUT = 10000

# How many examples to extract (top sim, low sim & random sim).
EXAMPLE_AMOUNT = 500

# Amount of extracted examples for the human study.
STUDY_EXAMPLE_AMOUNT = 10
# Variable names for the outputs.
TEXT1, TEXT2 = "text_1", "text_2"
DATASET = "dataset"
PAIR_ID = "pair_id"
# Should be either "test", "train", "dev", "val" or None. These are NOT the
# splits from this study, but the ones from the original datasets if provided.
SPLIT = "split_original"
TUPLE_ID = "tuple_id"
ID1, ID2 = "id_1", "id_2"
PARAPHRASE = "is_paraphrase"
COSINE_DISTANCE = "cosine_distance"
ORIGIN = "origin"
SUPPLEMENT_FROM = "supplement_from"
PARAPHRASE_TYPE = "paraphrase_type"

# ETPC-relevant attributes.
# The ETP annotation for the paraphrase type.
TYPE_ID = "type_id"
# The token id scope x,y that marks the part of the sentence which has been
# modified (one scope per text of the pair).
TEXT1_SCOPE, TEXT2_SCOPE = "text1_scope", "text2_scope"
SENSE_PRESERVING = "sense_preserving"

# Max amount of train split pairs used for grid search SVM training.
TRAIN_SPLIT_MAX = 4000
# Variable names for embeddings and their visualization.
TOKENS1, TOKENS2 = "tokens_1", "tokens_2"
TEXT_PREVIEW = "text_preview"
TEXT_PREVIEW1, TEXT_PREVIEW2 = "text_preview_1", "text_preview_2"
EMBEDDINGS = "embeddings"
TEXT_ID = "text_id"
EMBED = "embed"
# Evaluation variable names.
DATASET_COLUMN_NAME = "dataset"
METHOD = "detection_method"
PAIRS = "pairs"
PARA_PAIRS, ORIG_PAIRS = "paraphrases", "originals"

# NOTE(review): the value below is spelled "predicition" (sic). It is kept
# unchanged because it is a runtime string; confirm with all readers/writers
# of this key before correcting the spelling.
GINI_PRED = "gini_predicition"
GINI_PROB = "gini_probability"

# Presumably the labels of the confusion-matrix counts - TODO confirm.
TP, TN, FP, FN = "TP", "TN", "FP", "FN"

# Names of the evaluation scores.
ACCURACY = "accuracy"
PRECISION = "precision"
SPECIFICITY = "specificity"
RECALL = "recall"
F1 = "f1"

CORR_GRAPH_SIZE = 3000
#with open(os.path.join(os.path.join(DATASETS_FOLDER, "ETPC"), "paraphrase_types.xml"), encoding='utf-8', mode = "r") as file:
# tree = ET.parse(file)
# root = tree.getroot()
# for i, elem in enumerate(root):
# PARAPHRASE_TYPES[int(elem[0].text)] = { TYPE_NAME: elem[1].text, TYPE_CATEGORY: elem[2].text }
# PARAPHRASE_TYPES[0] = { TYPE_NAME: "Unknown", TYPE_CATEGORY: "Unknown" } # Add the "Unknown" type (needed for other unclassified datasets)
# Paraphrase Types explained
# (the numbers differ from the ETPC paper, as some numbers seem to have been skipped in the EPT dev code)
'''
Morphology-based changes
1 Inflectional changes + / -
2 Modal verb changes +
3 Derivational changes +
Lexicon-based changes
4 Spelling changes +
5 Same polarity substitution (habitual) +
6 Same polarity substitution (contextual) + / -
7 Same polarity sub. (named entity) + / -
8 Change of format +
Lexico-syntactic based changes
9 Opposite polarity sub. (habitual) + / -
10 Opposite polarity sub. (contextual) + / -
11 Synthetic/analytic substitution +
13 Converse substitution + / -
Syntax-based changes
14 Diathesis alternation + / -
15 Negation switching + / -
16 Ellipsis +
17 Coordination changes +
18 Subordination and nesting changes +
Discourse-based changes
21 Punctuation changes +
22 Direct/indirect style alternations + / -
23 Sentence modality changes +
24 Syntax/discourse structure changes +
Other changes
25 Addition/Deletion + / -
26 Change of order +
28 Semantic (General Inferences) + / -
Extremes
29 Identity +
30 Non-Paraphrase -
31 Entailment
32 Synthetic/analytic substitution (named ent.)
100 Negation (independent of the paraphrase annotation)
'''