# SpecSecElab.py — forked from toolleeo/special-section-processing
# (GitHub page chrome and scraped line-number gutter removed; code begins below.)
import sys
import glob, os
import csv
import itertools
import argparse
import re
# Lettura del file csv contenente gli articoli postprocessati
# Reading the csv file containing the postprocessed articles
def read_csv(file):
    """Load a tab-separated CSV file and return its rows as a list of dicts."""
    with open(file, 'r', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file, delimiter='\t'))
# Creazione dei file csv di output
# Creation of output csv files
def create_csv(file_name, th, *args):
    """Create an output TSV file and write its header row.

    Columns: id, papers, one coherence column per threshold 1..th, plus an
    optional extra column named by args[0] (e.g. 'sparsity').
    """
    keys = ['id', 'papers'] + [f'coherence_th{i}' for i in range(1, th + 1)]
    if args:
        keys.append(args[0])
    with open(file_name, 'w', newline='', encoding='utf-8') as output_file:
        csv.DictWriter(output_file, fieldnames=keys, delimiter='\t').writeheader()
# Scrittura dei file csv di output
# Writing output csv files
def write_csv(file_name, id, coherence, n_papers, *args):
    """Append one result row to a TSV file.

    Row layout: id, paper count, each coherence value rounded to 3 decimals,
    and an optional trailing value (args[0], also rounded to 3 decimals).
    """
    row = [id, n_papers] + [round(value, 3) for value in coherence]
    if args:
        row.append(round(args[0], 3))
    with open(file_name, 'a', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file, delimiter='\t').writerow(row)
# Calcolo del numero di lati del grafo una Special Section o Special Section fake in base alla soglia
# Calculation of the number of edges of the graph a Special Section or Special Section fake based on the threshold
# Calculation of the number of edges of the graph of a Special Section
# (real or fake) based on the threshold
def edges_calc(doc, th):
    """Count pairs of papers whose filtered abstracts share at least `th` words.

    doc: list of dicts, each with an 'abstract_filtered' text field.
    th: minimum number of common (case-insensitive) words for an edge.
    Returns the number of edges in the resulting similarity graph.
    """
    # Build each paper's word set once, instead of re-splitting and re-building
    # both sets inside the O(n^2) pair loop as the original did.
    word_sets = [set(paper['abstract_filtered'].lower().split()) for paper in doc]
    n_edges = 0
    for words1, words2 in itertools.combinations(word_sets, 2):
        if len(words1 & words2) >= th:
            n_edges += 1
    return n_edges
# Calcolo della coerenza tra gli articoli di una stessa Special Section o Special Section fake
# Calculation of coherence between articles of the same Special Section or Special Section fake
# Calculation of coherence between articles of the same Special Section
# (real or fake)
def metric_calc(path, th):
    """Compute coherence values for the Special Section stored under `path`.

    path: directory containing <dirname>_postproc.csv (tab-separated).
    th: number of thresholds; one coherence value is computed per threshold 1..th.
    Returns (coherence_list, number_of_papers).
    Raises IndexError if no matching postprocessed file exists.
    """
    # The original used str.format(*glob.glob(...)), an obscure way of taking
    # the first glob match that breaks on paths containing '{' or '}'.
    csv_path = glob.glob(os.path.join(path, path.split('/')[-1] + '_postproc.csv'))[0]
    with open(csv_path, 'r', encoding='utf-8') as file:
        doc = list(csv.DictReader(file, delimiter='\t'))
    # Maximum possible edges in a complete graph over the papers.
    max_edges = len(doc) * (len(doc) - 1) / 2
    coherence = []
    for i in range(1, th + 1):
        try:
            coherence.append(edges_calc(doc, i) / max_edges)
        except ZeroDivisionError:
            # Fewer than 2 papers -> no possible edges; coherence defined as 0.
            # (Narrowed from a bare `except:` that also hid unrelated errors.)
            coherence.append(0)
    return coherence, len(doc)
# Calcolo del valore sparsity per gli articoli di una Special Section fake
# Calculation of the sparsity value for the articles of a fake Special Section
# Calculation of the sparsity value for the articles of a fake Special Section
def sparsity_calc(papers_real, fake_spec_sec):
    """Compute the sparsity of a fake Special Section.

    Sparsity = number of distinct real Special Sections the fake section's
    papers were drawn from, divided by the number of fake papers.

    papers_real: list of per-section normalized-title lists (see extract_titles).
    fake_spec_sec: directory containing <dirname>_postproc.csv.
    Returns a float, or the string 'null' when the fake section has no papers.
    """
    num_spec_sec = []
    papers_fake = read_csv(os.path.join(fake_spec_sec, fake_spec_sec.split('/')[-1] + '_postproc.csv'))
    for item in papers_fake:
        # Normalize once per fake paper (the original recomputed this for
        # every real section in the inner loop).
        normalized = re.sub('[^a-zA-Z]', '', item['title'].strip().lower())
        for i, titles in enumerate(papers_real):
            if normalized in titles:
                num_spec_sec.append(i)
                break
    try:
        sparsity = len(set(num_spec_sec)) / len(papers_fake)
    except ZeroDivisionError:
        # No fake papers at all -> sparsity undefined.
        # (Narrowed from a bare `except:` that also hid unrelated errors.)
        sparsity = 'null'
    return sparsity
# Estrazione del titolo degli articoli delle Special Section
# Extraction of the title of the articles in the Special Sections
def extract_titles(spec_sec):
    """Return, for each Special Section, the list of its paper titles
    normalized to lowercase with all non-letter characters removed."""
    return [
        [re.sub('[^a-zA-Z]', '', paper['title'].strip().lower()) for paper in section]
        for section in spec_sec
    ]
def main():
    """Compute coherence metrics for real Special Sections and coherence +
    sparsity metrics for fake Special Sections, writing two TSV reports."""
    parser = argparse.ArgumentParser()
    # default=10 replaces the original manual `if args.th is None` check;
    # default=[] fixes a TypeError crash when a directory-list option was
    # omitted entirely (nargs='*' otherwise yields None, which is not iterable).
    parser.add_argument('-th', type=int, default=10, help='threshold (soglia: numero intero)')
    parser.add_argument('--spec_sec', nargs='*', default=[], help='elenco di directory delle Special Section')
    parser.add_argument('--spec_sec_fake', nargs='*', default=[], help='elenco di directory delle Special Section Fake')
    args = parser.parse_args()
    # Metrics for the real Special Sections.
    create_csv('Spec_Sec_metrics.csv', args.th)
    real_spec_sec = []
    for directory in args.spec_sec:  # renamed from `dir`, which shadowed the builtin
        real_spec_sec.append(read_csv(os.path.join(directory, directory.split('/')[-1] + '_postproc.csv')))
        coherence, n_papers = metric_calc(directory, args.th)
        write_csv('Spec_Sec_metrics.csv', directory.split('/')[-1], coherence, n_papers)
    real_papers_titles = extract_titles(real_spec_sec)
    # Metrics (coherence + sparsity) for the fake Special Sections.
    create_csv('Spec_Sec_fake_metrics.csv', args.th, 'sparsity')
    for directory in args.spec_sec_fake:
        coherence, n_papers = metric_calc(directory, args.th)
        consistency = sparsity_calc(real_papers_titles, directory)
        write_csv('Spec_Sec_fake_metrics.csv', directory.split('/')[-1], coherence, n_papers, consistency)
if __name__ == '__main__':
    main()