forked from nicoloval/paint-black
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGini_entropy_builder.py
executable file
·275 lines (213 loc) · 11.8 KB
/
Gini_entropy_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python3
import networkx as nx
import blocksci
import zarr
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict
import sys, os, os.path, socket
import logging
import matplotlib.pyplot as plt
import matplotlib.dates
from util import SYMBOLS, DIR_PARSED, SimpleChrono
import random
import math
import json
import graph_tool.all as gt
import operator
import concurrent.futures
from scipy.stats import entropy
def parse_command_line():
    """Parse command-line options for the Gini/entropy builder.

    Returns (options, args) where `options` carries the resolved currency
    symbol, the derived data-folder paths, and the aggregation frequency
    converted to a number of days.
    """
    import optparse

    opt_parser = optparse.OptionParser()
    opt_parser.add_option("--curr", action='store', dest="currency", type='str',
                          default=None, help="name of the currency")
    opt_parser.add_option("--heur", action='store', dest="heuristic", type='str',
                          default=None, help="heuristics to apply")
    opt_parser.add_option("--start", action="store", dest="start_date",
                          default=None, help="starting date for network creation in YYYY-MM-DD format")
    opt_parser.add_option("--end", action="store", dest="end_date",
                          default=None, help="ending date for network creation in YYYY-MM-DD format")
    opt_parser.add_option("--freq", action="store", dest="frequency",
                          default="day", help="time aggregation of networks - choose between day, week, 2weeks, 4weeks")

    options, args = opt_parser.parse_args()

    # Resolve the ticker to the canonical symbol used in folder names.
    options.currency = SYMBOLS[options.currency]

    # Folder paths must be built while options.frequency is still the
    # textual label ("day", "week", ...); it is converted to days below.
    options.cluster_folder = f"{DIR_PARSED}/{options.currency}/heur_{options.heuristic}"
    options.blocks_folder = f"{DIR_PARSED}/{options.currency}/heur_all_data"
    options.networks_folder = f"{DIR_PARSED}/{options.currency}/heur_{options.heuristic}_networks_{options.frequency}"

    days_per_label = {"day": 1, "week": 7, "2weeks": 14, "4weeks": 28}
    options.frequency = days_per_label[options.frequency]

    return options, args
def daterange(date1, date2, by=1):
    """Return every `by`-th date from `date1` through `date2`, inclusive."""
    total_days = int((date2 - date1).days) + 1
    return [date1 + timedelta(offset) for offset in range(0, total_days, by)]
def calculate_and_return_metrics(data, label):
    """Compute inequality statistics over the values of a metric dict.

    Parameters
    ----------
    data : dict
        Mapping of entity (community id / vertex) to a numeric value.
    label : str
        Human-readable metric name (kept for optional debug printing).

    Returns
    -------
    tuple
        (gini_coefficient, shannon_entropy, percentile_dict) where
        percentile_dict maps each of 0/25/50/75/100 to its value.
    """
    values_arr = np.array(list(data.values()), dtype=float)

    # Gini is computed on the raw (unnormalized) values.
    gini_coefficient = gini(values_arr)

    # Take percentiles from the raw values BEFORE normalizing: the original
    # code normalized in place first and then had to rebuild the array from
    # the dict a second time.
    percentiles = [0, 25, 50, 75, 100]
    percentile_dict = dict(zip(percentiles, np.percentile(values_arr, percentiles)))

    # Shannon entropy expects a probability distribution, so scale the
    # values to unit sum (scipy.stats.entropy would renormalize anyway).
    values_arr /= values_arr.sum()
    e = entropy(values_arr)

    return gini_coefficient, e, percentile_dict
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # Mean-difference formulation, bottom equation at:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    sorted_vals = np.sort(array)          # coefficient requires ascending order
    n = sorted_vals.shape[0]
    ranks = np.arange(1, n + 1)           # 1-based rank of each element
    weighted = np.sum((2 * ranks - n - 1) * sorted_vals)
    return weighted / (n * np.sum(sorted_vals))
def community_inequality_analysis(date):
    """Load the community graph for `date` and compute inequality metrics.

    Reads the graphml snapshot for the given date and aggregates the
    per-vertex properties (dark_assets, dark_ratio, current_assets) at two
    granularities: per community (the "block" vertex property) and per
    individual vertex.

    Returns
    -------
    dict
        {"block": {...}, "vertex": {...}} where each metric name maps to a
        (gini, entropy, percentile_dict) tuple. Returns an empty dict when
        the snapshot is missing or unreadable so that callers iterating
        `.items()` degrade gracefully — the original code returned a tuple
        of floats / None here, which crashed process_timeunit.

    Relies on module-level globals: options, chrono, tqdm_bar.
    """
    switcherback = {1:"day", 7:"week", 14:"2weeks", 28:"4weeks"}

    logging.info(f'Analyzing Inequality for communities for the week of:{date} has started')
    start_time = datetime.now()
    chrono.add_tic("net")

    networks_path = f"/srv/abacus-1/bitcoin_darknet/grayscale_op_ali/heur_{options.heuristic}_data_v3/heur_{options.heuristic}_networks_full_shifted_community/{switcherback[options.frequency]}"
    unit_graph_file = f"{networks_path}/{date.strftime('%Y-%m-%d')}.graphml.bz2"

    if not os.path.exists(unit_graph_file):
        logging.info(f'community building the date:{date} is unsuccesful since original network does not exist')
        return {}

    try:
        g = gt.load_graph(unit_graph_file)
    except OSError:
        logging.info(f'community building of the date:{date} is unsuccesful because of OSError')
        return {}

    # Accumulators keyed by community id (the "block" vertex property).
    block_dark_assets = {}
    block_darkness_ratios = {}
    block_current_assets = {}
    # Accumulators keyed by individual vertex. Each vertex is visited once,
    # so these lists end up singletons; kept as lists for symmetry with the
    # community-level aggregation.
    vertex_dark_assets = {}
    vertex_darkness_ratios = {}
    vertex_current_assets = {}

    for v in g.vertices():
        block = g.vp.block[v]
        dark_ratio = g.vp.dark_ratio[v]
        dark_assets = g.vp.dark_assets[v]
        current_assets = g.vp.current_assets[v]

        block_dark_assets.setdefault(block, []).append(dark_assets)
        block_darkness_ratios.setdefault(block, []).append(dark_ratio)
        block_current_assets.setdefault(block, []).append(current_assets)

        vertex_dark_assets.setdefault(v, []).append(dark_assets)
        vertex_darkness_ratios.setdefault(v, []).append(dark_ratio)
        vertex_current_assets.setdefault(v, []).append(current_assets)

    # Community level: total dark assets, mean darkness ratio, total assets.
    block_total_dark_assets = {b: np.sum(a) for b, a in block_dark_assets.items()}
    block_avg_darkness_ratios = {b: np.mean(r) for b, r in block_darkness_ratios.items()}
    block_total_current_assets = {b: np.sum(a) for b, a in block_current_assets.items()}
    # Vertex level: same aggregations over the per-vertex lists.
    vertex_total_dark_assets = {v: np.sum(a) for v, a in vertex_dark_assets.items()}
    vertex_avg_darkness_ratios = {v: np.mean(r) for v, r in vertex_darkness_ratios.items()}
    vertex_total_current_assets = {v: np.sum(a) for v, a in vertex_current_assets.items()}

    # Gini / entropy / percentiles at the community level...
    block_dark_assets_metrics = calculate_and_return_metrics(block_total_dark_assets, "Block Level Total Dark Assets")
    block_darkness_ratios_metrics = calculate_and_return_metrics(block_avg_darkness_ratios, "Block Level Average Darkness Ratios")
    block_current_assets_metrics = calculate_and_return_metrics(block_total_current_assets, "Block Level Total Current Assets")
    # ...and at the vertex level.
    vertex_dark_assets_metrics = calculate_and_return_metrics(vertex_total_dark_assets, "Vertex Level Dark Assets")
    vertex_darkness_ratios_metrics = calculate_and_return_metrics(vertex_avg_darkness_ratios, "Vertex Level Darkness Ratios")
    vertex_current_assets_metrics = calculate_and_return_metrics(vertex_total_current_assets, "Vertex Level Current Assets")

    logging.info(f'Building for the date:{date} has finished with t={datetime.now() - start_time} finished:')
    logging.info(f" Original graph: {g}")
    tqdm_bar.set_description(f"{switcherback[options.frequency]} of '{date.strftime('%Y-%m-%d')} took {chrono.elapsed('net')} sec", refresh=True)

    return {
        "block": {
            "dark_assets": block_dark_assets_metrics,
            "darkness_ratios": block_darkness_ratios_metrics,
            "current_assets": block_current_assets_metrics
        },
        "vertex": {
            "dark_assets": vertex_dark_assets_metrics,
            "darkness_ratios": vertex_darkness_ratios_metrics,
            "current_assets": vertex_current_assets_metrics
        }
    }
def process_timeunit(timeunit):
    """Run the inequality analysis for one time unit and persist results.

    Computes community/vertex inequality metrics for `timeunit`, folds them
    into the module-level `data_dicts` (keys shaped
    "<level>_<metric>_<stat>"), then rewrites every accumulated JSON series
    so a crash loses at most the current unit.

    Returns `timeunit` so the executor callback can track completion.

    NOTE(review): mutates the shared globals x_values/data_dicts from
    ThreadPoolExecutor worker threads without a lock — relies on the GIL;
    confirm this if workers ever move to separate processes.
    """
    metrics_dict = community_inequality_analysis(timeunit)

    # Guard: when the snapshot is missing/unreadable the analysis does not
    # produce a metrics mapping (the original code crashed on `.items()`
    # because the failure paths returned a tuple of floats or None).
    if not isinstance(metrics_dict, dict) or not metrics_dict:
        return timeunit

    date = timeunit.strftime('%Y-%m-%d')
    x_values.append(date)
    x_values.sort()

    for level, level_data in metrics_dict.items():  # 'block' or 'vertex'
        for metric, metric_data in level_data.items():  # 'dark_assets' / 'darkness_ratios' / 'current_assets'
            # Renamed from `entropy` to avoid shadowing scipy.stats.entropy.
            gini_coefficient, metric_entropy, percentile_dict = metric_data

            # `level` already is the desired prefix; the original's
            # `level if level == 'block' else 'vertex'` was a no-op.
            key_prefix = level

            data_dicts.setdefault(f"{key_prefix}_{metric}_gini", {})[date] = gini_coefficient
            data_dicts.setdefault(f"{key_prefix}_{metric}_entropy", {})[date] = metric_entropy

            for percentile, value in percentile_dict.items():
                key = f"{key_prefix}_{metric}_{percentile}th_percentile"
                data_dicts.setdefault(key, {})[date] = value

    # Persist every accumulated series, sorted chronologically by date key.
    for key, data_dict in data_dicts.items():
        sorted_data = dict(sorted(data_dict.items(), key=operator.itemgetter(0)))
        file_path = os.path.join(f'jsonResults_v3/h{options.heuristic}/community_inequality', f'{key}_2009-01-03_{end_date}.json')
        with open(file_path, 'w') as f:
            f.write(json.dumps(sorted_data))

    return timeunit
if __name__ == "__main__":
    options, args = parse_command_line()
    switcherback = {1:"day", 7:"week", 14:"2weeks", 28:"4weeks"}

    logging.basicConfig(level=logging.DEBUG, filename=f"logfiles/daily_weekly_final_heur_{options.heuristic}_v3/community_inequality_logfile", filemode="a+", format="%(asctime)-15s %(levelname)-8s %(message)s")

    chrono = SimpleChrono()
    chrono.print(message="init")
    chrono.add_tic('proc')

    start_date = datetime.strptime(options.start_date, "%Y-%m-%d").date()
    end_date = datetime.strptime(options.end_date, "%Y-%m-%d").date()
    print(f'start_date is set as: {start_date}')
    print(f'end_date is set as: {end_date}')

    datelist = daterange(start_date, end_date, by=options.frequency)
    tqdm_bar = tqdm(datelist, desc="processed files")

    # Shared accumulators mutated by process_timeunit from worker threads.
    x_values = []
    data_dicts = {}

    with tqdm(total=len(datelist)) as progress:
        with concurrent.futures.ThreadPoolExecutor() as executor:

            def make_progress_callback(unit):
                # Bind the time unit per future. The original callback read
                # the loop variable `timeunit` at call time (late-binding
                # closure), so it usually reported the last submitted date
                # instead of the one that actually completed.
                def update_progress(future):
                    progress.update()
                    progress.set_description(f"week of '{unit.strftime('%Y-%m-%d')} took {chrono.elapsed('proc')} sec", refresh=True)
                return update_progress

            # Fan the time units out to the thread pool.
            futures = []
            for timeunit in datelist:
                future = executor.submit(process_timeunit, timeunit)
                future.add_done_callback(make_progress_callback(timeunit))
                futures.append(future)

            # Block until every unit has been processed.
            concurrent.futures.wait(futures)

    print('Process terminated, graphs and attributes created.')
    print(f"Graphs created in {chrono.elapsed('proc', format='%H:%M:%S')}")