-
Notifications
You must be signed in to change notification settings - Fork 12
/
digg_preprocessing.py
105 lines (83 loc) · 3.35 KB
/
digg_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
Data from https://www.isi.edu/~lerman/downloads/digg2009.html
Extract network and diffusion cascades from Digg
"""
import os
import pandas as pd
import networkx as nx
import numpy as np
from urllib.request import urlopen
from zipfile import ZipFile
def extract_network(file):
    """
    Build a directed follow network from the Digg friends CSV.

    Input columns (no header): 0 = mutual flag (1 if the friendship is
    reciprocal), 1 = timestamp, 2 = user a, 3 = user b.
    Mutual friendships are expanded into two directed edges; edges whose
    reverse already appears explicitly keep their own timestamp.
    Writes "../digg_network.txt" as space-separated "a b time" rows.
    """
    friends = pd.read_csv(file, header=None)
    #--------- Remove self friendships
    friends = friends[friends[2] != friends[3]]
    #--------- Repeat the reciprocal edges and append them
    reciprocal = friends[friends[0] == 1]
    # drop(0, 1) used the positional `axis` argument, which was removed in
    # pandas 2.0; drop(columns=...) is the supported spelling
    friends = friends.drop(columns=[0])
    reciprocal = reciprocal.drop(columns=[0])
    #---- Create the reciprocal edge for each pair (swap the endpoints)
    tmp = reciprocal[2].copy()
    reciprocal[2] = reciprocal[3]
    reciprocal[3] = tmp
    #--------- Find the edges that already exist in the dataset as reciprocal, and remove them,
    #--------- to avoid overwriting the correct time of the reciprocal edges that already exist
    to_remove = (reciprocal.reset_index()
                 .merge(friends, left_on=[2, 3], right_on=[2, 3])
                 .set_index('index').index)
    reciprocal = reciprocal.drop(to_remove)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    friends = pd.concat([friends, reciprocal])
    # Sanity: friends.duplicated([2, 3]) should now be empty — every (a, b)
    # pair appears exactly once with a single timestamp.
    #----------- Store the weighted follow network
    friends.columns = ["time", "a", "b"]
    friends = friends[["a", "b", "time"]]
    friends.to_csv("../digg_network.txt", index=False, sep=" ", header=False)
def extract_cascades(file):
    """
    Split the Digg vote log into train and test diffusion cascades.

    Input columns (no header): 0 = timestamp, 1 = voter id, 2 = post id.
    Each cascade (all votes on one post) becomes one line of the form
    "user time;user time;..." ordered by vote time. Cascades whose first
    vote falls in the final 20% of cascade start times are written to
    "test_cascades.txt", the rest to "train_cascades.txt" (both in the
    current working directory).
    """
    votes = pd.read_csv(file, header=None)
    votes.columns = ["time", "user", "post"]
    votes = votes.sort_values(by=["time"])
    #---- Find the threshold after which the cascades are test cascades (final 20% of cascades),
    #---- taking into consideration only the starting time of each cascade
    start_times = votes.groupby("post")["time"].min().sort_values()
    no_test_cascades = round(20 * len(start_times) / 100)
    if no_test_cascades > 0:
        threshold = min(start_times.tail(no_test_cascades))
    else:
        # Too few cascades for a 20% hold-out: everything goes to train.
        # (The original crashed here with min() of an empty sequence.)
        threshold = float("inf")
    # Context managers guarantee both files are closed even on error
    with open("train_cascades.txt", "w") as f_train, \
         open("test_cascades.txt", "w") as f_test:
        #--------- For each cascade
        for post_id in votes["post"].unique():
            sub = votes[votes["post"] == post_id]
            #---- "user time;user time;..." — join avoids quadratic string +=
            line = ";".join(
                str(row.user) + " " + str(row.time)
                for row in sub.sort_values(by=["time"]).itertuples(index=False)
            )
            #---- Check if the cascade started before or after the threshold
            if min(sub["time"]) < threshold:
                f_train.write(line + "\n")
            else:
                f_test.write(line + "\n")
def download():
    """
    Download the Digg votes and friends zip archives and unpack them
    into the current working directory.
    """
    archives = [
        ("http://www.isi.edu/~lerman/downloads/digg_votes.zip", "digg_votes.zip"),
        ("http://www.isi.edu/~lerman/downloads/digg_friends.zip", "digg_friends.zip"),
    ]
    for url, local_name in archives:
        # Fetch the remote archive and save it next to the script
        response = urlopen(url)
        with open(local_name, "wb") as saved:
            saved.write(response.read())
        # Extract every member into the current directory
        with ZipFile(local_name) as zipped:
            zipped.extractall()
def digg_preprocessing(path):
    """
    Run the full Digg pipeline: download the raw archives into `path`,
    then extract the follow network and the train/test cascades.

    Note: changes the process working directory to `path` as a side effect.
    """
    os.chdir(path)
    download()
    file_friends = "digg_friends.csv"
    file_casc = "digg_votes1.csv"
    # Bug fix: the original called the undefined names digg_extract_network /
    # digg_extract_cascades (NameError at runtime); the functions defined in
    # this file are extract_network and extract_cascades.
    extract_network(file_friends)
    extract_cascades(file_casc)