-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_graph.py
85 lines (72 loc) · 4.02 KB
/
create_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
""" This file is for reading in the graph and creating additional attributes.
"""
import snap
import os
import re
from dateutil import parser
# Header-line patterns. They are applied with re.match, so each one is
# anchored at the start of the line being tested.

# "From" (colon optional), one whitespace character, then text containing
# exactly one '@' with a '.' somewhere after it, up to the newline.
FROM_REGEX = re.compile(r'From:?\s[^@]+@[^@]+\.[^@]+\n')
# Same shape as FROM_REGEX, for the "To" header. Because only one '@' is
# allowed, lines listing several recipients do not match.
TO_REGEX = re.compile(r'To:?\s[^@]+@[^@]+\.[^@]+\n')
# Date header of the form "Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)".
DATE_REGEX = re.compile(r'Date: \w\w\w, [0-9]+ \w\w\w [0-9]{4} \d\d:\d\d:\d\d [-+]?\d\d\d\d \([A-Z]{3}\)')
# strptime-style description of the same date layout.
# NOTE(review): not referenced by the code visible in this file, which parses
# dates with dateutil instead -- confirm before relying on it.
DATE_FORMAT = '%a, %e %b %Y %H:%M:%S %z (%Z)'
# Root of the maildir tree that load_data() walks.
rootdir = './enron_mail_20110402/maildir'
def _extract_address(header_text, prefix):
    """Return the bare e-mail address from a matched From/To header line.

    header_text is the text matched by FROM_REGEX or TO_REGEX; prefix is the
    header label to cut off ('From: ' or 'To: ').
    """
    value = header_text.strip().split(prefix)[-1]
    # Other text may precede the address; keep only the last token.
    address = value.split()[-1]
    # str.replace works on both Python 2 and 3, unlike the Python-2-only
    # str.translate(None, '<>') form, which raises TypeError on Python 3.
    return address.replace('<', '').replace('>', '')


def _register_address(address, email_to_nid, emails_out):
    """Give an unseen address the next node id and record it in emails_out."""
    if address not in email_to_nid:
        # Ids are handed out densely from 0, so the next id is the map size.
        email_to_nid[address] = len(email_to_nid)
        emails_out.write('%d,%s\n' % (email_to_nid[address], address))


def _scan_email(email_file, email_to_nid, emails_out):
    """Find the first From address, To address and Date in one message.

    Addresses are registered as soon as they are seen, so node ids follow
    discovery order even for messages that end up without an edge.

    Returns a (from_email, to_email, date_obj) tuple; any element may be None
    when the corresponding header line was not found.
    """
    from_email = None
    to_email = None
    date_obj = None
    for line in email_file:
        from_match = FROM_REGEX.match(line)
        if from_match and not from_email:
            from_email = _extract_address(from_match.group(0), 'From: ')
            _register_address(from_email, email_to_nid, emails_out)

        to_match = TO_REGEX.match(line)
        if to_match and not to_email:
            to_email = _extract_address(to_match.group(0), 'To: ')
            _register_address(to_email, email_to_nid, emails_out)

        date_match = DATE_REGEX.match(line)
        if date_match and not date_obj:
            date_text = date_match.group(0).split('Date:')[-1].lstrip()
            date_obj = parser.parse(date_text)

        # Stop once all three are known. The test must use date_obj (the
        # parsed date), not the per-line regex match, otherwise it never
        # fires and the whole message is read.
        if from_email and to_email and date_obj:
            break
    return from_email, to_email, date_obj


def load_data():
    """Walk the maildir tree and write the node and edge lists.

    For every directory found while walking ``rootdir`` that has a ``sent``
    sub-directory, each file in that sub-directory is scanned for its first
    From address, To address and Date header.

    Output files (created in the current working directory):
      * email.txt -- one ``id,address`` line per distinct address, in the
        order the addresses are first seen.
      * edges.txt -- one ``date,to_id,from_id,email_path`` line per message
        for which all three fields were found; the date is formatted as
        ``%Y-%m-%d %H:%M:%S``.
    """
    email_to_nid = {}
    # Context managers close both outputs even if a message fails to parse.
    with open('email.txt', 'w') as f_emails, open('edges.txt', 'w') as f_edges:
        for subdir, dirs, _files in os.walk(rootdir):
            for dir_name in dirs:
                user_dir = os.path.join(subdir, dir_name + '/sent')
                if not os.path.isdir(user_dir):
                    continue
                for entry in os.listdir(user_dir):
                    dir_entry_path = os.path.join(user_dir, entry)
                    if not os.path.isfile(dir_entry_path):
                        continue
                    with open(dir_entry_path, 'r') as email_file:
                        from_email, to_email, date_obj = _scan_email(
                            email_file, email_to_nid, f_emails)
                    if from_email and to_email and date_obj:
                        to_id = email_to_nid[to_email]
                        from_id = email_to_nid[from_email]
                        date_string = date_obj.strftime('%Y-%m-%d %H:%M:%S')
                        ############################################
                        # Format: date, to_id, from_id, email_path #
                        ############################################
                        f_edges.write('%s,%d,%d,%s\n' % (
                            date_string, to_id, from_id, dir_entry_path))
# Script entry point: only build the output files when run directly.
if __name__ == "__main__":
    load_data()