-
Notifications
You must be signed in to change notification settings - Fork 2
/
read_erisk_data.py
64 lines (55 loc) · 2.64 KB
/
read_erisk_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import xml.etree.ElementTree as ET
import glob, os
def read_subject_writings(subject_file):
    """Parse one subject XML file into a list of per-writing records.

    The root element is searched for a direct ``ID`` child (the subject
    identifier) and for ``WRITING`` elements anywhere below it. Each
    ``WRITING`` may hold ``TITLE``, ``TEXT`` and ``DATE`` children.

    Args:
        subject_file: Path of the XML file to read.

    Returns:
        A list with one dict per ``WRITING`` element. Every dict has a
        ``'subject'`` key (the whitespace-stripped ``ID`` text) and, when the
        matching child element exists, ``'title'``, ``'text'`` and ``'date'``
        keys holding the raw element text (``None`` for an empty element).
        If the subject ID cannot be extracted, a diagnostic is printed and
        an empty list is returned, so the file is skipped instead of
        failing on an unbound ``subject``.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed
            XML.
    """
    # Read everything first so the file handle is released before parsing.
    with open(subject_file) as sf:
        contents = sf.read()
    root = ET.fromstring(contents)

    id_element = root.find('ID')
    if id_element is None or id_element.text is None:
        # Without an ID the writings cannot be attributed to anyone.
        print('Cannot extract ID', contents[:500], '\n-------\n')
        return []
    subject = id_element.text.strip()

    # XML tag -> key used in the output record.
    fields = (('TITLE', 'title'), ('TEXT', 'text'), ('DATE', 'date'))

    writings = []
    for writing in root.iter('WRITING'):
        record = {'subject': subject}
        for tag, key in fields:
            # If a tag is repeated inside one WRITING, the last one wins.
            for child in writing.findall(tag):
                record[key] = child.text
        writings.append(record)
    return writings
def read_texts_2020(datadir_root_T1_2020,
                    datadirs_T1_2020,
                    labels_files_T1_2020,
                    test_suffix='0000',
                    chunked_subsets=None):
    """Load subject writings and their labels into a single DataFrame.

    Every file in each listed data directory is parsed with
    ``read_subject_writings``; the label files are read as header-less,
    whitespace-delimited two-column tables (``subject``, ``label``) and
    left-joined onto the writings by subject.

    Args:
        datadir_root_T1_2020: Mapping from subset name to the root directory
            of that subset.
        datadirs_T1_2020: Mapping from subset name to an iterable of
            directory names (relative to the subset root) holding subject
            files.
        labels_files_T1_2020: Mapping from subset name to an iterable of
            label file names (relative to the subset root).
        test_suffix: String appended to every subject ID of the ``'test'``
            subset, in both writings and labels.
        chunked_subsets: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A DataFrame with one row per distinct writing: the fields produced
        by ``read_subject_writings`` plus a ``'subset'`` column and a
        ``'label'`` column (missing where no label matches the subject).
    """
    writings_parts = []
    labels_parts = []

    # Only the test subset is read at present; the earlier loop header was
    # ``for subset in ('train', 'test'):``.
    for subset in ('test',):
        subset_root = datadir_root_T1_2020[subset]

        subset_writings = []
        for subp in datadirs_T1_2020[subset]:
            subdir = os.path.join(subset_root, subp)
            for subject_file in os.listdir(subdir):
                subset_writings.extend(
                    read_subject_writings(os.path.join(subdir, subject_file)))

        writings_df_part = pd.DataFrame(subset_writings)
        # add a suffix for users in the test -- the numbers are duplicated
        # with the ones in train
        if subset == 'test':
            writings_df_part['subject'] = writings_df_part['subject'].apply(
                lambda s: s + test_suffix)
            print(subset, writings_df_part.subject)
        writings_df_part['subset'] = subset
        writings_parts.append(writings_df_part)

        for label_file in labels_files_T1_2020[subset]:
            # Read subject IDs as strings: purely numeric IDs would otherwise
            # be parsed as integers and break the suffix concatenation below.
            labels = pd.read_csv(os.path.join(subset_root, label_file),
                                 delimiter=r'\s+',
                                 names=['subject', 'label'],
                                 dtype={'subject': str})
            # same suffix as for the writings, so the join keys still match
            if subset == 'test':
                labels['subject'] = labels['subject'].apply(
                    lambda s: s + test_suffix)
            labels_parts.append(labels)

    # Concatenate once instead of growing a frame inside the loop; this also
    # avoids concatenating onto an empty, column-less DataFrame.
    writings_df = pd.concat(writings_parts)
    if labels_parts:
        labels_df = pd.concat(labels_parts)
    else:
        # No label files configured: every writing ends up with a missing label.
        labels_df = pd.DataFrame(columns=['subject', 'label'])

    labels_df = labels_df.drop_duplicates().set_index('subject')
    writings_df = writings_df.drop_duplicates()
    return writings_df.join(labels_df, on='subject')