generated from ucsd-psych201a/replication_template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
anonymize.py
71 lines (60 loc) · 1.9 KB
/
anonymize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
Script for anonymizing data from Qualtrics
"""
import os
import csv
import pandas as pd
from argparse import ArgumentParser
# Header names of the columns that are removed from every data file.
# Matching is done on the exact header text, so spelling and case matter.
cols_to_drop = (
    # recipient name and e-mail fields
    ["RecipientLastName", "RecipientFirstName", "RecipientEmail"]
    + ["ExternalDataReference"]
    # network address and geolocation fields
    + ["IPAddress", "LocationLatitude", "LocationLongitude"]
    # presumably the Prolific participant id -- confirm against the survey setup
    + ["PROLIFIC_PID"]
)
def anonymize_data_file(file_path):
    """
    Write a copy of a CSV file with the identifying columns removed.

    The first row of the file is treated as the header. Every column whose
    header appears in the module-level ``cols_to_drop`` list is removed from
    the header and from all following rows. The result is written next to the
    input with ``-anon`` inserted before the extension (``x.csv`` becomes
    ``x-anon.csv``); the input file itself is left untouched.

    Implemented with the standard-library ``csv`` module rather than pandas.
    Fields are parsed and re-written by ``csv`` so that values containing
    commas, quotes or line breaks stay in their own column.

    An empty input file produces an empty output file.
    """
    # Rewrite only the final extension. A plain str.replace would also touch
    # ".csv" inside directory names, and would return the input path unchanged
    # (overwriting the raw file) when the name has no ".csv" in it.
    stem, extension = os.path.splitext(file_path)
    anon_file_path = f"{stem}-anon{extension}"

    # newline="" hands line-ending handling to the csv module, which is what
    # keeps quoted fields with embedded line breaks intact.
    with open(file_path, "r", newline="") as f_in, open(
        anon_file_path, "w", newline=""
    ) as f_out:
        reader = csv.reader(f_in)
        # "\n" keeps the same row terminator the output has always used.
        writer = csv.writer(f_out, lineterminator="\n")

        header = next(reader, None)
        if header is None:
            # No rows at all: nothing to anonymize, leave the output empty.
            return

        indices_to_drop = {
            i for i, colname in enumerate(header) if colname in cols_to_drop
        }

        def keep_columns(row):
            # Filter by position; rows shorter than the header are fine.
            return [
                value for i, value in enumerate(row) if i not in indices_to_drop
            ]

        writer.writerow(keep_columns(header))
        writer.writerows(keep_columns(row) for row in reader)
def get_raw_data_files(root_dir):
    """
    Collect the paths of all raw (not yet anonymized) CSV files in a directory.

    The directory tree below ``root_dir`` is walked recursively. A file is
    included when its name ends in ``.csv`` and it is not an anonymized
    output, i.e. its name is not ``anon.csv`` and does not end in
    ``-anon.csv``.

    Returns a list of paths, each formed by joining the containing directory
    (as reported by ``os.walk``) with the file name. A ``root_dir`` that does
    not exist yields an empty list.
    """
    all_data_files = []
    for dir_path, _subdirs, filenames in os.walk(root_dir):
        for filename in filenames:
            # Require a real ".csv" suffix. Comparing the text after the last
            # "." would also accept a file named just "csv".
            if not filename.endswith(".csv"):
                continue
            # Skip files that are already anonymized outputs.
            if filename.split("-")[-1] == "anon.csv":
                continue
            all_data_files.append(os.path.join(dir_path, filename))
    return all_data_files
# Command-line interface: a single optional flag naming the directory to scan.
parser = ArgumentParser()
parser.add_argument("--root_dir", type=str, default="data")


if __name__ == "__main__":
    # Anonymize every raw CSV found under the requested directory,
    # announcing each file before it is processed.
    cli_args = parser.parse_args()
    for data_file in get_raw_data_files(cli_args.root_dir):
        print(f"anonymizing {data_file}...")
        anonymize_data_file(data_file)