-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathrpms_psychs_partition.py
executable file
·100 lines (74 loc) · 2.56 KB
/
rpms_psychs_partition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
import pandas as pd
import sys
from datetime import date, datetime
from os.path import abspath, dirname
from os import getcwd, chdir
from glob import glob
sys.path.append(dirname(abspath(__file__)))
from idvalidator import validate
# Print usage and stop when the incoming directory argument is missing
# or when help is explicitly requested.
if len(sys.argv) < 2 or sys.argv[1] in ('-h', '--help'):
    print(f'''Usage:
./{__file__} /path/to/RPMS_incoming/
./{__file__} /path/to/RPMS_incoming/ 31.12.2022.csv
Splits PSYCHS follow up forms into CHRs and HCs''')
    # sys.exit() rather than the site-provided exit(): the latter is an
    # interactive-shell helper and is not defined under `python -S`.
    sys.exit(0)
# Optional second argument selects the dated file suffix
# (e.g. 31.12.2022.csv). Without it, fall back to today's date in the
# same DD.MM.YYYY.csv form. An explicit length check replaces the former
# bare `except:`, which would also have swallowed unrelated errors.
if len(sys.argv) > 2:
    suffix = sys.argv[2]
else:
    suffix = date.today().strftime('%d.%m.%Y.csv')
# Remember the starting directory (restored at the end of the script),
# then work from inside the incoming directory given on the command line.
dir_bak = getcwd()
chdir(sys.argv[1])

# Both PSYCHS follow up forms for the chosen suffix are required: taking
# element [0] of an empty glob result raises IndexError, so a single
# missing form stops the run before anything is written.
psychs_patterns = [
    f'PrescientStudy_Prescient_psychs_p1p8_fu_{suffix}',
    f'PrescientStudy_Prescient_psychs_p9ac32_fu_{suffix}',
]
try:
    files = [glob(pattern)[0] for pattern in psychs_patterns]
except IndexError:
    print('No PSYCHS follow up forms could be found')
    # sys.exit() instead of the interactive-only exit() helper; the exit
    # status stays 0 as before.
    sys.exit()
# Split every PSYCHS follow up form into a CHR file (written back over
# the input file) and an HC file (same name with `_fu_` -> `_fu_hc_`,
# columns renamed chrpsychs -> hcpsychs), using `chrcrit_part` from the
# inclusion/exclusion criteria review form: 1 means CHR, 2 means HC.

# The inclusion/exclusion form does not depend on which PSYCHS file is
# being processed, so locate and load it once instead of once per file.
incl_matches = glob('PrescientStudy_Prescient_inclusionexclusion_criteria_review_*.csv')
if not incl_matches:
    # Previously an unhandled IndexError; still a non-zero exit.
    sys.exit('No inclusion/exclusion criteria review form could be found')
dfincl = pd.read_csv(incl_matches[0])

for file in files:
    print(file)

    # Keep every value as the literal text found in the CSV so the
    # rewritten files are not altered by type inference.
    dfpsychs = pd.read_csv(file, dtype=str, keep_default_na=False)
    dfpsychs.set_index('subjectkey', inplace=True)

    # Collect matching rows and concatenate once per file; calling
    # pd.concat per subject is quadratic and concatenates onto an empty
    # frame, which recent pandas versions warn about.
    chr_rows = []
    hc_rows = []

    for _, row in dfincl.iterrows():
        s = row['subjectkey']
        if not validate(s):
            continue

        try:
            # The extra [ ] around the key makes the result a DataFrame
            # even when only a single row matches.
            subject_row = dfpsychs.loc[[s]]
        except KeyError:
            # Subject has no entry in this PSYCHS form.
            continue

        try:
            chr_hc = int(row['chrcrit_part'])
        except ValueError:
            print('chrcrit_part is empty for', s)
            continue

        if chr_hc == 1:
            # CHR
            chr_rows.append(subject_row)
        elif chr_hc == 2:
            # HC
            hc_rows.append(subject_row)
        else:
            # irrelevant
            print('CHR/HC status could not be determined for', s)

    print('')

    # An empty slice of the input keeps the header and the subjectkey
    # index, so a file with only the header row is still written when no
    # subject falls into a group.
    empty = dfpsychs.iloc[:0]
    dfchr = pd.concat(chr_rows) if chr_rows else empty
    dfhc = pd.concat(hc_rows) if hc_rows else empty
    dfhc = dfhc.rename(columns=lambda c: c.replace('chrpsychs', 'hcpsychs'))

    # CHR rows overwrite the original form; HC rows go to a sibling file.
    dfchr.reset_index().to_csv(file, index=False)
    dfhc.reset_index().to_csv(file.replace('_fu_', '_fu_hc_'), index=False)

# Return to the directory the script was started from.
chdir(dir_bak)