-
Notifications
You must be signed in to change notification settings - Fork 7
/
Merge_BHC_files.py
235 lines (196 loc) · 9.79 KB
/
Merge_BHC_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#!/home/yourname/anaconda3/bin/python
#!/usr/bin/env python3
import glob
import os
# import re
import pandas as pd
from termcolor import colored
pd.set_option('display.max_columns', None)
# import base64
# import warnings
# import sys, string, subprocess
#import argparse
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
# from sys import platform as _platform
def check_file_exists(longfname, rw):
return(os.path.isfile(longfname))
def makelable_dict(df):
lijst_df_rssd = list(set([x for x in list(df) if 'RSSD' in x]))
lijst_df_bhck = list(set([x[-4:] for x in list(df) if not 'year' in x and not 'qid' in x if not 'RSSD' in x]))
dfd = pd.read_csv("MDRM_CSV.csv", encoding="ISO-8859-1", low_memory=False, parse_dates=False, infer_datetime_format=False, skiprows=[0])
dfd = dfd.loc[(dfd['Reporting Form'] == "FR Y-9C") | (dfd['Reporting Form'] == "FR Y-9C"), ['Mnemonic', 'Item Code', 'Item Name']]
dfd = dfd.rename(columns={'Item Code': 'item', 'Item Name': 'label', 'Mnemonic': 'mnem'}).drop_duplicates(subset=['item'])#.set_index('item')
dfd = dfd.assign(long_mnem = dfd.mnem + dfd.item)
df_rssd = pd.DataFrame(index=lijst_df_rssd).join(dfd.set_index('long_mnem')['label']).dropna()
df_FRY9C = pd.DataFrame(index=lijst_df_bhck).join(dfd.set_index('item')['label']).dropna()
df_lables = pd.DataFrame(list(set([x for x in list(df) if not 'year' in x and not 'qid' in x if not 'RSSD' in x])), columns = ['long_key'])
df_lables['item'] = df_lables['long_key'].str[-4:]
df_lables = df_lables.set_index('item').join(df_FRY9C).reset_index().dropna()
df_lables = df_lables[['long_key', 'label']].set_index('long_key')
df_lables.index.names = ['item']
df_lables = pd.concat([df_lables, df_rssd])
df_lables['label'] = df_lables['label'].str[:50]
df_lables = df_lables.to_dict()
return df_lables['label']
# df_lables = makelable_dict(df)
def concat_pieces(pieces, fname, featherfile, statafile, label):
if len(pieces) > 0:
df = pd.concat(pieces, ignore_index=True, sort=False) # changed 20180716 (sort=False)
df = order(df, ['RSSD9001', 'RSSD9999', 'year', 'qid'])
print('Writing to cvs.gz')
df.to_csv(fname + '.gz', index=False, sep="^", compression='gzip')
print('Writing to featherfile')
df.to_feather(featherfile)
if statafile != "0":
print('Writing to Stata')
df.to_stata(statafile, version=119, variable_labels=makelable_dict(df))
print('\n%s has a count of %s.' % (label, df["qid"].count()))
else:
print(label + "leeg")
return df
def order(frame, var):
varlist = [w for w in frame.columns if w not in var]
frame = frame[var + varlist]
return frame
def delfile(file_to_delete):
if file_to_delete != "0":
try:
os.remove(file_to_delete)
except:
pass
def writefilesorted(list_to_write, pad, file_to_write):
with open(pad + file_to_write, mode='wt') as myfile:
myfile.write('\n'.join(list_to_write))
def qtr(x):
return {'0331': 1, '0630': 2, '0930': 3, '1231': 4}[x]
def read_vars(pad, fname, up):
columns = pd.read_csv(pad + fname, header=None).apply(lambda x: x.str.strip())
columns = [x for x in columns[0].to_list()]
if up == "U":
columns = [x.upper() for x in columns]
return columns
def skip_bad_file(fname):
with open(fname) as f:
lines = list(f)
g = open(fname, "w")
for line in lines:
data = line.split('^')
if data[0] != "1115406":
g.write(line)
else:
print(colored("\nSkipped the bad line in 2003 Q 1.\n", 'red'))
g.close()
# main(pad, path, bank_out_file, var_out_file, panelfile, vars_in_file, filesfile, statafile, add2db, user, password, host):
def main(pad, path, bank_out_file, var_out_file, panelfile, featherfile, vars_in_file, filesfile, statafile): # , add2db, user, password, host):
banks = []
variables = []
all_pieces = []
filecount = 0
tot_filecount = 0
tgt_list = read_vars(pad, vars_in_file, "U")
if check_file_exists(pad + filesfile, "r"):
lyst = read_vars(pad, filesfile, "L")
add_pad = 1
else:
lyst = sorted(glob.glob(path))
add_pad = 0
tot_filecount = len(lyst)
print(('\nTotal number of files submitted: %s.\n') % tot_filecount)
for naam in lyst:
if add_pad == 1:
fname = pad + naam
else:
fname = naam
print("File: " + fname, end='')
if check_file_exists(fname, "r"):
if "bhcf0303.txt" in fname:
print(colored("-", 'red'))
skip_bad_file(fname)
else:
print("+")
f = open(fname, 'r')
filecount += 1
with f:
word = f.readline().upper().split('^')
if word[0] == "RSSD9001":
idx = 0
if word[1] == "RSSD9001":
idx = 1
#some files have dashes on line 2, but not all files
rowcount = 0
for line in f:
data = line.split('^')
if data[0] != "--------":
if data[idx] not in banks:
banks.append(data[idx])
else:
rowcount += 1
f.close()
# Read date in frame, use the idx to make sure the right rows are imported
# For presice I set parse_dates=False, infer_datetime_format=False
if rowcount != 0:
dfr = pd.read_csv(fname, sep='^', encoding="ISO-8859-1", low_memory=False, parse_dates=False, infer_datetime_format=False, skiprows=[rowcount], engine="c")
else:
dfr = pd.read_csv(fname, sep='^', encoding="ISO-8859-1", low_memory=False, parse_dates=False, infer_datetime_format=False, engine="c")
#Set vars to upper case
for item in list(dfr):
if not item == item.upper():
dfr = dfr.rename(columns={item: item.upper()})
if item not in variables:
variables.append(item)
#All variables of the frame are here:
var_list = list(dfr)
#Assing the main id variables
dfr['year'] = int(str(dfr["RSSD9999"].loc[0])[0:4])
dfr['qid'] = qtr(str(dfr["RSSD9999"].loc[0])[4:9])
print('Rowcount: %2s, Cell 1: %10s. Cell 2: %10s. Cell 3: %10s.' % (rowcount, var_list[0], var_list[1], var_list[2]), end='')
print(' RSSD9999: ', end='')
print(colored(dfr["RSSD9999"].loc[0], 'red'), end='')
print(' Year: %5s. Qtr: %2s. Count %4s. Vars: %5s' % (dfr["year"].loc[0], dfr["qid"].loc[0], dfr["RSSD9001"].count(), len(var_list)), end='')
print(' Banks: %5s.' % len(banks), end='')
print(' Variables: %5s.' % len(variables), end='')
print(' Filecount: %4s of %4s.\n' % (filecount, tot_filecount), end='')
tmp_list = ['RSSD9001', 'RSSD9999', 'year', 'qid']
for item in tgt_list:
if item in var_list:
if item not in tmp_list:
tmp_list.append(item)
print(colored(item + " ", 'green'), end='')
else:
print(colored(item + " ", 'red'), end='')
# print("\nNote: please update mdrm_scraper.\n")
dfr = dfr[tmp_list]
all_pieces.append(dfr)
else:
print("\nThis BHC file is missing:"),
print(colored(fname, 'red'), end='')
print("I will skip.\n")
filecount += 1
df = concat_pieces(all_pieces, panelfile, featherfile, statafile, 'Panel')
print("Totals")
print('Banks: %s.' % len(banks))
print('Variables: %s.' % len(variables))
#print("\n")
#Now write to files
writefilesorted(banks, pad, bank_out_file)
writefilesorted(variables, pad, var_out_file)
print("Done!!!!\n")
return df
if __name__ == '__main__':
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument("-pad", "--pad", default=os.getcwd(), help="Set path of your files")
parser.add_argument("-bank_out_file", "--bank_out_file", default='banks.csv', help='Set file name for banks output file. Enter 0 if not required. Default is "banks.csv".')
parser.add_argument("-var_out_file", "--var_out_file", default='variables.csv', help='Set file name for variables output file. Enter 0 if not required. Default is "variables.csv".')
parser.add_argument("-panelfile", "--panelfile", default='panelfile.csv', help='Set file name for panel file. "panelfile.csv".')
parser.add_argument("-featherfile", "--featherfile", default='panelfile.feather', help='Set file name for feather file. "panelfile.feather".')
parser.add_argument("-vars_in_file", "--vars_in_file", default='bhc_vars.csv', help='Set file name for variables input file. Default is "bhc_vars.csv".')
parser.add_argument("-filesfile", "--filesfile", default='lyst3.csv', help='Set file name for files input file. Default is "lyst3.csv".')
parser.add_argument("-statafile", "--statafile", default='panelfile.dta', help='Set file name for stata output. Enter 0 if not required. Default is "panelfile.dta".')
args = vars(parser.parse_args())
pad = args["pad"] + "/"
path = pad + "bhcf*.txt"
print("\nInputs:")
for key, value in args.items():
print(key, value)
print("\n")
df = main(pad, path, args["bank_out_file"], args["var_out_file"], args["panelfile"], args["featherfile"], args["vars_in_file"], args["filesfile"], args["statafile"]) # , args["add2db"], args["user"], args["password"], args["host"])