-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrubber.py
549 lines (428 loc) · 20.5 KB
/
scrubber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
# scrubber.py
# http://sites.google.com/site/pocketsense/
# fix known issues w/ OFX downloads
# rlc: 2010
# 05-Aug-2010*rlc
# - Added _scrubTime() function to fix NULL time stamps so that transactions record on the correct date
# regardless of time zone.
# 28-Jan-2011*rlc
# - Added _scrubDTSTART() to fix missing <DTEND> fields when a <DTSTART> exists.
# - Recoded scrub routines to use regex substitutions
# 28-Aug-2012*rlc
# - Added quietScrub option to sites.dat (suppresses scrub messages)
# 02-Feb-2013*rlc
# - Bug fix in _scrubDTSTART
# - Added scrub routine to verify Investment buy and sell transactions
# 17-Feb-2013*rlc
# - Added scrub routine for CORRECTACTION and CORRECTFITID tags (not supported by Money)
# 06-Jul-2013*rlc
# - Bug fix in _scrubDTSTART()
# 20-Feb-2014*rlc
# - Bug fix in _scrubINVsign() for SELL transactions
#14-Aug-2016*rlc
# - Update to handle new Discover Bank FITID format.
#11-Mar-2017*rlc
# - Added REINVEST transactions to _scrubINVsign()
# - Added _scrubRemoveZeroTrans(). Removes $0.00 transactions when enabled in sites.dat
#08-Apr-2017*cgn / rlc
# - Revert _scrubINVsign() to previous version
# - Add _scrubREINVESTsign() to handle REINVEST transactions separately
# Note the differnt field order vs what's used for BUY/SELL transactions in _scrubINVsign()
#27-Aug-2017*rlc
# - Bug patch to fix timedelta call
#24-Oct-2017*ad / rlc
# - Update to address recent change by Discover Bank re transaction ids
# - Insert check# field for discover bank transactions such that Money recognizes it
#01-Jan-2018*rlc
# - Revert Discover Bank fitid substitution to the same as it was before the 24-Oct update
#14-Apr-2018*rlc
# - Replace ampsersand "&" symbol w/ & code when not part of valid escape code. see _scrubGeneral()
#27-Jul-2018*dbc
# - Add TRowePrice scrub function to fix paid-out dividends/cap gains that are marked as reinvested
import os, sys, re
import site_cfg
from datetime import datetime, timedelta
from control2 import *
from rlib1 import *
userdat = site_cfg.site_cfg()
stat = False #global used between re lambda subs to track status
def scrubPrint(line):
if not userdat.quietScrub:
print " +" + line
def scrub(filename, site):
#filename = string
#site = DICT structure containing full site info from sites.dat
siteURL = FieldVal(site, 'url').upper()
dtHrs = FieldVal(site, 'timeOffset')
accType = FieldVal(site, 'CAPS')[1]
f = open(filename,'r')
ofx = f.read() #as-found ofx message
#print ofx
#NOTE: Discover Card and Bank use the same server @ discovercard.com
if 'DISCOVERCARD' in siteURL: ofx= _scrubDiscover(ofx, accType)
if 'TROWEPRICE.COM' in siteURL: ofx = _scrubTRowePrice(ofx)
ofx= _scrubTime(ofx) #fix 000000 and NULL datetime stamps
if dtHrs <> 0: ofx = _scrubShiftTime(ofx, dtHrs) #note: always call *after* _scrubTime()
ofx= _scrubDTSTART(ofx) #fix missing <DTEND> fields
#fix malformed investment buy/sell/reinvest signs (neg vs pos), if they exist
if "<INVSTMTTRNRS>" in ofx.upper():
ofx= _scrubINVsign(ofx)
ofx= _scrubREINVESTsign(ofx)
#remove $0.00 transactions
if userdat.skipZeroTransactions: ofx = _scrubRemoveZeroTrans(ofx)
#perform general ofx cleanup
ofx = _scrubGeneral(ofx)
#close the input file
f.close()
#write the new version to the same file
f = open(filename, 'w')
f.write(ofx)
f.close
#-----------------------------------------------------------------------------
# OFX.DISCOVERCARD.COM
# 1. Discover OFX files will contain transaction identifiers w/ the following format:
# FITIDYYYYMMDDamt#####, where
# FITID = string literal
# YYYY = year (numeric)
# MM = month (numeric)
# DD = day (numeric)
# amt = dollar amount of the transaction, including a hypen for negative entries (e.g., -24.95)
# ##### = 5 char value
# 2. The 5-digit serial number can change each time you connect to the server,
# meaning that the same transaction can download with different FITID numbers.
# That's not good, since Money requires a unique FITID value for each valid transaction.
# Varying serial numbers result in duplicate transactions!
# 3. We'll replace the 5-digit serial number with one of our own.
# The default will be 0 for every transaction,
# and we'll increment by one for each subsequent transaction that that matches
# a previous transaction in the file.
# 8/14/2016: Discover BANK now uses an FITID format of SDF######, where ###### is unique for the day.
# The length seems to vary, but the largest observed is 6 digits
# Unfortunately, the digits can be assigned to multiple transactions on the same day, so it isn't
# guaranteed to be unique.
# Modified routine to uniquely handle BASTMT vs CCSTMT statements.
# NOTE: There was brief period in late 2017 where Discover Bank changed their fitid format, but soon
# reverted to the same as described above.
_sD_knownvals = [] #global to keep track of Discover FITID values between regex.sub() calls
def _scrubDiscover(ofx, accType):
if accType=='CCSTMT':
scrubPrint("Scrubber: Processing Discover Card statement.")
else:
scrubPrint("Scrubber: Processing Discover Bank statement.")
ofx_final = '' #new ofx message
_sD_knownvals = [] #reset our global set of known vals (just in case)
# dev: insert a line break after each transaction for readability.
# also helps block multi-transaction matching in below regexes via ^\s option
p = re.compile(r'(<STMTTRN>)',re.IGNORECASE)
ofx = p.sub(r'\n<STMTTRN>', ofx)
#regex p captures everything from <FITID> up to the next <tag>, but excludes the next "<".
#p produces 2 results: r.group(1) = <FITID> field, r.group(2)=value
#the ^<\s prevents matching on the next < or newline
p = re.compile(r'(<FITID>)([^<\s]+)',re.IGNORECASE)
#call substitution (inline lamda, takes regex result = r as tuple)
ofx_final = p.sub(lambda r: _scrubDiscover_r1(r, accType), ofx)
if accType=='BASTMT':
#regex p captures everything from <TRNTYPE>DEBIT up to the next "<" aftert the <NAME>Check tag and field.
# Discover Bank codes checks as
# <STMTTRN><TRNTYPE>DEBIT<...><NAME>Check ###########</STMTTRN>
# p produces 4 results:
# r.group(1) = <TRNTYPE>DEBIT,
# r.group(2) = stuff up to next "<NAME>Check "
# r.group(3) = "<NAME>Check ", including the trailing spaces (at least 1)
# r.group(4) is the check number (1 or more digits)
# Rearranged, the result should produce a entry that will import the check number in Money
# <STMTTRN><TRNTYPE>CHECK<...><CHECKNUM>############<NAME>Check</STMTTRN>
ofx = ofx_final
p = re.compile(r'(<TRNTYPE>DEBIT)([^\s]+)(<NAME>Check[ ]+)([0-9]+)',re.IGNORECASE)
ofx_final = p.sub(lambda r: _scrubDiscover_r2(r, accType), ofx)
return ofx_final
def _scrubDiscover_r1(r, accType):
#regex subsitution function: change fitid value
global _sD_knownvals
fieldtag = r.group(1)
fitid = r.group(2).strip(' ')
fitid_b = fitid #base fitid before annotating
#strip the serial value for credit card transactions
if accType=='CCSTMT':
bx = len(fitid) - 5
fitid_b = fitid[:bx]
#find a unique serial#, from 0 to 9999
seq = 0 #default
while seq < 9999:
fitid = fitid_b + str(seq)
exists = (fitid in _sD_knownvals)
if exists: #already used it... try another
seq=seq+1
else:
break #unique value... write it out
_sD_knownvals.append(fitid) #remember the assigned value between calls
return fieldtag + fitid #return the new string for regex.sub()
def _scrubDiscover_r2(r, accType):
#regex subsitution function: insert checknum field for BANK statements
trntype = r.group(1)
rest = r.group(2)
name = r.group(3).strip(' ')
checknum = r.group(4)
return '<TRNTYPE>CHECK' + rest + '<CHECKNUM>' + checknum + name
#--------------------------------
def _scrubTime(ofx):
#Replace NULL time stamps with noontime (12:00)
#regex p captures everything from <DT*> up to the next <tag>, but excludes the next "<".
#p produces 2 results: group(1) = <DT*> field, group(2)=dateval
p = re.compile(r'(<DT.+?>)([^<\s]+)',re.IGNORECASE)
#call date correct function (inline lamda, takes regex result = r tuple)
global stat
stat = False
ofx_final = p.sub(lambda r: _scrubTime_r1(r), ofx)
if stat: scrubPrint("Scrubber: Null time values updated.")
return ofx_final
def _scrubTime_r1(r):
# Replace zero and NULL time fields with a "NOON" timestamp (120000)
# Force "date" to be the same as the date listed, regardless of time zone by setting time to NOON.
# Applies when no time is given, and when time == MIDNIGHT (000000)
global stat
fieldtag = r.group(1)
DT = r.group(2).strip(' ') #date+time
# Full date/time format example: 20100730000000.000[-4:EDT]
if DT[8:] == '' or DT[8:14] == '000000':
#null time given. Adjust to 120000 value (noon).
DT = DT[:8] + '120000'
stat = True
return fieldtag + DT
#--------------------------------
def _scrubDTSTART(ofx):
# <DTSTART> field for an account statement must have a matching <DTEND> field
# If DTEND is missing, insert <DTEND>="now"
# The assumption is made that only one statement exists in the OFX file (no multi-statement files!)
ofx_final = ofx
now = datetime.now()
nowstr = now.strftime("%Y%m%d%H%M00")
if ofx.find('<DTSTART>') >= 0 and ofx.find('<DTEND>') < 0:
#we have a dtstart, but no dtend... fix it.
scrubPrint("Scrubber: Fixing missing <DTEND> field")
#regex p captures everything from <DTSTART> up to the next <tag> or white space into group(1)
p = re.compile(r'(<DTSTART>[^<\s]+)',re.IGNORECASE)
if Debug: print "DTSTART: findall()=", p.findall(ofx_final)
#replace group1 with (group1 + <DTEND> + datetime)
ofx_final = p.sub(r'\1<DTEND>'+nowstr, ofx_final)
return ofx_final
def _scrubShiftTime(ofx, h):
#Shift DTASOF time values by (float) h hours
#Added: 15-Feb-2011, rlc
#regex p captures everything from <DTASOF> up to the next <tag> or white-space.
#p produces 2 results: group(1) = <DTASOF> field, group(2)=dateval
p = re.compile(r'(<DTASOF>)([^<\s]+)',re.IGNORECASE | re.DOTALL)
#call date correct function (inline lamda, takes regex result = r tuple)
if p.search(ofx):
scrubPrint("Scrubber: Shifting DTASOF time values " + str(h) + " hours.")
ofx_final = p.sub(lambda r: _scrubShiftTime_r1(r,h), ofx)
return ofx_final
def _scrubShiftTime_r1(r,h):
#Shift time value by (float) h hours for regex search result r.
#Added: 15-Feb-2011, rlc
fieldtag = r.group(1) #date field tag (e.g., <DTASOF>)
DT = r.group(2).strip(' ') #date+time
if Debug: print "fieldtag=", fieldtag, "| DT=" + DT
# Full date/time format example: 20100730120000.000[-4:EDT]
#separate into date/time + timezone
tz = ""
if '[' in DT:
p = DT.index('[')
tz = DT[p:]
DT = DT[:p]
#strip the decimal fraction, if we have it
if '.' in DT:
d = DT.index('.')
DT = DT[:d]
if Debug: scrubPrint("New DT=" + DT + "| tz=" + tz)
#shift the time
tval = datetime.strptime(DT,"%Y%m%d%H%M%S") #convert str to datetime
deltaT = timedelta(hours=h)
tval += deltaT #add hours
DT = tval.strftime("%Y%m%d%H%M%S") + tz #convert new datetime to str
return fieldtag + DT
def _scrubINVsign(ofx):
#Fix malformed parameters in Investment buy/sell sections, if they exist
#Issue first noticed with Fidelity netbenefits 401k accounts: rlc*2013
#BUY transactions:
# UNITS must be positive
# TOTAL must be negative
#SELL transactions:
# UNITS must be negative
# TOTAL must be positive
global stat
stat = False
p = re.compile(r'(<INVBUY>|<INVSELL>)(.+?<UNITS>)(.+?)(<.+?<TOTAL>)([^<\r\n]+)', re.IGNORECASE)
ofx_final=p.sub(lambda r: _scrubINVsign_r1(r), ofx)
if stat:
scrubPrint("Scrubber: Invalid investment sign (pos/neg) found. Corrected.")
return ofx_final
def _scrubINVsign_r1(r):
global stat
type=""
if "INVBUY" in r.group(1): type = "INVBUY"
if "INVSELL" in r.group(1): type = "INVSELL"
qty = r.group(3)
total=r.group(5)
qty_v=float2(qty)
total_v=float2(total)
if (type=="INVBUY" and qty_v<0) or (type=="INVSELL" and qty_v>0):
stat=True
qty=str(-1*qty_v)
if (type=="INVBUY" and total_v>0) or (type=="INVSELL" and total_v<0):
stat=True
total=str(-1*total_v)
return r.group(1) + r.group(2) + qty + r.group(4) + total
def _scrubREINVESTsign(ofx):
#Fix malformed parameters in REINVEST transactions, if they exist
#Issue first noticed with Fidelity netbenefits 401k accounts: cgn*2016
#REINVEST transactions:
# UNITS must be positive
# TOTAL must be negative
global stat
stat=False
p = re.compile(r'(<REINVEST>)(.+?<TOTAL>)(.+?)(<.+?<UNITS>)([^<\r\n]+)', re.IGNORECASE)
ofx_final=p.sub(lambda r: _scrubREINVESTsign_r1(r), ofx)
if stat:
scrubPrint(" +Scrubber: Invalid reinvestment sign (pos/neg) found. Corrected.")
return ofx_final
def _scrubREINVESTsign_r1(r):
global stat
qty = r.group(5)
total=r.group(3)
qty_v=float2(qty)
total_v=float2(total)
if (qty_v<0):
stat=True
qty=str(-1*qty_v)
if (total_v>0):
stat=True
total=str(-1*total_v)
return r.group(1) + r.group(2) + total + r.group(4) + qty
def _scrubGeneral(ofx):
# General scrub routine for general updates
#1. Remove tag/value pairs that Money doesn't support
#define unsupported tags that we've had trouble with
uTags = []
uTags.append('CORRECTACTION')
uTags.append('CORRECTFITID')
for tag in uTags:
p = re.compile(r'<'+tag+'>[^<]+',re.IGNORECASE)
if p.search(ofx):
ofx = p.sub('',ofx)
scrubPrint("Scrubber: <"+tag+"> tags removed. Not supported by Money.")
#2. Replace ampersands '&' that aren't part of a valid escape code (i.e., is NOT like &,  etc)
# literally: replace '&' chars with '&' when the next chars are not
# a '#' or valid alphanumerics followed by a ;
p = re.compile(r'&(?!#?\w+;)')
if p.search(ofx):
scrubPrint("Scrubber: Replace invalid '&' chars with '&'")
ofx = p.sub('&',ofx)
return ofx
def _scrubRemoveZeroTrans(ofx):
#Remove transactions with a $0.00 value
#regex p captures transaction records
#p produces 3 results: group(1) = trans header, group(2)=Amount, group(3)=trans suffix
global stat
stat=False
p = re.compile(r'(<STMTTRN>.*?<TRNAMT>)(.+?)(<.*?</STMTTRN>)',
flags=re.DOTALL | re.IGNORECASE)
ofx = p.sub(lambda r: _scrubRemoveZeroTrans_r1(r), ofx)
if stat: scrubPrint('Zero amount ($0.00) transactions removed.')
return ofx
def _scrubRemoveZeroTrans_r1(r):
# return null transaction when amount=0
global stat
amount = float2(r.group(2))
if amount==0: stat=True
return None if amount==0 else r.group(1)+r.group(2)+r.group(3)
#-----------------------------------------------------------------------------
# OFX fiorg T. Rowe Price
# 1. T. Rowe Price OFX reports paid-out (non-reinvested) dividends and capital gains incorrectly
# in such a way that MS Money ignores the transaction.
# These transactions can be identified as reinvestments having 0.0 shares.
# The transaction will be bounded by <REINVEST> and </REINVEST> and contain <UNITS>0.0.
# Furthermore, the payment is incorrectly shown as a negative value, the memo field is misleading, and
# a required field (<SUBACCTFUND>) is missing.
#
# For dividends (<INCOMETYPE>DIV) make these changes to the transaction:
# Change <REINVEST> to <INCOME>
# Change <MEMO>DIVIDEND (REINVEST) to <MEMO>DIVIDEND PAID
# Change <TOTAL>-#.## to <TOTAL>#.##
# Following <SUBACCTSEC>CASH add <SUBACCTFUND>CASH
# Delete <UNITS>0.0
# Delete <UNITPRICE>#.##
# Change </REINVEST> to </INCOME>
# For short term capital gains (<INCOMETYPE>CGSHORT) make these changes to the transaction:
# Change <REINVEST> to <INCOME>
# Change <MEMO>SHORT TERM CAP GAIN REIN to <MEMO>SHORT TERM CAP GAIN PAID
# Change <TOTAL>-#.## to <TOTAL>#.##
# Following <SUBACCTSEC>CASH add <SUBACCTFUND>CASH
# Delete <UNITS>0.0
# Delete <UNITPRICE>#.##
# Change </REINVEST> to </INCOME>
# For long term capital gains (<INCOMETYPE>CGLONG) make these changes to the transaction:
# Change <REINVEST> to <INCOME>
# Change <MEMO>LONG TERM CAPITAL GAI to <MEMO>LONG TERM CAPITAL GAIN PAID
# Change <TOTAL>-#.## to <TOTAL>#.##
# Following <SUBACCTSEC>CASH add <SUBACCTFUND>CASH
# Delete <UNITS>0.0
# Delete <UNITPRICE>#.##
# Change </REINVEST> to </INCOME>
def _scrubTRowePrice(ofx):
global stat
ofx_final = '' #new ofx message
stat = False
if Debug: print('Function _scrubTRowePrice(OFX) called')
#Process all <REINVEST>...</REINVEST> transactions
#Use non-greedy quantifier .+? to avoid matching across transactions.
p = re.compile(r'<REINVEST>.+?</REINVEST>',re.IGNORECASE)
#re.sub() command operates on every non-overlapping occurence of pattern when passed a function for replacement
ofx_final = p.sub(lambda r: _scrubTRowePrice_r1(r), ofx)
if stat: scrubPrint("Scrubber: T Rowe Price dividends/capital gains paid out.")
return ofx_final
def _scrubTRowePrice_r1(r):
#regex subsitution function: if <UNITS>0.0 then convert transaction from <REINVEST> to <INCOME>
global stat
#Copy the reinvested transaction for manipulation
ReinvTrans = r.group(0)
#Create variable for the paid out transaction
PaidTrans = ''
# If units are 0.0 then scrub the ofx transaction
if '<UNITS>0.0<' in ReinvTrans.upper():
#Flag that at least one transaction is scrubbed
stat = True
#Use regex to parse the REINVEST transaction with following format
#<REINVEST>...<MEMO>erroneous memo</INVTRAN>...<INCOMETYPE>DIV or CGSHORT or CGLONG<TOTAL>-#.##<SUBACCTSEC>CASH<UNITS>0.0<UNITPRICE>33.33</REINVEST>
#into these 10 groups:
# m.group(1) = <REINVEST>
# m.group(2) = ...<MEMO>
# m.group(3) = erroneous memo
# m.group(4) = </INVTRAN>...<INCOMETYPE>
# m.group(5) = type of income (eg DIV, CGSHORT, CGLONG)
# m.group(6) = <TOTAL>-#.##
# m.group(7) = <SUBACCTSEC>CASH
# m.group(8) = <UNITS>#.###
# m.group(9) = <UNITPRICE>#.##
# m.group(10) = </REINVEST>
p = re.compile(r'(<REINVEST>)(<.+?<MEMO>)(.+?[^<]*)(</INVTRAN>.+?<INCOMETYPE>)(.+?[^<]*)(<TOTAL>.+?[^<]*)(<SUBACCTSEC>.+?[^<]*)(<UNITS>.+?[^<]*)(<UNITPRICE>.+?[^<]*)(</REINVEST>)',re.IGNORECASE)
m = p.match(ReinvTrans)
gr01 = '<INCOME>' #Change from <REINVEST>
gr02 = m.group(2)
gr04 = m.group(4)
gr05 = m.group(5)
if gr05 == 'DIV' : gr03 = 'DIVIDEND PAID'
elif gr05 == 'CGSHORT' : gr03 = 'SHORT TERM CAP GAIN PAID'
elif gr05 == 'CGLONG' : gr03 = 'LONG TERM CAPITAL GAIN PAID'
else : gr03 = m.group(3) #Leave as reported
gr06 = m.group(6).replace('-','')
gr07 = m.group(7) + '<SUBACCTFUND>CASH'
#No need to capture m.group(8) since it is deleted
#No need to capture m.group(9) since it is deleted
gr10 = '</INCOME>'
PaidTrans = gr01+gr02+gr03+gr04+gr05+gr06+gr07+gr10
if Debug: print('Reinv Trans: '+ReinvTrans)
if Debug: print('Paid Trans: '+PaidTrans)
return PaidTrans #return the new string for regex.sub()
# end t.rowe.price div reinvest scrubber
#-----------------------------------------------------------------------------