From 89edf3a5db1afb8066d41b3cfbe55881a371e082 Mon Sep 17 00:00:00 2001 From: Emmanuel Leroy Date: Tue, 7 Jun 2022 17:42:59 -0700 Subject: [PATCH 1/5] WIP parallelize random lookups --- datagen_customer.py | 6 ++-- datagen_transaction.py | 10 +++--- profile_weights.py | 75 +++++++++++++++++++++++++++++++++++------- 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/datagen_customer.py b/datagen_customer.py index c2385f0..0590158 100644 --- a/datagen_customer.py +++ b/datagen_customer.py @@ -1,7 +1,7 @@ from faker import Faker import sys from datetime import timedelta, date -from random import random +import random from main_config import MainConfig import argparse import pathlib @@ -83,7 +83,7 @@ def get_first_name(self): return self.fake.first_name_female() def generate_age_gender(self): - n = random() + n = random.random() g_a = age_gender[min([a for a in age_gender if a > n])] while True: @@ -105,7 +105,7 @@ def get_random_location(self): """ Assumes lst is sorted. Returns closest value to num. 
""" - num = random() + num = random.random() lst = list(cities.keys()) pos = bisect_left(lst, num) if pos == 0: diff --git a/datagen_transaction.py b/datagen_transaction.py index b23d2e5..6a73ce9 100644 --- a/datagen_transaction.py +++ b/datagen_transaction.py @@ -35,13 +35,14 @@ merchants[row[0]] = [] merchants[row[0]].append(row[1]) +fake = Faker() + class Customer: def __init__(self, raw): self.raw = raw.strip().split('|') self.attrs = self.parse_customer(raw) self.fraud_dates = [] - self.fake = Faker() def print_trans(self, trans, is_fraud, fraud_dates): is_traveling = trans[1] @@ -63,8 +64,8 @@ def print_trans(self, trans, is_fraud, fraud_dates): rad = (float(travel_max) / 100) * 1.43 # geo_coordinate() uses uniform distribution with lower = (center-rad), upper = (center+rad) - merch_lat = self.fake.coordinate(center=float(cust_lat),radius=rad) - merch_long = self.fake.coordinate(center=float(cust_long),radius=rad) + merch_lat = fake.coordinate(center=float(cust_lat),radius=rad) + merch_long = fake.coordinate(center=float(cust_long),radius=rad) if (is_fraud == 0 and t[1] not in fraud_dates) or is_fraud == 1: features = self.raw + t + [chosen_merchant, str(merch_lat), str(merch_long)] @@ -104,6 +105,8 @@ def main(customer_file, profile_file, start_date, end_date, out_path=None): profile = Profile({**profile_obj}) profile.set_date_range(start_date, end_date) fraud_profile = Profile({**profile_fraud_obj}) + + inter_val = (end_date - start_date).days - 7 # for each customer, if the customer fits this profile # generate appropriate number of transactions with open(customer_file, 'r') as f: @@ -120,7 +123,6 @@ def main(customer_file, profile_file, start_date, end_date, out_path=None): # decide if we generate fraud or not if fraud_flag < 99: #11->25 fraud_interval = random.randint(1,1) #7->1 - inter_val = (end_date - start_date).days - 7 # rand_interval is the random no of days to be added to start date rand_interval = random.randint(1, inter_val) #random start date 
is selected diff --git a/profile_weights.py b/profile_weights.py index e591527..e9b4178 100644 --- a/profile_weights.py +++ b/profile_weights.py @@ -18,7 +18,9 @@ def __init__(self, profile): years_wt, leap_wt = self.prep_holidays() self.proportions['date_wt']['time_of_year'] = years_wt self.proportions['date_wt']['time_of_year_leap'] = leap_wt + self.amt_specs = self.pre_compute_amt_specs() self.fake = Faker() + # Faker.seed(0) def set_date_range(self, start, end): self.start = start @@ -115,8 +117,8 @@ def prep_years(self): years_wt = sorted(self.profile['date_wt']['year'].keys()) # sync weights to extracted years for i, y in enumerate(years): - if years_wt[i] in self.profile['date_wt']['year']: - final_year[y] = self.profile['date_wt']['year'][years_wt[i]] + if i < len(years_wt): + final_year[y] = self.profile['date_wt']['year'].get(years_wt[i], 100) # if not enough years provided, make it 100 else: final_year[y] = 100 @@ -132,9 +134,9 @@ def combine_date_params(self, weights): else: time_name = 'time_of_year' - date_wt = weights['year'][curr.year]*\ - weights[time_name][(curr.month,curr.day)]*\ - weights['day_of_week'][curr.weekday()] + date_wt = (weights['year'][curr.year] + * weights[time_name][(curr.month,curr.day)] + * weights['day_of_week'][curr.weekday()]) new_date_weights[curr] = date_wt curr += timedelta(days=1) @@ -164,6 +166,15 @@ def closest_rand(self, pro, num): return pro[lst[-1]] return pro[lst[pos]] + def pre_compute_amt_specs(self): + amt_specs = {} + for category in self.profile['categories_amt'].keys(): + amt_specs[category] = { + 'shape': self.profile['categories_amt'][category]['mean']**2 / self.profile['categories_amt'][category]['stdev']**2, + 'scale': self.profile['categories_amt'][category]['stdev']**2 / self.profile['categories_amt'][category]['mean'] + } + return amt_specs + def sample_amt(self, category): shape = self.profile['categories_amt'][category]['mean']**2 / self.profile['categories_amt'][category]['stdev']**2 scale = 
self.profile['categories_amt'][category]['stdev']**2 / self.profile['categories_amt'][category]['mean'] @@ -196,6 +207,23 @@ def sample_time(self, am_or_pm, is_fraud): secs = random.randrange(60) return [hour, mins, secs] + def get_rand_2d(self, n, m, o): + x = [np.arange(n)] + for i in range(m): + x.append(np.random.random(n)) + for j in range(o): + x.append(np.zeros(n)) + return np.array(x).T + + def closest_rand_parallel(self, r, i, j, obj): + # get the closest number in obj keys from the number in col i, return in col j + lst = np.array(list(obj.keys())[::-1]) + # sort by the ith colum + r2 = r[r[:,i].argsort()] + for x in lst: + r2[:,j] = np.where(r2[:,1] <= x, x, r2[:,j]) + return r2 + def sample_from(self, is_fraud): # randomly sample number of transactions @@ -215,21 +243,44 @@ def sample_from(self, is_fraud): output = [] rand_date = np.random.random(num_trans) rand_cat = np.random.random(num_trans) - epoch_start = datetime(1970,1,1,0,0,0) + + # get an 2d array of random numbers + empty columns for mapping + rnds = self.get_rand_2d(num_trans, 2, 4) + + rnds = self.closest_rand_parallel(rnds, 1, 3, self.proportions['date_prop']) + rnds = self.closest_rand_parallel(rnds, 2, 5, self.proportions['shopping_time']) + rnds = self.closest_rand_parallel(rnds, 2, 4, self.proportions['categories_wt']) + + # get counts for each category + unique, counts = np.unique(rnds[:,4], return_counts=True) + # sort by category + rnds = rnds[rnds[:,5].argsort()] + offset = 0 + # for each category get the number of sample amounts + for i, cat_prop in enumerate(unique): + cat_specs = self.amt_specs[self.proportions['categories_wt'][cat_prop]] + shape = cat_specs['shape'] + scale = cat_specs['scale'] + rnd_amts = np.random.gamma(shape, scale, counts[i]) + rnds[offset: offset + counts[i], 6] = rnd_amts + offset += counts[i] + + # re-sort by index (see if needed) + # rnds = rnds[rnds[:,0].argsort()] fraud_dates = [] - for i, num in enumerate(rand_date): + for i in range(num_trans): # , 
num in enumerate(rand_date): trans_num = self.fake.md5(raw_output=False) - chosen_date = self.closest_rand(self.proportions['date_prop'], num) + chosen_date = self.proportions['date_prop'][rnds[i, 3]] #self.closest_rand(self.proportions['date_prop'], num) chosen_date_str = chosen_date.strftime('%Y-%m-%d') if is_fraud == 1: fraud_dates.append(chosen_date_str) - chosen_cat = self.closest_rand(self.proportions['categories_wt'], rand_cat[i]) - chosen_amt = self.sample_amt(chosen_cat) - chosen_daypart = self.closest_rand(self.proportions['shopping_time'], rand_cat[i]) + chosen_cat = self.proportions['categories_wt'][rnds[i, 4]] #self.closest_rand(self.proportions['categories_wt'], rand_cat[i]) + chosen_amt = rnds[i, 6] # self.sample_amt(chosen_cat) + chosen_daypart = self.proportions['shopping_time'][rnds[i, 5]] #self.closest_rand(self.proportions['shopping_time'], ) hr, mn, sec = self.sample_time(chosen_daypart, is_fraud) chosen_date = datetime.combine(chosen_date, time(hour=hr, minute=mn, second=sec)) - epoch = int((chosen_date - epoch_start).total_seconds()) + epoch = int(chosen_date.timestamp()) #int((chosen_date - epoch_start).total_seconds()) output.append([str(trans_num), chosen_date_str, f"{hr:02d}:{mn:02d}:{sec:02d}", str(epoch), str(chosen_cat), str(chosen_amt), str(is_fraud)]) return output, is_traveling, travel_max, fraud_dates From aacb3bf333e817361d8a9ae69b8a9a57cfb0a417 Mon Sep 17 00:00:00 2001 From: Emmanuel Leroy Date: Thu, 9 Jun 2022 12:40:54 -0700 Subject: [PATCH 2/5] parallelize version --- 150k_create_tx_1.sh | 33 ----------- 150k_create_tx_2.sh | 10 ---- README.md | 111 +++++++++++++++++++++++------------ create_all_transactions.sh | 18 ------ create_tx_1.sh | 35 ----------- create_tx_2.sh | 9 --- datagen.py | 98 +++++++++++++++++++++++++++++++ datagen_customer.py | 2 +- datagen_transaction.py | 81 ++++++++++++++----------- generate_customers.bat | 2 - generate_transactions.bat | 14 ----- profile_weights.py | 43 ++++---------- requirements.txt | 2 
+ static_merchant_generator.py | 4 +- 14 files changed, 234 insertions(+), 228 deletions(-) delete mode 100644 150k_create_tx_1.sh delete mode 100644 150k_create_tx_2.sh delete mode 100644 create_all_transactions.sh delete mode 100644 create_tx_1.sh delete mode 100644 create_tx_2.sh create mode 100644 datagen.py delete mode 100644 generate_customers.bat delete mode 100644 generate_transactions.bat create mode 100644 requirements.txt diff --git a/150k_create_tx_1.sh b/150k_create_tx_1.sh deleted file mode 100644 index b651432..0000000 --- a/150k_create_tx_1.sh +++ /dev/null @@ -1,33 +0,0 @@ - - -##Linux / OSX Boxes -#Generate customer file - -#python datagen_customer.py 10000 4444 profiles/main_config.json >> ../data/customers.csv - -#Generate transactions per profile - -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/40_60_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone_A/data_gen/data/40_60_bigger_cities.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/40_60_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone_A/data_gen/data/40_60_smaller_cities.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/all_60_up.json 1-1-2012 12-31-2015 >> /capstone_A/data_gen/data/all_60_up.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/male_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone_A/data_gen/data/male_30_40_bigger_cities.csv & - -##Windows Boxes -#Generate customer file - -#python datagen_customer.py 10 4444 C:\Users\Brandon\git\data_generation\profiles\main_config.json >>C:\Users\Brandon\git\data_generation\data\customers.csv - -#Generate transactions per profile - -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\40_60_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\40_60_bigger_cities.csv -#python 
datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\40_60_smaller_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\40_60_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\all_60_up.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\all_60_up.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\female_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\female_30_40_bigger_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\female_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\female_30_40_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\leftovers.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\leftovers.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\male_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\male_30_40_bigger_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\male_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\male_30_40_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\millenials.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\millenials.csv -#python 
datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\young_adults.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\young_adults.csv -exit 0 - diff --git a/150k_create_tx_2.sh b/150k_create_tx_2.sh deleted file mode 100644 index fe294da..0000000 --- a/150k_create_tx_2.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/leftovers.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/leftovers.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/male_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/male_30_40_smaller_cities.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/millenials.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/millenials.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/young_adults.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/young_adults.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/female_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/female_30_40_bigger_cities.csv& -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/female_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone_B/data_gen/data/female_30_40_smaller_cities.csv & -exit 0 - diff --git a/README.md b/README.md index 9bac182..7cc0955 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,74 @@ -## Generate Fake Credit Card Transaction Data, Including Fraudulent Transactions - -### General Usage -* Create customers data file (see generate_customers.bat for syntax) -* Create transactions, utilizing prior customer file (see various .sh/.bat for syntax) - -This code is heavily modified, but based on original code by [Josh Plotkin](https://github.com/joshplotkin/data_generation). 
Change log of modifications to original code are below. - -### Modifications: - -#### v 0.4 -* Only surface-level changes done in scripts so that simulation can be done using Python3 -* Corrected bat files to generate transactions files. - -#### v 0.3 -* Completely re-worked profiles / segementation of customers -* introduced fraudulent transactions -* introduced fraudulent profiles -* modification of transaction amount generation via Gamma distribution -* added 150k_ shell scripts for multi-threaded data generation (one python process for each segment launched in the background) - -#### v 0.2 -* Added unix time stamp for transactions for easier programamtic evaluation. -* Individual profiles modified so that there is more variation in the data. -* Modified random generation of age/gender. Original code did not appear to work correctly? -* Added batch files for windows users - -#### v 0.1 -* Transaction times are now included instead of just dates -* Profile specific spending windows (AM/PM with weighting of transaction times) -* Merchant names (specific to spending categories) are now included (along with code for generation) -* Travel probability is added, with profile specific options -* Travel max distances is added, per profile option -* Merchant location is randomized based on home location and profile travel probabilities -* Simulated transaction numbers via faker MD5 hash (replacing sequential 0..n numbering) -* Includes credit card number via faker -* improved cross-platform file path compatibility +# Generate Fake Credit Card Transaction Data, Including Fraudulent Transactions + +## General Usage + +In this version, the general usage has changed: + +Please run the datagen script as follows: + +```bash +python datagen.py -n -o +``` + +To see the full list of options, use: + +```bash +python datagen.py -h +``` + +You can pass additional options with the following flags: + +- `-config `: pass the name of the config file, defaults to 
`./profiles/main_config.json` +- `-seed `: pass a seed to the Faker class +- `-c `: pass the path to an already generated customer file +- `-o `: folder to save files into + +This version is modified from the version v0.5 to parallelize the work using `multiprocessing`, so as to take advantage of all available CPUs and bring a huge speed improvement. + +Because of the way it parallelizes the work (chunking transaction generation by chunking the customer list), there will be multiple transaction files generated per profile. Also note that if the number of customers is small, there may be empty files (i.e. files where no customer in the chunk matched the profile). This is expected. + +With standard profiles, it was benchmarked as generating ~95MB/thread/min. With a 64cores/128threads AMD E3, I was able to generate 1.4TB of data, 4.5B transactions, in just under 2h, as opposed to days when running the previous version. + +The generation code is originally based on code by [Josh Plotkin](https://github.com/joshplotkin/data_generation). Change log of modifications to original code is below. + +## Change Log + +### v1.0 + +- Parallelized version, bringing orders of magnitude faster generation depending on the hardware used. + +### v0.5 + +- 12x speed up thanks to some code refactoring. + +### v0.4 + +- Only surface-level changes done in scripts so that simulation can be done using Python3 +- Corrected bat files to generate transactions files. + +### v0.3 + +- Completely re-worked profiles / segmentation of customers +- introduced fraudulent transactions +- introduced fraudulent profiles +- modification of transaction amount generation via Gamma distribution +- added 150k_ shell scripts for multi-threaded data generation (one python process for each segment launched in the background) + +### v0.2 + +- Added unix time stamp for transactions for easier programmatic evaluation. +- Individual profiles modified so that there is more variation in the data. 
+- Modified random generation of age/gender. Original code did not appear to work correctly? +- Added batch files for windows users + +### v0.1 + +- Transaction times are now included instead of just dates +- Profile specific spending windows (AM/PM with weighting of transaction times) +- Merchant names (specific to spending categories) are now included (along with code for generation) +- Travel probability is added, with profile specific options +- Travel max distances is added, per profile option +- Merchant location is randomized based on home location and profile travel probabilities +- Simulated transaction numbers via faker MD5 hash (replacing sequential 0..n numbering) +- Includes credit card number via faker +- improved cross-platform file path compatibility diff --git a/create_all_transactions.sh b/create_all_transactions.sh deleted file mode 100644 index dcf442b..0000000 --- a/create_all_transactions.sh +++ /dev/null @@ -1,18 +0,0 @@ - -python datagen_customer.py 10000 4144 profiles/main_config.json >> /mnt/1k/customers.csv - - - -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_2550_female_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_2550_female_rural.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_2550_female_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_2550_female_urban.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_2550_male_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_2550_male_rural.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_2550_male_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_2550_male_urban.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/young_adults_female_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/young_adults_female_rural.csv & -python 
datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/young_adults_female_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/young_adults_female_urban.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/young_adults_male_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/young_adults_male_rural.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/young_adults_male_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/young_adults_male_urban.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_50up_female_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_50up_female_rural.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_50up_female_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_50up_female_urban.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_50up_male_rural.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_50up_male_rural.csv & -python datagen_transaction.py /mnt/1k/customers.csv /capstone/new_segements_data_gen/profiles/adults_50up_male_urban.json 1-1-2012 12-31-2015 >> /mnt/1k/adults_50up_male_urban.csv & - diff --git a/create_tx_1.sh b/create_tx_1.sh deleted file mode 100644 index e747538..0000000 --- a/create_tx_1.sh +++ /dev/null @@ -1,35 +0,0 @@ - - -##Linux / OSX Boxes -#Generate customer file - -#python datagen_customer.py 10000 4444 profiles/main_config.json >> ../data/customers.csv - -#Generate transactions per profile - -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/40_60_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/40_60_bigger_cities.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/40_60_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/40_60_smaller_cities.csv & -python 
datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/all_60_up.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/all_60_up.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/female_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/female_30_40_bigger_cities.csv& -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/female_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/female_30_40_smaller_cities.csv & - - -##Windows Boxes -#Generate customer file - -#python datagen_customer.py 10 4444 C:\Users\Brandon\git\data_generation\profiles\main_config.json >>C:\Users\Brandon\git\data_generation\data\customers.csv - -#Generate transactions per profile - -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\40_60_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\40_60_bigger_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\40_60_smaller_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\40_60_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\all_60_up.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\all_60_up.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\female_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\female_30_40_bigger_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\female_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> 
C:\Users\Brandon\git\data_generation\data\female_30_40_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\leftovers.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\leftovers.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\male_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\male_30_40_bigger_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\male_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\male_30_40_smaller_cities.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\millenials.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\millenials.csv -#python datagen_transaction.py C:\Users\Brandon\git\data_generation\data\customers.csv C:\Users\Brandon\git\data_generation\profiles\young_adults.json 1-1-2012 12-31-2015 >> C:\Users\Brandon\git\data_generation\data\young_adults.csv -exit 0 - diff --git a/create_tx_2.sh b/create_tx_2.sh deleted file mode 100644 index bf47eaa..0000000 --- a/create_tx_2.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/leftovers.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/leftovers.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/male_30_40_bigger_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/male_30_40_bigger_cities.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/male_30_40_smaller_cities.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/male_30_40_smaller_cities.csv & -python 
datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/millenials.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/millenials.csv & -python datagen_transaction.py /capstone/data_gen/data/customers.csv ./profiles/young_adults.json 1-1-2012 12-31-2015 >> /capstone/data_gen/data/young_adults.csv & -exit 0 - diff --git a/datagen.py b/datagen.py new file mode 100644 index 0000000..7d3b25f --- /dev/null +++ b/datagen.py @@ -0,0 +1,98 @@ +import argparse +import pathlib +import os +import json +from multiprocessing import Pool, cpu_count + +from datagen_customer import main as datagen_customers +from datagen_transaction import main as datagen_transactions +from datagen_transaction import valid_date + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Customer Generator') + parser.add_argument('-n', '--nb_customers', type=int, help='Number of customers to generate', default=10) + parser.add_argument('start_date', type=valid_date, help='Transactions start date') + parser.add_argument('end_date', type=valid_date, help='Transactions start date') + parser.add_argument('-seed', type=int, nargs='?', help='Random generator seed', default=42) + parser.add_argument('-config', type=pathlib.Path, nargs='?', help='Profile config file (typically profiles/main_config.json")', default='./profiles/main_config.json') + parser.add_argument('-c', '--customer_file', type=pathlib.Path, help='Customer file generated with the datagen_customer script', default=None) + parser.add_argument('-o', '--output', type=pathlib.Path, help='Output Folder path', default='data') + + + args = parser.parse_args() + num_cust = args.nb_customers + seed_num = args.seed + config = args.config + out_path = args.output + customer_file = args.customer_file + start_date = args.start_date + end_date = args.end_date + out_path = args.output + customers_out_file = customer_file or os.path.join(out_path, 'customers.csv') + + # create the folder if it does not exist + if not 
os.path.exists(out_path): + os.mkdir(out_path) + + # if no customers file provided, generate a customers file + if customer_file is None and num_cust is not None: + if os.path.exists(customers_out_file): + # prompt user to overwrite + agree = input(f"File {customers_out_file} already exists. Overwrite? (y/N)") + if agree.lower() != 'y': + exit(1) + datagen_customers(num_cust, seed_num, config, customers_out_file) + elif customer_file is None: + print('Either a customer file or a number of customers to create must be provided') + exit(1) + + # if we're supplied with a customer file, we need to figure how many we have + if customer_file is not None: + num_cust = 0 + with open(customer_file, 'r') as f: + for row in f.readlines(): + num_cust += 1 + + # figure out reasonable chunk size + num_cpu = cpu_count() + print(f"Num CPUs: {num_cpu}") + chunk_size = max(min(int(num_cust / 5), 1000), 1000 * int(num_cust / (1000 * num_cpu))) + # because from one profile to another, there may be a 10-50x difference in size, it is best to use small + # chunk sizes so as to spread the work across all CPUs. Bigger chunks means a core may process small profiles + # quickly and then be idle, while other cores process large profiles. 
Smaller chunks will run faster + + # zero padding determination + zero_pad = len(str(num_cust - 1)) + + # read config + with open(config, 'r') as f: + configs = json.load(f) + + profile_names = configs.keys() + + args_array = [] + for profile_file in configs.keys(): + customer_file_offset_start = 0 + customer_file_offset_end = min(num_cust - 1, chunk_size - 1) + while customer_file_offset_start <= max(num_cust - 1, chunk_size): + print(f"profile: {profile_file}, chunk size: {chunk_size}, \ + chunk: {customer_file_offset_start}-{customer_file_offset_end}") + transactions_filename = os.path.join(out_path, + profile_file.replace('.json', + f'_{str(customer_file_offset_start).zfill(zero_pad)}-{str(customer_file_offset_end).zfill(zero_pad)}.csv')) + # Arguments need to be passed as a tuple + args_array.append(( + customers_out_file, + pathlib.Path(os.path.join('profiles', profile_file)), + start_date, + end_date, + transactions_filename, + customer_file_offset_start, + customer_file_offset_end + )) + customer_file_offset_start += chunk_size + customer_file_offset_end = min(num_cust - 1, customer_file_offset_end + chunk_size) + + with Pool() as p: + p.starmap(datagen_transactions, args_array) diff --git a/datagen_customer.py b/datagen_customer.py index 0590158..3982cc0 100644 --- a/datagen_customer.py +++ b/datagen_customer.py @@ -1,6 +1,6 @@ from faker import Faker import sys -from datetime import timedelta, date +from datetime import date import random from main_config import MainConfig import argparse diff --git a/datagen_transaction.py b/datagen_transaction.py index 6a73ce9..3a5a5cb 100644 --- a/datagen_transaction.py +++ b/datagen_transaction.py @@ -11,6 +11,7 @@ from datagen_customer import headers +fake = Faker() transaction_headers = [ 'trans_num', 'trans_date', @@ -35,8 +36,6 @@ merchants[row[0]] = [] merchants[row[0]].append(row[1]) -fake = Faker() - class Customer: def __init__(self, raw): @@ -86,7 +85,7 @@ def valid_date(s): raise 
argparse.ArgumentTypeError(msg) -def main(customer_file, profile_file, start_date, end_date, out_path=None): +def main(customer_file, profile_file, start_date, end_date, out_path=None, start_offset=0, end_offset=sys.maxsize): profile_name = profile_file.name profile_file_fraud = pathlib.Path(*list(profile_file.parts)[:-1] + [f"fraud_{profile_name}"]) @@ -111,38 +110,52 @@ def main(customer_file, profile_file, start_date, end_date, out_path=None): # generate appropriate number of transactions with open(customer_file, 'r') as f: f.readline() - headers.extend(transaction_headers) - print("|".join(headers)) - for row in f.readlines(): - cust = Customer(row) - if cust.attrs['profile'] == profile_name: - is_fraud = 0 - fraud_flag = random.randint(0,100) # set fraud flag here, as we either gen real or fraud, not both for - # the same day. - fraud_dates = [] - # decide if we generate fraud or not - if fraud_flag < 99: #11->25 - fraud_interval = random.randint(1,1) #7->1 - # rand_interval is the random no of days to be added to start date - rand_interval = random.randint(1, inter_val) - #random start date is selected - newstart = start_date + timedelta(days=rand_interval) - # based on the fraud interval , random enddate is selected - newend = newstart + timedelta(days=fraud_interval) - # we assume that the fraud window can be between 1 to 7 days #7->1 - fraud_profile.set_date_range(newstart, newend) - is_fraud = 1 - temp_tx_data = fraud_profile.sample_from(is_fraud) - fraud_dates = temp_tx_data[3] + print("|".join(headers + transaction_headers)) + line_num = 0 + fail = False + # skip lines out of range + while line_num < start_offset: + try: + f.readline() + line_num += 1 + except EOFError: + # end of file? + fail = True + break + if not fail: + for row in f.readlines(): + cust = Customer(row) + if cust.attrs['profile'] == profile_name: + is_fraud = 0 + fraud_flag = random.randint(0,100) # set fraud flag here, as we either gen real or fraud, not both for + # the same day. 
+ fraud_dates = [] + # decide if we generate fraud or not + if fraud_flag < 99: #11->25 + fraud_interval = random.randint(1,1) #7->1 + # rand_interval is the random no of days to be added to start date + rand_interval = random.randint(1, inter_val) + #random start date is selected + newstart = start_date + timedelta(days=rand_interval) + # based on the fraud interval , random enddate is selected + newend = newstart + timedelta(days=fraud_interval) + # we assume that the fraud window can be between 1 to 7 days #7->1 + fraud_profile.set_date_range(newstart, newend) + is_fraud = 1 + temp_tx_data = fraud_profile.sample_from(is_fraud) + fraud_dates = temp_tx_data[3] + cust.print_trans(temp_tx_data, is_fraud, fraud_dates) + + # we're done with fraud (or didn't do it) but still need regular transactions + # we pass through our previously selected fraud dates (if any) to filter them + # out of regular transactions + + is_fraud = 0 + temp_tx_data = profile.sample_from(is_fraud) cust.print_trans(temp_tx_data, is_fraud, fraud_dates) - - # we're done with fraud (or didn't do it) but still need regular transactions - # we pass through our previously selected fraud dates (if any) to filter them - # out of regular transactions - - is_fraud = 0 - temp_tx_data = profile.sample_from(is_fraud) - cust.print_trans(temp_tx_data, is_fraud, fraud_dates) + line_num += 1 + if line_num > end_offset: + break if out_path is not None: sys.stdout = original_sys_stdout diff --git a/generate_customers.bat b/generate_customers.bat deleted file mode 100644 index ec02ba9..0000000 --- a/generate_customers.bat +++ /dev/null @@ -1,2 +0,0 @@ -REM python datagen_customer.py noOfCustomers randomSimulationSeed "configPath" >> "OPcsv" -python datagen_customer.py 1000 4444 ".\profiles\main_config.json" >> ".\data\customers.csv" \ No newline at end of file diff --git a/generate_transactions.bat b/generate_transactions.bat deleted file mode 100644 index 8d81ee3..0000000 --- a/generate_transactions.bat +++ 
/dev/null @@ -1,14 +0,0 @@ -REM python datagen_transaction.py "customersCSV" "profileJSON" simulationStartDate simulationEndDate >> "OPcsv" - -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_2550_female_rural.json" 1-1-2012 12-31-2013 >> ".\data\adults_2550_female_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_2550_female_urban.json" 1-1-2012 12-31-2013 >> ".\data\adults_2550_female_urban.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_2550_male_rural.json" 1-1-2012 12-31-2013 >> ".\data\adults_2550_male_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_2550_male_urban.json" 1-1-2012 12-31-2013 >> ".\data\adults_2550_male_urban.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\young_adults_female_rural.json" 1-1-2012 12-31-2013 >> ".\data\young_adults_female_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\young_adults_female_urban.json" 1-1-2012 12-31-2013 >> ".\data\young_adults_female_urban.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\young_adults_male_rural.json" 1-1-2012 5-31-2012 >> ".\data\young_adults_male_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\young_adults_male_urban.json" 1-1-2012 5-31-2012 >> ".\data\young_adults_male_urban.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_50up_female_rural.json" 1-1-2012 12-31-2013 >> ".\data\adults_50up_female_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_50up_female_urban.json" 1-1-2012 12-31-2013 >> ".\data\adults_50up_female_urban.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_50up_male_rural.json" 1-1-2012 12-31-2013 >> ".\data\adults_50up_male_rural.csv" -python datagen_transaction.py ".\data\customers.csv" ".\profiles\adults_50up_male_urban.json" 1-1-2012 12-31-2013 >> 
".\data\adults_50up_male_urban.csv" \ No newline at end of file diff --git a/profile_weights.py b/profile_weights.py index e9b4178..3e519f5 100644 --- a/profile_weights.py +++ b/profile_weights.py @@ -154,18 +154,6 @@ def make_weights(self): # the cumsum as the key from which to sample self.date_weights() - def closest_rand(self, pro, num): - """ - Assumes lst is sorted. Returns closest value > num. - """ - lst = list(pro.keys()) - pos = bisect_left(lst, num) - if pos == 0: - return pro[lst[0]] - if pos == len(pro): - return pro[lst[-1]] - return pro[lst[pos]] - def pre_compute_amt_specs(self): amt_specs = {} for category in self.profile['categories_amt'].keys(): @@ -175,15 +163,6 @@ def pre_compute_amt_specs(self): } return amt_specs - def sample_amt(self, category): - shape = self.profile['categories_amt'][category]['mean']**2 / self.profile['categories_amt'][category]['stdev']**2 - scale = self.profile['categories_amt'][category]['stdev']**2 / self.profile['categories_amt'][category]['mean'] - amt = np.random.gamma(shape, scale, 1)[0] - #seeing lots of <$1.00 charges, hacky fix even though it breaks the gamma distribution - if amt < 1: - amt = np.random.uniform(1.00, 10.00) - return str("{:.2f}".format(amt)) - def sample_time(self, am_or_pm, is_fraud): if am_or_pm == 'AM': @@ -241,8 +220,6 @@ def sample_from(self, is_fraud): is_traveling = False output = [] - rand_date = np.random.random(num_trans) - rand_cat = np.random.random(num_trans) # get an 2d array of random numbers + empty columns for mapping rnds = self.get_rand_2d(num_trans, 2, 4) @@ -262,25 +239,25 @@ def sample_from(self, is_fraud): shape = cat_specs['shape'] scale = cat_specs['scale'] rnd_amts = np.random.gamma(shape, scale, counts[i]) - rnds[offset: offset + counts[i], 6] = rnd_amts + # as in previous version, when transactions are under $1, use uniform 1-10 range + rnd_amts_lower = np.random.uniform(1.00, 10.00, counts[i]) + rnds[offset: offset + counts[i], 6] = np.where(rnd_amts < 1, 
rnd_amts_lower, rnd_amts) offset += counts[i] - # re-sort by index (see if needed) - # rnds = rnds[rnds[:,0].argsort()] - fraud_dates = [] - for i in range(num_trans): # , num in enumerate(rand_date): + # now loop through and pick from random array + for i in range(num_trans): trans_num = self.fake.md5(raw_output=False) - chosen_date = self.proportions['date_prop'][rnds[i, 3]] #self.closest_rand(self.proportions['date_prop'], num) + chosen_date = self.proportions['date_prop'][rnds[i, 3]] chosen_date_str = chosen_date.strftime('%Y-%m-%d') if is_fraud == 1: fraud_dates.append(chosen_date_str) - chosen_cat = self.proportions['categories_wt'][rnds[i, 4]] #self.closest_rand(self.proportions['categories_wt'], rand_cat[i]) - chosen_amt = rnds[i, 6] # self.sample_amt(chosen_cat) - chosen_daypart = self.proportions['shopping_time'][rnds[i, 5]] #self.closest_rand(self.proportions['shopping_time'], ) + chosen_cat = self.proportions['categories_wt'][rnds[i, 4]] + chosen_amt = "{:.2f}".format(rnds[i, 6]) + chosen_daypart = self.proportions['shopping_time'][rnds[i, 5]] hr, mn, sec = self.sample_time(chosen_daypart, is_fraud) chosen_date = datetime.combine(chosen_date, time(hour=hr, minute=mn, second=sec)) - epoch = int(chosen_date.timestamp()) #int((chosen_date - epoch_start).total_seconds()) + epoch = int(chosen_date.timestamp()) output.append([str(trans_num), chosen_date_str, f"{hr:02d}:{mn:02d}:{sec:02d}", str(epoch), str(chosen_cat), str(chosen_amt), str(is_fraud)]) return output, is_traveling, travel_max, fraud_dates diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..12c53bb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +Faker==13.12.0 +numpy==1.22.3 \ No newline at end of file diff --git a/static_merchant_generator.py b/static_merchant_generator.py index 4f57a2f..b97adfd 100644 --- a/static_merchant_generator.py +++ b/static_merchant_generator.py @@ -26,8 +26,8 @@ "kids_pets", "personal_care", "travel"] -print header +print(header) 
for c in category_list: for _ in range(0, n): - print c + "|" + 'fraud_' + fake.company() + print(f"{c}|{fake.company()}") From 5bdcf65ef771dbda8170b6506f0248766593d0e2 Mon Sep 17 00:00:00 2001 From: Emmanuel Leroy Date: Thu, 9 Jun 2022 15:53:39 -0700 Subject: [PATCH 3/5] Create LICENSE.md --- LICENSE.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE.md diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..7d33b08 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016-2022 Brandon Harris + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
From 93a9084f18751d9c6bd4816fd4e36509665fb448 Mon Sep 17 00:00:00 2001 From: Emmanuel Leroy Date: Thu, 9 Jun 2022 16:11:28 -0700 Subject: [PATCH 4/5] ensure we create the full folder path --- datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datagen.py b/datagen.py index 7d3b25f..3a4734c 100644 --- a/datagen.py +++ b/datagen.py @@ -33,7 +33,7 @@ # create the folder if it does not exist if not os.path.exists(out_path): - os.mkdir(out_path) + os.makedirs(out_path) # if no customers file provided, generate a customers file if customer_file is None and num_cust is not None: From 8cc7484a4eb5f9a82781c74b2a2c0447b527d667 Mon Sep 17 00:00:00 2001 From: Emmanuel Leroy Date: Fri, 10 Jun 2022 08:45:13 -0700 Subject: [PATCH 5/5] update README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7cc0955..ce0417c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Generate Fake Credit Card Transaction Data, Including Fraudulent Transactions +Note: Version v1.0 behavior has changed in such a way that it runs much faster; however, transaction files are chunked, so that several files get generated per profile. If your downstream process expects 1 file per profile, please check out the v0.5 release branch `release/v0.5`. + ## General Usage In this version, the general usage has changed: @@ -27,7 +29,7 @@ This version is modified from the version v0.5 to parallelize the work using `mu Because of the way it parallelize the work (chunking transaction generation by chunking the customer list), there will be multiple transaction files generated per profile. Also not that if the number of customers is small, there may be empty files (i.e. files where no customer in the chunk matched the profile). This is expected. -With standard profiles, it was benchmarked as generating ~95MB/thread/min.
With a 64cores/128threads AMD E3, I was able to generate 1.4TB of data, 4.5B transactions, in just under 2h, as opposed to days when running the previous version. +With standard profiles, it was benchmarked as generating ~95MB/thread/min. With a 64 cores/128 threads AMD E3, I was able to generate 1.4TB of data, 4.5B transactions, in just under 2h, as opposed to days when running the previous versions. The generation code is originally based on code by [Josh Plotkin](https://github.com/joshplotkin/data_generation). Change log of modifications to original code are below.