-
Notifications
You must be signed in to change notification settings - Fork 0
/
datagen.py
84 lines (75 loc) · 2.94 KB
/
datagen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import datetime, csv, sys
import faker
from random import randint
import multiprocess
import time
from typing import *
class DataGen(object):
    """
    A simple wrapper for the faker and random modules that generates fake
    campaign rows in 5 parallel worker processes and appends them to a CSV file.

    Any non-negative row count is accepted: the requested total is split
    across the workers, and each worker emits its share in batches of at most
    ``CHUNK_SIZE`` rows, so no rows are lost to rounding.
    """

    WORKERS = 5        # number of generator processes started by generate()
    CHUNK_SIZE = 1000  # largest batch of rows a worker puts on the queue at once

    def __init__(self, filename: str, rownum: int) -> None:
        """Set up the generator.

        Args:
            filename: Output path without extension; ".csv" is appended.
            rownum: Total number of rows to generate across all workers.

        Raises:
            ValueError: If ``rownum`` is negative.
        """
        total = int(rownum)
        if total < 0:
            raise ValueError("rownum must be non-negative")
        self.faker = faker.Faker()
        self.__filename__ = filename + ".csv"
        self.total_rows = total
        # Base share of rows per worker; generate() hands out the remainder.
        self.rownum = total // self.WORKERS
        # Batch size used by gen(); never 0 so batching always makes progress.
        self.chunks = max(1, min(self.rownum, self.CHUNK_SIZE))
        # Batches per worker for the base share (ceiling division).
        self.iters = max(1, -(-self.rownum // self.chunks))
        # Date range is optional; rows carry a "date" column only once
        # dates() has been called.
        self.start: Optional[datetime.datetime] = None
        self.end: Optional[datetime.datetime] = None

    def dates(self, start: str, end: str) -> None:
        """To generate dates, provide a start and end date in the form of YYYY-MM-DD.

        Raises:
            ValueError: If either string does not match ``%Y-%m-%d``.
        """
        self.start = datetime.datetime.strptime(start, '%Y-%m-%d')
        self.end = datetime.datetime.strptime(end, '%Y-%m-%d')

    def gen(self, _queue: Any, rows: Optional[int] = None) -> None:
        """Generate fake rows and put them on ``_queue`` as lists of dicts.

        Args:
            _queue: Object with a ``put`` method that receives each batch.
            rows: Number of rows to produce; defaults to ``self.rownum``
                (the per-worker base share).
        """
        # NOTE(review): every worker uses the Faker instance built in the
        # parent process; whether workers draw distinct random streams
        # depends on the process start method and faker's seeding -- confirm.
        remaining = int(self.rownum if rows is None else rows)
        batch_limit = max(1, int(self.chunks))
        with_dates = bool(self.start and self.end)
        while remaining > 0:
            size = min(batch_limit, remaining)
            output = []
            for _ in range(size):
                obj = {}
                if with_dates:
                    obj["date"] = self.faker.date_between_dates(self.start, self.end)
                obj["campaign_name"] = self.faker.catch_phrase()
                obj["location"] = self.faker.simple_profile()["address"]
                obj["page_name"] = self.faker.uri()
                obj["clicks"] = randint(1, 1000)
                obj["impressions"] = randint(1000, 100000)
                output.append(obj)
            _queue.put(output)
            remaining -= size

    def write(self, _queue: Any, _stop_token: str) -> None:
        """Drain ``_queue`` into the CSV file until ``_stop_token`` arrives.

        The file is opened in append mode, so an existing file is extended
        and a header row is written before the first batch of this call.

        Args:
            _queue: Object with a ``get`` method yielding lists of row dicts.
            _stop_token: Sentinel string that ends the loop.
        """
        self.i = 0  # becomes 1 once the header row has been written
        # newline='' lets the csv module control line endings itself.
        with open(self.__filename__, 'a', newline='') as f:
            writer = None
            while True:
                obj = _queue.get()
                if isinstance(obj, str) and obj == _stop_token:
                    return
                # Skip anything that is not a non-empty batch: the column
                # names are taken from the first row of the first batch.
                if not isinstance(obj, list) or not obj:
                    continue
                if writer is None:
                    writer = csv.DictWriter(f, obj[0].keys())
                if self.i == 0:
                    writer.writeheader()
                    self.i += 1
                writer.writerows(obj)

    def generate(self) -> None:
        """Run the worker processes and the writer process, then report elapsed time."""
        print("Beginning data generation...")
        start_seconds = time.time()
        queue = multiprocess.Queue()
        w = multiprocess.Process(target=self.write, args=(queue, "STOP"))
        # Start the consumer first so batches are drained as they are produced.
        w.start()
        base, extra = divmod(self.total_rows, self.WORKERS)
        jobs = []
        for index in range(self.WORKERS):
            # The first `extra` workers take one additional row each so the
            # shares add up to exactly total_rows.
            share = base + (1 if index < extra else 0)
            p = multiprocess.Process(target=self.gen, args=(queue, share))
            jobs.append(p)
            p.start()
        for job in jobs:
            job.join()
        # All producers are done; tell the writer to finish.
        queue.put("STOP")
        w.join()
        elapsed_time = (time.time() - start_seconds) / 60
        print("Generation completed. Elapsed time: ", "{0:.2f}".format(elapsed_time), " minutes")
if __name__ == "__main__":
    # Demo run: write 100,000 fake rows to test100k.csv, with dates drawn
    # from the range 2016-01-01 to 2016-11-30.
    generator = DataGen(filename="test100k", rownum=100000)
    generator.dates(start="2016-01-01", end="2016-11-30")
    generator.generate()