# Generate_Fake_JSON.py
# Generates batches of fake user-location JSON records (random user id,
# longitude/latitude inside a Taiwan bounding box, timestamp) and saves a
# bar chart of how many records each batch contains.
import json
import uuid
import time
import datetime
import os
import numpy as np
import matplotlib.pyplot as plt
# ----------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------
# Bounding box (decimal degrees) that generated coordinates are drawn from.
tw_boundary = dict(
    min_lat=20.6833,
    max_lat=25.3005,
    min_lon=118.0669,
    max_lon=122.0201,
)

# Root folder that receives one sub-folder of JSON files per batch.
data_path = r"D:\Fake_Data\Data"

# Number of batches produced by one run.
BATCH_MAX_NUM = 10

# Bounds for the number of records in a batch. Both are floats and are passed
# to np.random.randint as (low, high) by batch_generate.
DATA_MIN = 10_000.0
DATA_MAX = 5 * DATA_MIN
# ----------------------------------------------------------------
def progress_bar(width: int, iters: int):
    """Redraw a one-line text progress bar on stdout.

    Args:
        width: Total number of steps; also the number of cells in the bar.
        iters: Number of completed steps, drawn as ``#`` cells. The remaining
            ``width - iters`` cells are drawn as ``-``.

    The line ends with a carriage return instead of a newline, so the next
    call overwrites it in place. Nothing is printed when ``width`` is not
    positive.
    """
    if width <= 0:
        # Nothing to draw, and 100 / width would be undefined.
        return

    filled = "#" * iters
    remaining = "-" * (width - iters)
    percent = round(iters * (100 / width))

    # The bar depends only on (width, iters), so it is emitted exactly once.
    # Flush explicitly: no newline is written to trigger line-buffered output.
    print(f"[{filled}{remaining}] - {percent}%\r", end="", flush=True)
# The annotation is quoted so it is also valid on interpreters older than 3.9.
def generate_coordinates() -> "tuple[float, float]":
    """Draw a random point inside the ``tw_boundary`` bounding box.

    Returns:
        A ``(longitude, latitude)`` pair. Each value is sampled uniformly and
        independently between the matching min/max entries of ``tw_boundary``.
        Note the order: longitude first, latitude second.
    """
    # Longitude is drawn before latitude; keeping that order means a seeded
    # run produces the same sequence of points as before.
    lon = np.random.uniform(tw_boundary["min_lon"], tw_boundary["max_lon"])
    lat = np.random.uniform(tw_boundary["min_lat"], tw_boundary["max_lat"])
    return lon, lat
# TODO: A real user can show up at several different times, so ids should
# sometimes repeat; right now every record gets a brand-new id.
def generate_userid() -> str:
    """Return a newly generated random (version 4) UUID in string form."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
def generate_time(current_batch : int) -> str:
    """Build the timestamp string for a record in the given batch.

    Batches are spaced 5 seconds apart starting from 2023-04-01 10:10:00, and
    each call adds a random jitter of 0-4 whole seconds on top of that.

    Args:
        current_batch: Batch index; the batch starts ``current_batch * 5``
            seconds after the base time.

    Returns:
        The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    base = datetime.datetime(2023, 4, 1, 10, 10, 0)

    # randint's upper bound is exclusive, so the jitter is 0..4 seconds.
    jitter = np.random.randint(0, 5)

    # Let timedelta do the carrying: the previous hand-rolled minute/hour/day
    # arithmetic never rolled over into the next month, so large batch numbers
    # produced an invalid day and raised ValueError.
    # int() is needed because timedelta rejects NumPy integer scalars.
    offset = datetime.timedelta(seconds=int(current_batch) * 5 + int(jitter))

    return (base + offset).strftime("%Y-%m-%d %H:%M:%S")
def save_data(user_id, lon, lat, time, folder_path):
    """Write one record as a JSON file inside ``folder_path``.

    Args:
        user_id: Identifier stored under ``"user_id"`` and used in the file name.
        lon: Value stored under ``"longitude"``.
        lat: Value stored under ``"latitude"``.
        time: Timestamp string stored under ``"time"``; also embedded in the
            file name with ``:`` replaced by ``-`` and spaces by ``_``.
        folder_path: Directory the file is created in.

    The file is named ``data_<time>_<user_id>.json``.
    """
    # ":" and " " are swapped out so the timestamp can be part of a file name.
    stamp_for_name = time.translate(str.maketrans({":": "-", " ": "_"}))
    target = os.path.join(folder_path, f"data_{stamp_for_name}_{user_id}.json")

    record = dict(user_id=user_id, longitude=lon, latitude=lat, time=time)
    with open(target, "w") as out:
        json.dump(record, out)
def batch_generate(batch_num : int) -> int:
    """Generate one batch of fake records and write each to its own JSON file.

    The files go into ``<data_path>/batch_<batch_num>``, which is created if
    it does not exist yet.

    Args:
        batch_num: Batch index; names the output folder and is passed to
            ``generate_time`` to place the batch in time.

    Returns:
        The number of records written, used by the caller for the plot.
    """
    # Per-batch output folder.
    target_dir = os.path.join(data_path, f"batch_{batch_num}")
    os.makedirs(target_dir, exist_ok=True)

    # Batch size is random; randint's upper bound (DATA_MAX) is exclusive.
    record_count = np.random.randint(DATA_MIN, DATA_MAX)

    # Build and store one record at a time, so nothing accumulates in memory.
    for _ in range(record_count):
        longitude, latitude = generate_coordinates()
        uid = generate_userid()
        timestamp = generate_time(current_batch=batch_num)
        save_data(uid, longitude, latitude, timestamp, target_dir)

    return record_count
def draw_data(data:list) -> None:
    """Save a bar chart of the record count per batch.

    Args:
        data: One count per batch, in batch order. Bar ``i`` shows ``data[i]``.

    The chart is written to ``Fake_Data_Distribution.png`` in the current
    working directory (not the data folder, which is inconvenient to open).
    """
    # Use a dedicated figure rather than pyplot's implicit global one, so
    # repeated calls do not draw on top of an earlier chart.
    fig, ax = plt.subplots()
    try:
        ax.set_title("Fake Data Distribution")
        ax.set_xlabel("Batch")
        ax.set_ylabel("Data")
        ax.bar(range(len(data)), data)
        fig.tight_layout()
        fig.savefig("Fake_Data_Distribution.png", dpi=300)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
def main():
    """Generate every batch, show progress, plot batch sizes and report timing."""
    print("START GENERATING FAKE DATA....")
    started = time.time()

    # Record count of each batch, in batch order, for the final chart.
    batch_sizes = []
    for batch_num in range(1, BATCH_MAX_NUM + 1):
        batch_sizes.append(batch_generate(batch_num))
        # Redraw the progress bar after each finished batch.
        progress_bar(BATCH_MAX_NUM, batch_num)

    # The progress bar ends with a carriage return; move on to a new line.
    print("")

    draw_data(batch_sizes)

    # The reported time includes drawing the chart.
    elapsed = time.time() - started
    print(f"FINISH GENERATING FAKE DATA in {elapsed:.2f} sec")
# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()