-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_training_dataset.py
34 lines (27 loc) · 1.04 KB
/
generate_training_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from multiprocessing import Pool, cpu_count
import pandas as pd
from db.interface import open_connection, close_connection
import db.match_table as mt
from services import data_provider
from logger import logging
# Connection returned by db.interface.open_connection(), opened as a side
# effect of importing this module. get_feature_vector_for_match() and run()
# pass it to every query they issue; the __main__ guard closes it after run().
# NOTE(review): get_feature_vector_for_match() executes inside multiprocessing
# workers. Whether each worker ends up with its own connection or a copy of
# this one depends on the process start method -- confirm the underlying
# driver tolerates that.
conn = open_connection()
def get_feature_vector_for_match(match):
    """Return the feature vector for a single match record.

    The match is looked up through ``mt.get_match`` using ``match["id"]``.
    When the lookup yields no rows a warning is logged and an empty dict is
    returned. Otherwise the first row is converted to a dict and combined
    with the output of ``data_provider.create_match_feature_vector``; on key
    collisions the values from the data provider win.
    """
    rows = mt.get_match(match["id"], conn=conn)

    # Guard clause: nothing stored for this id.
    if not rows.shape[0]:
        logging.warning(f"No match data for match id {match['id']}")
        return {}

    record = rows.iloc[0, :].to_dict()
    # Copy the raw row first, then overlay the derived features, so the
    # merge order matches a ``{**record, **derived}`` display.
    features = dict(record)
    features.update(data_provider.create_match_feature_vector(record, conn=conn))
    return features
def run():
    """Build the training dataset and write it to ``new_master_data.csv``.

    Fetches matches with ``mt.get_matches_for_division(1, "2006-01-01")``,
    computes one feature vector per match in a pool of ``cpu_count() * 3``
    worker processes, and saves the collected vectors as a CSV file in the
    current working directory.

    The pool is managed by a ``with`` block so the worker processes are torn
    down even when the query or the mapping step raises.
    """
    with Pool(cpu_count() * 3) as pool:
        match_df = mt.get_matches_for_division(1, "2006-01-01", conn=conn)
        # to_dict('index') maps each index label to its row dict; only the
        # row dicts are needed as work items.
        matches = list(match_df.to_dict('index').values())
        # map() blocks until every result is available, so leaving the
        # ``with`` block (which terminates the workers) loses no work.
        match_features = pool.map(get_feature_vector_for_match, matches)

    df = pd.DataFrame(match_features)
    df.to_csv("new_master_data.csv")
if __name__ == "__main__":
    # Script entry point. The module-level connection is closed in a
    # ``finally`` clause so it is released even when run() raises.
    try:
        run()
    finally:
        close_connection(conn)