# featureengineer.py
from itertools import chain
import pandas as pd
import numpy as np
import datetime
import time
import os
from pathlib import Path
from alpha_vantage.timeseries import TimeSeries
import sys
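# Historical S&P 500 membership table (fja05680/sp500 on GitHub): each row is a
# snapshot date whose "tickers" column lists that date's constituents as a
# comma-separated string; sorted newest-first below.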
df_constituents = pd.read_csv(
    "https://raw.githubusercontent.com/fja05680/sp500/master/S%26P%20500%20Historical%20Components%20%26%20Changes(01-21-2021).csv",
    parse_dates=True,
    index_col=0,
).sort_index(ascending=False)
ALPHA_VANTAGE_DIR_PATH = Path("alphadata").absolute()
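# The loop further below expects one cached CSV per ticker in alphadata/, with
# daily rows and "Open"/"High"/"Low"/"Close"/"Volume" columns. The otherwise
# unused TimeSeries import suggests the cache was built with the alpha_vantage
# client; a rough, commented-out sketch of that step follows. The API key
# variable, the column renaming, and sorting oldest-first are assumptions, not
# something this script does.
#
#   ts = TimeSeries(key=os.environ["ALPHAVANTAGE_API_KEY"], output_format="pandas")
#   data, _ = ts.get_daily(symbol=t, outputsize="full")
#   data.columns = ["Open", "High", "Low", "Close", "Volume"]  # drop "1. open" style names
#   data.sort_index().to_csv(ALPHA_VANTAGE_DIR_PATH / f"{t}.csv")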
def generate_monthly_stats(df):
    """Collapse one month of daily OHLCV rows into a single feature row."""
    log_return = df["Close"].apply(np.log).diff()
    half_way_point = len(df) // 2
    # Return a Series (not a plain dict) so that the groupby().apply() below
    # produces a DataFrame with one column per feature.
    return pd.Series({
        "Open": df["Open"].iloc[0],
        "High": df["High"].max(),
        "Low": df["Low"].min(),
        "Close": df["Close"].iloc[-1],
        "Volume": df["Volume"].sum(),
        "first_half_log_return_mean": log_return.iloc[:half_way_point].mean(),
        "first_half_log_return_std": log_return.iloc[:half_way_point].std(),
        "second_half_log_return_mean": log_return.iloc[half_way_point:].mean(),
        "second_half_log_return_std": log_return.iloc[half_way_point:].std(),
        "first_second_half_log_return_diff": (
            log_return.iloc[half_way_point:].sum()
            - log_return.iloc[:half_way_point].sum()
        ),
        "log_return_mean": log_return.mean(),
        "log_return_std": log_return.std(),
        "log_return_min": log_return.min(),
        "log_return_max": log_return.max(),
        "month_log_return": np.log(df["Close"].iloc[-1] / df["Open"].iloc[0]),
        "pct_bull": (log_return > 0).mean(),
    })
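# Quick, commented-out sanity check (an assumed usage example, not part of the
# pipeline): build a tiny synthetic daily frame and confirm that the monthly
# grouping yields one feature row per month.
#
#   _idx = pd.date_range("2020-01-01", periods=40, freq="B")
#   _px = pd.Series(np.linspace(100.0, 110.0, len(_idx)), index=_idx)
#   _demo = pd.DataFrame({"Open": _px, "High": _px, "Low": _px, "Close": _px, "Volume": 1e6})
#   print(_demo.groupby(pd.Grouper(freq="1M")).apply(generate_monthly_stats))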
tickers = set(",".join(df_constituents.tickers.values).split(","))
slippage = .005 # 0.5% slippage per trade
dict_dfs = dict()
for t in tickers:
    # skip constituents with no cached Alpha Vantage file
    if not (ALPHA_VANTAGE_DIR_PATH / f"{t}.csv").is_file():
        continue
    temp = (
        pd.read_csv(ALPHA_VANTAGE_DIR_PATH / f"{t}.csv", index_col=0, parse_dates=True)
        .groupby(pd.Grouper(freq="1M"))
        .apply(generate_monthly_stats)
    )
    print(temp)
    # Target column: next month's log return, net of slippage paid on both the
    # entry (buy at 1 + slippage) and the exit (sell at 1 - slippage).
    temp["next_month_log_return"] = np.log(
        np.exp(temp["month_log_return"].shift(-1)) * (1 - slippage) / (1 + slippage)
    )
    dict_dfs[t] = temp
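# A likely next step (assumed; not shown in this file) is to stack the per-ticker
# frames into one (ticker, month)-indexed feature table and drop each ticker's
# final month, whose next_month_log_return target is NaN:
#
#   df_features = pd.concat(dict_dfs, names=["ticker"]).dropna(
#       subset=["next_month_log_return"]
#   )
#   df_features.to_csv("monthly_features.csv")  # hypothetical output path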