-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathindex.py
206 lines (158 loc) · 5.96 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import importlib
import re
from typing import Optional, Tuple
import bs4
import pandas as pd
import requests
from stock_market.data.constants import (
PERFORMANCE_PERIODIC,
PERFORMANCE_TOP_STOCKS,
PERFORMERS_BOTTOM_STOCKS,
SP500_URL,
)
# Market indexes accepted by IndexView; the requested name is upper-cased
# before being checked against this list.
AVAILABLE_INDEX = ["SP500"]
class IndexView:
    """
    Analysis on market indexes.

    Parameters
    ----------
    index: str
        The market index of interest for analysis. It is upper-cased and must
        then be one of the indexes in AVAILABLE_INDEX.

    Raises
    ------
    Warning
        If the requested index is not in AVAILABLE_INDEX.
    """

    def __init__(self, index: str):
        # Check availability of index
        index_name = index.upper()
        if index_name not in AVAILABLE_INDEX:
            raise Warning(
                f"Please select from the available indexes: {AVAILABLE_INDEX}"
            )

        # Column name constants
        self._column_names = {
            "ticker_symbol": "Ticker",
            "ticker_full": "Name",
            "ticker_sector": "Sector",
        }

        # The index data is looked up as an attribute of stock_market.data
        # carrying the (upper-cased) index name.
        self.data = getattr(importlib.import_module("stock_market.data"), index_name)

        # Unique sectors, de-duplicated in order of first appearance so the
        # list is the same on every run (a plain set() has no stable order).
        sector_column = self.data[self._column_names["ticker_sector"]]
        self.sector_list = list(dict.fromkeys(sector_column))

        # Cache backing the summary_* properties
        self._summary = {}

    def _load_scrape(self) -> None:
        """
        Run the scraper once and cache all three of its metrics in ``_summary``.

        Raises
        ------
        RuntimeError
            If the scraper returned None instead of the metric mapping.
        """
        index_scrape = _sp500()
        if index_scrape is None:
            # Without this guard the subscripts below fail with an opaque
            # "'NoneType' object is not subscriptable".
            raise RuntimeError(
                "Scraping the index performance data returned no result."
            )
        self._summary["performance"] = index_scrape[PERFORMANCE_PERIODIC]
        self._summary["top_stocks"] = index_scrape[PERFORMANCE_TOP_STOCKS]
        self._summary["bottom_stocks"] = index_scrape[PERFORMERS_BOTTOM_STOCKS]

    @property
    def summary_sector_view(self) -> pd.DataFrame:
        """
        Summary of number of stocks by sector.

        The counts are computed on first access and cached; later accesses
        rebuild the DataFrame from the cached counts. The result has one
        column, ``sector_count``, indexed by sector.
        """
        if "sector_view" not in self._summary:
            ticker_symbol = self._column_names["ticker_symbol"]
            ticker_sector = self._column_names["ticker_sector"]
            # Number of (non-null) ticker symbols per sector
            counts = self.data.groupby(ticker_sector)[ticker_symbol].count().to_dict()
            self._summary["sector_view"] = {"sector_count": counts}

        # Populate sector count in pandas form
        return pd.DataFrame.from_dict(self._summary["sector_view"])

    @property
    def summary_performance(self) -> pd.DataFrame:
        """
        High level summary of index's periodic performance.

        Triggers the scrape on first access; the scrape also fills the cache
        used by ``summary_stocks_today``. The result has one column,
        ``periodic_performance``.
        """
        if "performance" not in self._summary:
            # One scrape extracts all metrics in one go
            self._load_scrape()

        # TODO: Properly sort the periodic time periods
        return pd.DataFrame.from_dict(
            {"periodic_performance": self._summary["performance"]}
        )

    @property
    def summary_stocks_today(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Summary of today's top and bottom stock performances.

        Returns the cached ``(top_stocks, bottom_stocks)`` pair, scraping first
        if either entry is missing from the cache.
        """
        if "top_stocks" not in self._summary or "bottom_stocks" not in self._summary:
            # One scrape extracts all metrics in one go
            self._load_scrape()

        return self._summary["top_stocks"], self._summary["bottom_stocks"]
# Scraper for sp500
def _sp500(timeout: float = 30.0) -> Optional[dict]:
    """
    Scraping SP500 information from MarketWatch. (link in constants folder in data directory)

    The page at SP500_URL is downloaded and parsed once, then searched for one
    table container per metric.

    Parameters
    ----------
    timeout: float
        Seconds to wait for the HTTP response; without a timeout the request
        could block indefinitely.

    Returns
    -------
    dict or None
        A dict keyed by PERFORMANCE_PERIODIC (label text -> value text),
        PERFORMANCE_TOP_STOCKS and PERFORMERS_BOTTOM_STOCKS (each the result of
        _stock_performers_ws). None, after printing a notice, when the
        container for any metric is missing or empty.
    """
    metrics = (
        PERFORMANCE_PERIODIC,
        PERFORMANCE_TOP_STOCKS,
        PERFORMERS_BOTTOM_STOCKS,
    )

    # Fetch and parse the page a single time; all metrics are searched in the
    # same document, so there is no need for one request per metric.
    page = bs4.BeautifulSoup(
        requests.get(SP500_URL, timeout=timeout).content, "html.parser"
    )

    # Search and store the container for each metric
    ws_dict = {}
    for metric in metrics:
        # Regex search on the container's class attribute
        regex = re.compile(f"element element--table ({metric})")
        ws_metric = page.find("div", {"class": regex})

        # find() returns None when nothing matches, so test for that before
        # asking for a length; an empty container is equally unusable.
        if ws_metric is None or len(ws_metric) == 0:
            print(f"The web-scrape metric name seems to be changed for {metric}.")
            return None
        ws_dict[metric] = ws_metric

    # 1) Performance per periods: cells alternate label, value. Pairing by
    #    slices ignores a trailing unpaired cell instead of raising IndexError.
    cells = ws_dict[PERFORMANCE_PERIODIC].find_all("td")
    periodic = {
        label.text.replace("\n", ""): value.text.replace("\n", "")
        for label, value in zip(cells[::2], cells[1::2])
    }

    return {
        PERFORMANCE_PERIODIC: periodic,
        # 2) Top performing stocks today
        PERFORMANCE_TOP_STOCKS: _stock_performers_ws(
            data=ws_dict[PERFORMANCE_TOP_STOCKS]
        ),
        # 3) Bottom performing stocks today
        PERFORMERS_BOTTOM_STOCKS: _stock_performers_ws(
            data=ws_dict[PERFORMERS_BOTTOM_STOCKS]
        ),
    }
# Helper function for _sp500()
def _stock_performers_ws(
    data: bs4.element.Tag,
) -> Optional[pd.DataFrame]:
    """
    Build a table of stock performers from a scraped MarketWatch element.

    Every ``tr`` found under ``data`` is split on newlines with empty pieces
    dropped. The first row supplies the column names and the remaining rows
    supply the records. Returns None when no ``tr`` is found.
    """
    table_rows = data.find_all("tr")
    if not table_rows:
        return None

    def _pieces(row):
        # Non-empty newline-separated fragments of the row's text
        return [piece for piece in row.text.split("\n") if piece]

    # First row is the header, everything after it is stock data
    header_row, *body_rows = table_rows
    columns = _pieces(header_row)
    records = [_pieces(row) for row in body_rows]

    return pd.DataFrame(records, columns=columns)