Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scrape of ETF top/major holdings #1872

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions yfinance/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from .scrapers.analysis import Analysis
from .scrapers.fundamentals import Fundamentals
from .scrapers.holders import Holders
from .scrapers.holdings import Holdings
from .scrapers.quote import Quote, FastInfo
from .scrapers.history import PriceHistory

Expand Down Expand Up @@ -68,6 +69,7 @@ def __init__(self, ticker, session=None, proxy=None):
self._price_history = None # lazy-load
self._analysis = Analysis(self._data, self.ticker)
self._holders = Holders(self._data, self.ticker)
self._holdings = Holdings(self._data, self.ticker)
self._quote = Quote(self._data, self.ticker)
self._fundamentals = Fundamentals(self._data, self.ticker)

Expand Down Expand Up @@ -194,6 +196,13 @@ def get_mutualfund_holders(self, proxy=None, as_dict=False):
return data.to_dict()
return data

def get_major_holdings(self, proxy=None, as_dict=False):
    """Return the holdings tables scraped for this ticker.

    Args:
        proxy: Proxy to use for the request. Falls back to the proxy
            configured on this object when omitted, matching the other
            ``get_*`` accessors.
        as_dict: When true, convert the result with ``to_dict()`` instead
            of returning the pandas object(s).

    Returns:
        Whatever ``self._holdings.major`` yields, optionally converted to
        plain dictionaries.
    """
    # Honour the instance-level proxy when the caller does not override it.
    self._holdings.proxy = proxy or self.proxy
    data = self._holdings.major
    if not as_dict:
        return data
    if isinstance(data, dict):
        # The scraper groups tables by heading; a plain dict has no
        # ``to_dict`` method, so convert each table individually.
        return {name: table.to_dict() for name, table in data.items()}
    return data.to_dict()

def get_insider_purchases(self, proxy=None, as_dict=False):
self._holders.proxy = proxy or self.proxy
data = self._holders.insider_purchases
Expand Down
1 change: 1 addition & 0 deletions yfinance/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,5 @@
"sectorTrend",
"recommendationTrend",
"futuresChain",
"topHoldings"
)
74 changes: 74 additions & 0 deletions yfinance/scrapers/holdings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pandas as pd
from bs4 import BeautifulSoup

from yfinance.data import YfData

class Holdings:
    """Scrape and cache the tables found on a symbol's ``/holdings`` page.

    The page is fetched lazily on first access to :attr:`major` and the
    parsed result is kept on the instance, so later accesses do not trigger
    another request.
    """

    _SCRAPE_URL_ = 'https://finance.yahoo.com/quote'

    def __init__(self, data: "YfData", symbol: str, proxy=None):
        """Store the data fetcher, the symbol to look up and an optional proxy."""
        self._data = data
        self._symbol = symbol
        self.proxy = proxy
        self._major = None  # lazy-load; populated by _scrape()

    @property
    def major(self) -> dict:
        """Return the scraped tables keyed by the heading that precedes them.

        The mapping is empty when the page could not be fetched or parsed,
        or when the request ended up somewhere other than a holdings page.
        """
        if self._major is None:
            self._scrape(self.proxy)
        return self._major

    def _scrape(self, proxy):
        """Fetch the holdings page and store the parsed tables in ``_major``.

        Scraping is best-effort: any failure leaves ``_major`` as an empty
        mapping rather than propagating, so callers always get a dict.
        """
        holdings_url = "{}/{}/holdings".format(self._SCRAPE_URL_, self._symbol)
        try:
            resp = self._data.cache_get(holdings_url, proxy)
            if "/holdings" not in resp.url:
                # The final URL is no longer a holdings page, which is
                # treated as "this symbol does not have holdings".
                self._major = {}
                return

            raw_tables = self._parse_tables(resp.text)
            self._major = {
                name: self._prettify(name, table)
                for name, table in raw_tables.items()
            }
        except Exception:
            # Deliberate catch-all: network and HTML-layout problems must not
            # break the caller; an empty result also stops repeated retries.
            self._major = {}

    @staticmethod
    def _parse_tables(html):
        """Map each ``<h3>`` heading to the first table that follows it."""
        # Local import keeps this block self-contained.
        from io import StringIO

        # Manually parse, because while pandas.read_html gets tables,
        # it doesn't get table names.
        soup = BeautifulSoup(html, 'html.parser')
        heading = None
        tables = {}
        for element in soup.find_all(['h3', 'table']):
            if element.name == 'h3':
                heading = element.get_text(strip=True)
            elif heading:
                try:
                    # Wrap in StringIO: passing literal HTML to read_html is
                    # deprecated in recent pandas releases.
                    parsed = pd.read_html(StringIO(str(element)))
                except ValueError:
                    # read_html raises ValueError when it finds no table.
                    continue
                if parsed:
                    tables[heading] = parsed[0]
        return tables

    @staticmethod
    def _prettify(name, table):
        """Index *table* by its first column and convert values to numbers.

        Single-value-column tables get that column renamed to *name*, have
        ``--`` placeholders blanked and ``%`` signs stripped, and are then
        converted to numeric where possible. Tables with any other shape are
        returned indexed but otherwise untouched, so one unexpected table
        cannot discard the rest.
        """
        table = table.set_index(table.columns[0])
        table.index.name = None
        if len(table.columns) != 1:
            return table
        table.columns = [name]

        values = table[name]
        if pd.api.types.is_numeric_dtype(values):
            # Already parsed as numbers by read_html; nothing to clean.
            return table

        values = values.str.replace('--', '', regex=False)
        is_pct = values.str.contains('%', regex=False, na=False)
        is_blank = values.isna() | (values == '')
        if (is_pct | is_blank).all():
            values = values.str.replace('%', '', regex=False)
        try:
            values = pd.to_numeric(values)
        except (ValueError, TypeError):
            # Leave textual columns as cleaned strings.
            pass

        table[name] = values
        return table
4 changes: 4 additions & 0 deletions yfinance/ticker.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,7 @@ def earnings_forecasts(self) -> _pd.DataFrame:
@property
def history_metadata(self) -> dict:
    """Expose the result of ``get_history_metadata()`` as an attribute."""
    metadata = self.get_history_metadata()
    return metadata

@property
def major_holdings(self):
    """Expose the result of ``get_major_holdings()`` as an attribute."""
    holdings = self.get_major_holdings()
    return holdings
Loading