Skip to content

Commit

Permalink
Added additional tables
Browse files Browse the repository at this point in the history
  • Loading branch information
Suchismit4 committed Jan 14, 2025
1 parent d1bbb41 commit 257714c
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 1 deletion.
Binary file modified crsp_vs_yfinance_subplots.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified src/data/core/__pycache__/util.cpython-312.pyc
Binary file not shown.
Binary file modified src/data/core/operations/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
1 change: 0 additions & 1 deletion src/data/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,6 @@ def reduce(
# Extract valid data based on mask_indices
valid_data = data[self.indices, ...] # Shape: (T, assets, 1)


# Perform rolling
rolled_result = TimeSeriesOps.u_roll(
data=valid_data,
Expand Down
Binary file modified src/data/loaders/wrds/__pycache__/compustat.cpython-312.pyc
Binary file not shown.
Binary file modified src/data/loaders/wrds/__pycache__/crsp.cpython-312.pyc
Binary file not shown.
18 changes: 18 additions & 0 deletions src/data/loaders/wrds/compustat.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from src.data.core.util import FrequencyType
from typing import Dict, Any, List
from .generic import GenericWRDSDataLoader
import pyreadstat

class CompustatDataFetcher(GenericWRDSDataLoader):
"""
Expand Down Expand Up @@ -52,6 +53,21 @@ def _preprocess_df(self, df: pd.DataFrame, **config) -> pd.DataFrame:
**config
)

# Load additional Compustat table (e.g., filenamesq for company static info)
filenamesq_path = "/wrds/comp/sasdata/d_na/filenamesq.sas7bdat"
extra_df, _ = pyreadstat.read_file_multiprocessing(
pyreadstat.read_sas7bdat,
filenamesq_path,
num_processes=config.get('num_processes', 16)
)
extra_df.columns = extra_df.columns.str.lower()
print(extra_df.columns)
quit(1)
extra_df.rename(columns={'gvkey': 'identifier', 'conm': 'company_name'}, inplace=True)

# Merge company names onto main dataframe
df = df.merge(extra_df[['identifier', 'company_name']], on='identifier', how='left')

# Compustat contains multiple (duplicate) entries for some time periods.
# We keep only the last one and move its date forward to the end of the period.
# Ensure 'date' is a datetime object for proper comparison
Expand All @@ -62,5 +78,7 @@ def _preprocess_df(self, df: pd.DataFrame, **config) -> pd.DataFrame:

# Set the date to the last day of the year
df['date'] = df['date'].apply(lambda x: x.replace(month=12, day=31))



return df
79 changes: 79 additions & 0 deletions src/data/loaders/wrds/crsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import xarray as xr
from src.data.core.util import FrequencyType
from .generic import GenericWRDSDataLoader
import pyreadstat


class CRSPDataFetcher(GenericWRDSDataLoader):
"""
Expand Down Expand Up @@ -57,5 +59,82 @@ def _preprocess_df(self, df: pd.DataFrame, **config) -> pd.DataFrame:
# convert 'permco' to int if present
if 'permco' in df.columns:
df['permco'] = df['permco'].astype(int)

# 1. Merge msenames: company names
try:
names_path = "/wrds/crsp/sasdata/a_stock/msenames.sas7bdat"
names_df, _ = pyreadstat.read_file_multiprocessing(
pyreadstat.read_sas7bdat,
names_path,
num_processes=config.get('num_processes', 16)
)
names_df.columns = names_df.columns.str.lower()
# Rename key columns as needed
if 'permno' in names_df.columns and 'comnam' in names_df.columns:
names_df.rename(columns={'permno': 'identifier', 'comnam': 'company_name'}, inplace=True)
else:
raise KeyError("Expected columns 'permno' and 'comnam' not found in msenames.")
# Merge company names
df = pd.merge(df, names_df[['identifier', 'company_name']], on='identifier', how='left')
except Exception as e:
print(f"Warning: Could not merge msenames data due to error: {e}")

# 2. Merge msedist: distributions (e.g., dividends, repurchases)
try:
dist_path = "/wrds/crsp/sasdata/a_stock/msedist.sas7bdat"
dist_df, _ = pyreadstat.read_file_multiprocessing(
pyreadstat.read_sas7bdat,
dist_path,
num_processes=config.get('num_processes', 16)
)
dist_df.columns = dist_df.columns.str.lower()
# Rename and select relevant columns; adjust as necessary.
# Assuming msedist contains 'permno', 'date', and distribution details.
if 'permno' in dist_df.columns and 'date' in dist_df.columns:
dist_df.rename(columns={'permno': 'identifier'}, inplace=True)
# Convert SAS date to datetime for merging, if needed.
dist_df['date'] = self.convert_sas_date(dist_df['date'])
else:
raise KeyError("Expected columns 'permno' and 'date' not found in msedist.")
# Merge distribution data on 'identifier' and 'date'
# Using suffixes to avoid collisions if same column names exist.
df = pd.merge(
df,
dist_df,
on=['identifier', 'date'],
how='left',
suffixes=('', '_dist')
)
except Exception as e:
print(f"Warning: Could not merge msedist data due to error: {e}")

# 3. Merge msedelist: delisting information
try:
delist_path = "/wrds/crsp/sasdata/a_stock/msedelist.sas7bdat"
delist_df, _ = pyreadstat.read_file_multiprocessing(
pyreadstat.read_sas7bdat,
delist_path,
num_processes=config.get('num_processes', 16)
)
delist_df.columns = delist_df.columns.str.lower()
# Rename and preprocess columns as necessary.
if 'permno' in delist_df.columns and 'dlstdt' in delist_df.columns:
delist_df.rename(columns={'permno': 'identifier', 'dlstdt': 'delist_date'}, inplace=True)
# Convert SAS date to datetime for merging, if needed.
delist_df['delist_date'] = self.convert_sas_date(delist_df['delist_date'])
else:
raise KeyError("Expected columns 'permno' and 'dlstdt' not found in msedelist.")
# Merge delisting data.
# This merge strategy depends on how you want to incorporate delisting info.
# Here, we add delisting date information to each record if available.
df = pd.merge(
df,
delist_df[['identifier', 'delist_date']],
on='identifier',
how='left',
suffixes=('', '_delist')
)
except Exception as e:
print(f"Warning: Could not merge msedelist data due to error: {e}")

return df

0 comments on commit 257714c

Please sign in to comment.