Commit

Cleanup & fix.
Suchismit4 committed Dec 30, 2024
1 parent c1b9a6c commit e3f7010
Showing 21 changed files with 27 additions and 74 deletions.
6 changes: 1 addition & 5 deletions example_compustata.py
@@ -40,11 +40,7 @@ def main():
     ])
 
     # Access the Compustat dataset
-    print("Converting to ds")
-    compustat_ds = dataset['/market/equities/wrds/compustat'].ds
-
-    # Print the dataset structure
-    print(compustat_ds)
+    print(dataset)
 
 if __name__ == "__main__":
     main()
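A minimal sketch of the path-keyed access pattern the example relies on, where a container maps '/market/...' style paths to nodes wrapping an xarray.Dataset. The DatasetTree/Node names below are illustrative assumptions, not the project's actual API:

from dataclasses import dataclass
from typing import Dict

import xarray as xr


@dataclass
class Node:
    """Wraps an xarray.Dataset behind a hierarchical path key."""
    ds: xr.Dataset


class DatasetTree:
    """Dict-like container keyed by '/provider/...' style paths."""

    def __init__(self, nodes: Dict[str, Node]) -> None:
        self._nodes = nodes

    def __getitem__(self, path: str) -> Node:
        return self._nodes[path]

    def __repr__(self) -> str:
        return f"DatasetTree({sorted(self._nodes)})"


tree = DatasetTree({"/market/equities/wrds/compustat": Node(xr.Dataset())})
print(tree["/market/equities/wrds/compustat"].ds)  # the underlying xarray.Dataset
print(tree)  # summary of every registered path, as the new print(dataset) does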
Binary file modified src/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/__pycache__/manager.cpython-312.pyc
Binary file modified src/data/__pycache__/provider.cpython-312.pyc
Binary file modified src/data/abstracts/__pycache__/base.cpython-312.pyc
25 changes: 16 additions & 9 deletions src/data/abstracts/base.py
@@ -37,7 +37,7 @@ def _apply_filters(self, df: pd.DataFrame, filters: Dict[str, Any]) -> pd.DataFr
         Returns:
             pd.DataFrame: The filtered DataFrame.
         """
-        print(df.columns)
+
         for column, condition in filters.items():
             if isinstance(condition, tuple) or isinstance(condition, list) \
                 and len(condition) == 2:
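One note on the retained condition above: Python's `and` binds tighter than `or`, so `isinstance(condition, tuple) or isinstance(condition, list) and len(condition) == 2` parses as `A or (B and C)` and accepts tuples of any length. Only part of this hunk is visible, so this is an observation about the excerpt, not the rest of the commit; a sketch of the check a 2-element range filter presumably wants:

def is_range_condition(condition) -> bool:
    """True only for 2-element tuples/lists such as ('>=', 1000) or (lo, hi)."""
    return isinstance(condition, (tuple, list)) and len(condition) == 2


print(is_range_condition((1, 2, 3)))   # False -- the original parse returns True here
print(is_range_condition([0.5, 2.0]))  # True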
@@ -123,12 +123,19 @@ def _metadata_matches(self, metadata_path: Path, request_params: Dict[str, Any])
                 cached_params = json.load(f)
             # Direct comparison
             # Convert any sequences to sorted lists before comparison
-            for params in (cached_params, request_params):
-                for key in params:
-                    if isinstance(params[key], (list, tuple)):
-                        params[key] = sorted(list(params[key]))
-
-            return cached_params == request_params
+            def normalize(params):
+                if isinstance(params, dict):
+                    return {k: normalize(v) for k, v in params.items()}
+                if isinstance(params, (list, tuple)):
+                    return sorted(normalize(x) for x in params)
+                return params
+
+            is_equal = normalize(cached_params) == normalize(request_params)
+
+            if not is_equal:
+                print(f"Parameters differ:\nCached: {cached_params}\nRequest: {request_params}")
+
+            return is_equal
         except Exception as e:
             print(f"Error reading metadata file {metadata_path}: {e}")
             return False
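The new normalize helper recurses into nested dicts and sorts any list or tuple, so sequence order and sequence type no longer cause spurious cache misses. A standalone check of that behavior, with the helper copied from the hunk above (one caveat: sorted() raises TypeError on mixed-type sequences, which the surrounding except clause would then report as a read error):

def normalize(params):
    if isinstance(params, dict):
        return {k: normalize(v) for k, v in params.items()}
    if isinstance(params, (list, tuple)):
        return sorted(normalize(x) for x in params)
    return params


cached = {"columns": ("ret", "prc"), "freq": "M"}
request = {"columns": ["prc", "ret"], "freq": "M"}

# Tuples and lists both normalize to sorted lists, so ordering and
# sequence type no longer defeat the cache comparison.
assert normalize(cached) == normalize(request)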
@@ -200,8 +207,8 @@ def save_to_cache(
         time_indexes = ds.coords['time'].attrs.pop('indexes', None)
 
         try:
-            # Add params to ds.attrs
-            ds.attrs.update(params)
+            # TODO: Add params to ds.attrs.
+            # ds.attrs.update(params)
 
             ds.to_netcdf(
                 path=netcdf_path,
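The TODO above comments out ds.attrs.update(params). A plausible reason, offered as an assumption: netCDF attributes only accept strings, numbers, and flat numeric arrays, so nested or None-valued request params fail to serialize. One sketch of an alternative that fits the JSON-based _metadata_matches above is a JSON sidecar next to the .nc file:

import json
from pathlib import Path

import xarray as xr


def save_with_sidecar(ds: xr.Dataset, netcdf_path: Path, params: dict) -> None:
    """Write the dataset, then persist request params as JSON next to it."""
    ds.to_netcdf(path=netcdf_path)
    sidecar = netcdf_path.with_suffix(".json")
    # default=str covers values (dates, Paths) that json can't encode natively.
    sidecar.write_text(json.dumps(params, indent=2, sort_keys=True, default=str))


save_with_sidecar(xr.Dataset(), Path("cache.nc"), {"filters": {"exchcd": [1, 2]}})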
Binary file modified src/data/core/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/core/__pycache__/computations.cpython-312.pyc
Binary file modified src/data/core/__pycache__/operations.cpython-312.pyc
Binary file modified src/data/core/__pycache__/struct.cpython-312.pyc
44 changes: 1 addition & 43 deletions src/data/core/struct.py
@@ -192,52 +192,10 @@ def from_table(
         index_names = ['year', 'month', 'day', asset_column]
         full_index = pd.MultiIndex.from_product(index_components, names=index_names)
 
-        dup_subset = ['year','month','day','identifier']
-        pre_dup_mask = data.duplicated(subset=dup_subset, keep=False)
-        pre_dup_data = data[pre_dup_mask].sort_values(dup_subset)
-
-        print("==== DUPLICATE ROWS BEFORE SETTING INDEX ====") #DEBUG
-        print(pre_dup_data)
-
-        counts = (
-            data
-            .reset_index(drop=True)  # Make sure we don't have a MultiIndex yet
-            .groupby(['year','month','day','identifier'])
-            .size()
-            .sort_values(ascending=False)
-        )
-
-        # Show only those with duplicates (DEBUG)
-        counts_dup = counts[counts > 1]
-        print("==== MULTIINDEX GROUPS THAT APPEAR MORE THAN ONCE ====")
-        print(counts_dup)
-
         # Set DataFrame index and reindex to include all possible combinations
         data.set_index(index_names, inplace=True)
-
-        # OVERLOOK (DEBUG)
-        print("The multi-index is not unique. Identifying duplicate index entries:")
-        # Find duplicated index entries
-        duplicated_indices = data.index[data.index.duplicated(keep=False)]
-        print(duplicated_indices.unique())
-        print(data[data.index.isin(duplicated_indices)])
-        duplicated_mask = data.index.duplicated(keep=False)
-        duplicated_data = data[duplicated_mask]
-        print("==== DUPLICATE ROWS ====")
-        print(duplicated_data)
-        i = 0
-        for idx, group in duplicated_data.groupby(level=[0, 1, 2, 3]):
-            print("MultiIndex:", idx)
-            print(group)
-            print("-----")
-            i += 1
-            if (i > 3):
-                break
-
-        # quit(0)
-        # OVERLOOK STOP
-
+
         # Perform the re-indexing
         data = data.reindex(full_index)
 
         # Create the time coordinate array
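With the debug scaffolding removed, the core of from_table reads: build the full (year, month, day, identifier) product index, set it on the table, and reindex to densify. A self-contained sketch of that step (column values invented for illustration); note that reindex raises "cannot reindex on an axis with duplicate labels" when the MultiIndex is not unique, which is exactly what the deleted prints were hunting:

import pandas as pd

data = pd.DataFrame({
    "year": [2024, 2024],
    "month": [1, 1],
    "day": [2, 3],
    "identifier": ["AAPL", "MSFT"],
    "ret": [0.01, -0.02],
})

index_names = ["year", "month", "day", "identifier"]
full_index = pd.MultiIndex.from_product(
    [[2024], [1], [2, 3], ["AAPL", "MSFT"]], names=index_names
)

data = data.set_index(index_names)
# Densify: every (year, month, day, identifier) combination gets a row;
# combinations absent from the source table come back as NaN.
dense = data.reindex(full_index)
print(dense)  # 4 rows: 2 observed, 2 filled with NaN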
Binary file modified src/data/loaders/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/open_bb/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/open_bb/__pycache__/generic.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/compustat.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/crsp.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/generic.cpython-312.pyc
25 changes: 9 additions & 16 deletions src/data/loaders/wrds/compustat.py
@@ -51,23 +51,16 @@ def _preprocess_df(self, df: pd.DataFrame, **config) -> pd.DataFrame:
             identifier_col='gvkey',
             **config
         )
-
-        # Print duplicate rows for verification of removal
-        duplicate_rows = df[df.duplicated(subset='identifier', keep=False)]
-        print("Duplicate Rows:")
-        print(duplicate_rows)
-
-
+        # Compustat has duplicate entries for some time frames.
+        # We keep only the last one and roll dates forward to the period end.
         # Ensure 'date' is a datetime object for proper comparison
         df['date'] = pd.to_datetime(df['date'])
 
-        # Sort by 'date' and drop duplicates while keeping the first occurrence
+        # Sort by 'date' and drop duplicates while keeping the last occurrence
         df = df.sort_values('date', ascending=False).drop_duplicates(subset='identifier', keep='last')
 
+        # Add 'lag' parameter for lookahead bias calibration
+        df['lag'] = 0
 
-        # Assert whether all identifiers are unique and print df
-        print(f"df['identifier'].is_unique(): {df['identifier'].is_unique}")
-        print(df)
-
-
         # Set the date to the last day of the year
         df['date'] = df['date'].apply(lambda x: x.replace(month=12, day=31))
 
         return df
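One caveat worth noting: with ascending=False, keep='last' retains the row that sorts last, i.e. the earliest date per identifier. If the comment's intent ("we keep only the last one") means the most recent record, an ascending sort with keep='last' would do that instead. A small check:

import pandas as pd

df = pd.DataFrame({
    "identifier": ["001690", "001690"],
    "date": pd.to_datetime(["2023-06-30", "2023-12-31"]),
})

# As committed: descending sort + keep='last' retains the EARLIEST date.
as_committed = df.sort_values("date", ascending=False).drop_duplicates(
    subset="identifier", keep="last"
)
print(as_committed["date"].iloc[0])  # 2023-06-30

# If the most recent record per identifier is intended, one option:
most_recent = df.sort_values("date").drop_duplicates(
    subset="identifier", keep="last"
)
print(most_recent["date"].iloc[0])  # 2023-12-31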
1 change: 0 additions & 1 deletion src/data/loaders/wrds/generic.py
@@ -66,7 +66,6 @@ def load_data(self, **config) -> xr.Dataset:
         # Attempt cache load
         cached_ds = self.load_from_cache(cache_path, request_params=params)
         if cached_ds is not None:
-            print("Loaded from NetCDF cache")
             return cached_ds
 
         # Load from local source
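The cache-hit print is gone; a common substitute is module-level logging, so the message survives at DEBUG level without polluting stdout. A sketch under that assumption (a standalone function, not the project's loader class):

import logging
from pathlib import Path

import xarray as xr

logger = logging.getLogger(__name__)


def load_from_source() -> xr.Dataset:
    """Stand-in for the real loader; returns a tiny dataset for illustration."""
    return xr.Dataset({"x": ("i", [1, 2, 3])})


def load_data(cache_path: Path) -> xr.Dataset:
    # Attempt cache load first, falling back to the source on a miss.
    if cache_path.exists():
        ds = xr.open_dataset(cache_path)
        logger.debug("Loaded %s from NetCDF cache", cache_path)
        return ds
    ds = load_from_source()
    ds.to_netcdf(cache_path)
    return ds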
