Commit

Cleanup & fix.
Suchismit4 committed Dec 30, 2024
1 parent c1b9a6c commit e3f7010
Showing 21 changed files with 27 additions and 74 deletions.
6 changes: 1 addition & 5 deletions example_compustata.py
@@ -40,11 +40,7 @@ def main():
     ])
 
     # Access the Compustat dataset
-    print("Converting to ds")
-    compustat_ds = dataset['/market/equities/wrds/compustat'].ds
-
-    # Print the dataset structure
-    print(compustat_ds)
+    print(dataset)
 
 if __name__ == "__main__":
     main()
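A minimal sketch of the path-keyed access pattern the example relies on, where a container maps '/market/...' style paths to nodes wrapping an xarray.Dataset. The DatasetTree/Node names below are illustrative assumptions, not the project's actual API:

from dataclasses import dataclass
from typing import Dict

import xarray as xr


@dataclass
class Node:
    """Wraps an xarray.Dataset behind a hierarchical path key."""
    ds: xr.Dataset


class DatasetTree:
    """Dict-like container keyed by '/provider/...' style paths."""

    def __init__(self, nodes: Dict[str, Node]) -> None:
        self._nodes = nodes

    def __getitem__(self, path: str) -> Node:
        return self._nodes[path]

    def __repr__(self) -> str:
        return f"DatasetTree({sorted(self._nodes)})"


tree = DatasetTree({"/market/equities/wrds/compustat": Node(xr.Dataset())})
print(tree["/market/equities/wrds/compustat"].ds)  # the underlying xarray.Dataset
print(tree)  # summary of every registered path, as the new print(dataset) does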
Binary file modified src/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/__pycache__/manager.cpython-312.pyc
Binary file modified src/data/__pycache__/provider.cpython-312.pyc
Binary file modified src/data/abstracts/__pycache__/base.cpython-312.pyc
25 changes: 16 additions & 9 deletions src/data/abstracts/base.py
@@ -37,7 +37,7 @@ def _apply_filters(self, df: pd.DataFrame, filters: Dict[str, Any]) -> pd.DataFr
         Returns:
             pd.DataFrame: The filtered DataFrame.
         """
-        print(df.columns)
+
         for column, condition in filters.items():
             if isinstance(condition, tuple) or isinstance(condition, list) \
                 and len(condition) == 2:
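One note on the retained condition above: Python's `and` binds tighter than `or`, so `isinstance(condition, tuple) or isinstance(condition, list) and len(condition) == 2` parses as `A or (B and C)` and accepts tuples of any length. Only part of this hunk is visible, so this is an observation about the excerpt, not the rest of the commit; a sketch of the check a 2-element range filter presumably wants:

def is_range_condition(condition) -> bool:
    """True only for 2-element tuples/lists such as ('>=', 1000) or (lo, hi)."""
    return isinstance(condition, (tuple, list)) and len(condition) == 2


print(is_range_condition((1, 2, 3)))   # False -- the original parse returns True here
print(is_range_condition([0.5, 2.0]))  # True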
@@ -123,12 +123,19 @@ def _metadata_matches(self, metadata_path: Path, request_params: Dict[str, Any])
                 cached_params = json.load(f)
             # Direct comparison
             # Convert any sequences to sorted lists before comparison
-            for params in (cached_params, request_params):
-                for key in params:
-                    if isinstance(params[key], (list, tuple)):
-                        params[key] = sorted(list(params[key]))
-
-            return cached_params == request_params
+            def normalize(params):
+                if isinstance(params, dict):
+                    return {k: normalize(v) for k, v in params.items()}
+                if isinstance(params, (list, tuple)):
+                    return sorted(normalize(x) for x in params)
+                return params
+
+            is_equal = normalize(cached_params) == normalize(request_params)
+
+            if not is_equal:
+                print(f"Parameters differ:\nCached: {cached_params}\nRequest: {request_params}")
+
+            return is_equal
         except Exception as e:
             print(f"Error reading metadata file {metadata_path}: {e}")
             return False
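The new normalize helper recurses into nested dicts and sorts any list or tuple, so sequence order and sequence type no longer cause spurious cache misses. A standalone check of that behavior, with the helper copied from the hunk above (one caveat: sorted() raises TypeError on mixed-type sequences, which the surrounding except clause would then report as a read error):

def normalize(params):
    if isinstance(params, dict):
        return {k: normalize(v) for k, v in params.items()}
    if isinstance(params, (list, tuple)):
        return sorted(normalize(x) for x in params)
    return params


cached = {"columns": ("ret", "prc"), "freq": "M"}
request = {"columns": ["prc", "ret"], "freq": "M"}

# Tuples and lists both normalize to sorted lists, so ordering and
# sequence type no longer defeat the cache comparison.
assert normalize(cached) == normalize(request)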
@@ -200,8 +207,8 @@ def save_to_cache(
         time_indexes = ds.coords['time'].attrs.pop('indexes', None)
 
         try:
-            # Add params to ds.attrs
-            ds.attrs.update(params)
+            # TODO: Add params to ds.attrs.
+            # ds.attrs.update(params)
 
             ds.to_netcdf(
                 path=netcdf_path,
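The TODO above comments out ds.attrs.update(params). A plausible reason, offered as an assumption: netCDF attributes only accept strings, numbers, and flat numeric arrays, so nested or None-valued request params fail to serialize. One sketch of an alternative that fits the JSON-based _metadata_matches above is a JSON sidecar next to the .nc file:

import json
from pathlib import Path

import xarray as xr


def save_with_sidecar(ds: xr.Dataset, netcdf_path: Path, params: dict) -> None:
    """Write the dataset, then persist request params as JSON next to it."""
    ds.to_netcdf(path=netcdf_path)
    sidecar = netcdf_path.with_suffix(".json")
    # default=str covers values (dates, Paths) that json can't encode natively.
    sidecar.write_text(json.dumps(params, indent=2, sort_keys=True, default=str))


save_with_sidecar(xr.Dataset(), Path("cache.nc"), {"filters": {"exchcd": [1, 2]}})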
Binary file modified src/data/core/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/core/__pycache__/computations.cpython-312.pyc
Binary file modified src/data/core/__pycache__/operations.cpython-312.pyc
Binary file modified src/data/core/__pycache__/struct.cpython-312.pyc
44 changes: 1 addition & 43 deletions src/data/core/struct.py
@@ -192,52 +192,10 @@ def from_table(
         index_names = ['year', 'month', 'day', asset_column]
         full_index = pd.MultiIndex.from_product(index_components, names=index_names)
 
-        dup_subset = ['year','month','day','identifier']
-        pre_dup_mask = data.duplicated(subset=dup_subset, keep=False)
-        pre_dup_data = data[pre_dup_mask].sort_values(dup_subset)
-
-        print("==== DUPLICATE ROWS BEFORE SETTING INDEX ====") #DEBUG
-        print(pre_dup_data)
-
-        counts = (
-            data
-            .reset_index(drop=True)  # Make sure we don't have a MultiIndex yet
-            .groupby(['year','month','day','identifier'])
-            .size()
-            .sort_values(ascending=False)
-        )
-
-        # Show only those with duplicates (DEBUG)
-        counts_dup = counts[counts > 1]
-        print("==== MULTIINDEX GROUPS THAT APPEAR MORE THAN ONCE ====")
-        print(counts_dup)
-
         # Set DataFrame index and reindex to include all possible combinations
         data.set_index(index_names, inplace=True)
-
-        # OVERLOOK (DEBUG)
-        print("The multi-index is not unique. Identifying duplicate index entries:")
-        # Find duplicated index entries
-        duplicated_indices = data.index[data.index.duplicated(keep=False)]
-        print(duplicated_indices.unique())
-        print(data[data.index.isin(duplicated_indices)])
-        duplicated_mask = data.index.duplicated(keep=False)
-        duplicated_data = data[duplicated_mask]
-        print("==== DUPLICATE ROWS ====")
-        print(duplicated_data)
-        i = 0
-        for idx, group in duplicated_data.groupby(level=[0, 1, 2, 3]):
-            print("MultiIndex:", idx)
-            print(group)
-            print("-----")
-            i += 1
-            if (i > 3):
-                break
-
-        # quit(0)
-        # OVERLOOK STOP
-
+
         # Perform the re-indexing
         data = data.reindex(full_index)
 
         # Create the time coordinate array
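With the debug scaffolding removed, the core of from_table reads: build the full (year, month, day, identifier) product index, set it on the table, and reindex to densify. A self-contained sketch of that step (column values invented for illustration); note that reindex raises "cannot reindex on an axis with duplicate labels" when the MultiIndex is not unique, which is exactly what the deleted prints were hunting:

import pandas as pd

data = pd.DataFrame({
    "year": [2024, 2024],
    "month": [1, 1],
    "day": [2, 3],
    "identifier": ["AAPL", "MSFT"],
    "ret": [0.01, -0.02],
})

index_names = ["year", "month", "day", "identifier"]
full_index = pd.MultiIndex.from_product(
    [[2024], [1], [2, 3], ["AAPL", "MSFT"]], names=index_names
)

data = data.set_index(index_names)
# Densify: every (year, month, day, identifier) combination gets a row;
# combinations absent from the source table come back as NaN.
dense = data.reindex(full_index)
print(dense)  # 4 rows: 2 observed, 2 filled with NaN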
Binary file modified src/data/loaders/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/open_bb/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/open_bb/__pycache__/generic.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/__init__.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/compustat.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/crsp.cpython-312.pyc
Binary file modified src/data/loaders/wrds/__pycache__/generic.cpython-312.pyc
25 changes: 9 additions & 16 deletions src/data/loaders/wrds/compustat.py
@@ -51,23 +51,16 @@ def _preprocess_df(self, df: pd.DataFrame, **config) -> pd.DataFrame:
             identifier_col='gvkey',
             **config
         )
-
-        # Print duplicate rows for verification of removal
-        duplicate_rows = df[df.duplicated(subset='identifier', keep=False)]
-        print("Duplicate Rows:")
-        print(duplicate_rows)
-
-
+        # Compustat has duplicate entries for some time frames.
+        # We keep only the last one and roll dates forward to the period end.
         # Ensure 'date' is a datetime object for proper comparison
         df['date'] = pd.to_datetime(df['date'])
 
-        # Sort by 'date' and drop duplicates while keeping the first occurrence
+        # Sort by 'date' and drop duplicates while keeping the last occurrence
         df = df.sort_values('date', ascending=False).drop_duplicates(subset='identifier', keep='last')
 
+        # Add 'lag' parameter for lookahead bias calibration
+        df['lag'] = 0
 
-        # Assert whether all identifiers are unique and print df
-        print(f"df['identifier'].is_unique(): {df['identifier'].is_unique}")
-        print(df)
-
-
         # Set the date to the last day of the year
         df['date'] = df['date'].apply(lambda x: x.replace(month=12, day=31))
 
         return df
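One caveat worth noting: with ascending=False, keep='last' retains the row that sorts last, i.e. the earliest date per identifier. If the comment's intent ("we keep only the last one") means the most recent record, an ascending sort with keep='last' would do that instead. A small check:

import pandas as pd

df = pd.DataFrame({
    "identifier": ["001690", "001690"],
    "date": pd.to_datetime(["2023-06-30", "2023-12-31"]),
})

# As committed: descending sort + keep='last' retains the EARLIEST date.
as_committed = df.sort_values("date", ascending=False).drop_duplicates(
    subset="identifier", keep="last"
)
print(as_committed["date"].iloc[0])  # 2023-06-30

# If the most recent record per identifier is intended, one option:
most_recent = df.sort_values("date").drop_duplicates(
    subset="identifier", keep="last"
)
print(most_recent["date"].iloc[0])  # 2023-12-31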
1 change: 0 additions & 1 deletion src/data/loaders/wrds/generic.py
@@ -66,7 +66,6 @@ def load_data(self, **config) -> xr.Dataset:
         # Attempt cache load
         cached_ds = self.load_from_cache(cache_path, request_params=params)
         if cached_ds is not None:
-            print("Loaded from NetCDF cache")
             return cached_ds
 
         # Load from local source
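The cache-hit print is gone; a common substitute is module-level logging, so the message survives at DEBUG level without polluting stdout. A sketch under that assumption (a standalone function, not the project's loader class):

import logging
from pathlib import Path

import xarray as xr

logger = logging.getLogger(__name__)


def load_from_source() -> xr.Dataset:
    """Stand-in for the real loader; returns a tiny dataset for illustration."""
    return xr.Dataset({"x": ("i", [1, 2, 3])})


def load_data(cache_path: Path) -> xr.Dataset:
    # Attempt cache load first, falling back to the source on a miss.
    if cache_path.exists():
        ds = xr.open_dataset(cache_path)
        logger.debug("Loaded %s from NetCDF cache", cache_path)
        return ds
    ds = load_from_source()
    ds.to_netcdf(cache_path)
    return ds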
