changed preprocessing and made eval and model updates
Michael Fuest committed Sep 20, 2024
1 parent e5b73c8 commit f391291
Showing 8 changed files with 116 additions and 134 deletions.
22 changes: 11 additions & 11 deletions config/model_config.yaml
@@ -1,21 +1,21 @@
seq_len: 96 # should not be changed for the current datasets
input_dim: 2 # or 1 depending on user, but is dynamically set
noise_dim: 256
cond_emb_dim: 256
cond_emb_dim: 64
shuffle: True

conditioning_vars: # for each desired conditioning variable, add the name and number of categories
month: 12
weekday: 7
building_type: 3
solar: 2
car1: 2
city: 7
state: 3
# building_type: 3
# solar: 2
# car1: 2
# city: 7
# state: 3

diffcharge:
batch_size: 64
n_epochs: 1
n_epochs: 1000
init_lr: 3e-5
network: cnn # attention
guidance_scale: 1.2
@@ -29,13 +29,13 @@ diffcharge:

diffusion_ts:
batch_size: 64
n_epochs: 1
n_epochs: 1000
n_steps: 1000
base_lr: 1e-4
n_layer_enc: 4
n_layer_dec: 5
d_model: 128
cond_emb_dim: 128
d_model: 256
cond_emb_dim: 256
sampling_timesteps: null
loss_type: l1 #l2
beta_schedule: cosine #linear
@@ -63,7 +63,7 @@ diffusion_ts:

acgan:
batch_size: 16
n_epochs: 1
n_epochs: 200
validate: false # add validation during training
lr_gen: 2e-4
lr_discr: 1e-4
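
For orientation, a minimal standalone sketch (assuming the config is read with PyYAML; not part of the repository) of how these settings look once loaded after this commit: cond_emb_dim drops to 64 and only month and weekday remain as active conditioning variables.

import yaml

# Load the model configuration; key names mirror config/model_config.yaml above.
with open("config/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["cond_emb_dim"])              # 64
print(cfg["conditioning_vars"])         # {'month': 12, 'weekday': 7}
print(cfg["diffusion_ts"]["n_epochs"])  # 1000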
135 changes: 45 additions & 90 deletions datasets/pecanstreet.py
@@ -150,16 +150,10 @@ def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
)

if self.normalize:
if self.normalization_method == "group":
self.stats["grid"] = self._calculate_and_store_statistics_group(
filtered_data, "grid"
)
filtered_data = self._apply_normalization_group(filtered_data, "grid")
elif self.normalization_method == "global":
self.stats["grid"] = self._calculate_and_store_statistics_global(
filtered_data, "grid"
)
filtered_data = self._apply_normalization_global(filtered_data, "grid")
self.stats["grid"] = self._calculate_and_store_statistics(
filtered_data, "grid"
)
filtered_data = self._normalize_and_scale(filtered_data, "grid")

if self.include_generation:
solar_data = self._preprocess_solar(data)
@@ -174,11 +168,9 @@ def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
by=["dataid", "month", "weekday"]
)

def _calculate_and_store_statistics_group(
self, data: pd.DataFrame, column: str
) -> Dict:
def _calculate_and_store_statistics(self, data: pd.DataFrame, column: str) -> Dict:
"""
Calculates and stores statistical data for group normalization, such as mean and standard deviation.
Calculates and stores statistical data for group normalization, such as min, max, mean and standard deviation.
Args:
data (pd.DataFrame): The data on which to calculate statistics.
@@ -192,16 +184,19 @@ def calculate_stats(group):
all_values = np.concatenate(group[column].values)
mean = np.mean(all_values)
std = np.std(all_values)
return pd.Series({"mean": mean, "std": std})
min = np.min(all_values)
max = np.max(all_values)
return pd.Series({"mean": mean, "std": std, "min": min, "max": max})

grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
calculate_stats
)
return grouped_stats.to_dict(orient="index")
if self.normalization_method == "group":
grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
calculate_stats
)
return grouped_stats.to_dict(orient="index")
else:
return calculate_stats(data).to_dict()

def _apply_normalization_group(
self, data: pd.DataFrame, column: str
) -> pd.DataFrame:
def _normalize_and_scale(self, data: pd.DataFrame, column: str) -> pd.DataFrame:
"""
Applies group normalization to the data.
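
For reference, a standalone toy sketch (not the repository's API; names mirror the code above) of the structure _calculate_and_store_statistics produces in the "group" case: a dict keyed by (dataid, month, weekday) tuples, each holding the mean, std, min and max of the concatenated time series values.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "dataid": [1, 1, 2],
    "month": [1, 1, 6],
    "weekday": [0, 0, 3],
    "grid": [np.array([0.1, 0.5]), np.array([0.3, 0.7]), np.array([1.0, 2.0])],
})

def calculate_stats(group):
    # Concatenate all time series values in the group before computing statistics.
    all_values = np.concatenate(group["grid"].values)
    return pd.Series({"mean": all_values.mean(), "std": all_values.std(),
                      "min": all_values.min(), "max": all_values.max()})

stats = df.groupby(["dataid", "month", "weekday"]).apply(calculate_stats).to_dict(orient="index")
# e.g. stats[(1, 1, 0)] -> {'mean': 0.4, 'std': ..., 'min': 0.1, 'max': 0.7}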
@@ -213,60 +208,28 @@ def _apply_normalization_group(
pd.DataFrame: The normalized data.
"""

def normalize(row):
stats = self.stats[column][(row["dataid"], row["month"], row["weekday"])]
mean, std = stats["mean"], stats["std"]
def normalize_and_scale_row(row):
if self.normalization_method == "group":
stats = self.stats[column][
(row["dataid"], row["month"], row["weekday"])
]
else:
stats = self.stats[column]

mean, std, min, max = (
stats["mean"],
stats["std"],
stats["min"],
stats["max"],
)
values = np.array(row[column])
normalized = (values - mean) / (std + 1e-8)
if self.threshold:
normalized = np.clip(normalized, *self.threshold)
return normalized

data[column] = data.apply(normalize, axis=1)
return data

def _calculate_and_store_statistics_global(
self, data: pd.DataFrame, column: str
) -> Dict:
"""
Calculates and stores statistical data for global normalization.
Args:
data (pd.DataFrame): The data on which to calculate statistics.
column (str): The column for which to calculate statistics.
Returns:
Dict: A dictionary containing the global mean and standard deviation.
"""
all_values = np.concatenate(data[column].values)
mean = np.mean(all_values)
std = np.std(all_values)
return {"mean": mean, "std": std}

def _apply_normalization_global(
self, data: pd.DataFrame, column: str
) -> pd.DataFrame:
"""
Applies global normalization to the data.
Args:
data (pd.DataFrame): The data to normalize.
column (str): The column on which normalization is applied.
Returns:
pd.DataFrame: The normalized data.
"""

mean = self.stats[column]["mean"]
std = self.stats[column]["std"]

def normalize(values):
normalized = (values - mean) / (std + 1e-8)
if self.threshold:
normalized = np.clip(normalized, *self.threshold)
return normalized
scaled = (normalized - min) / (max - min)
return scaled

data[column] = data[column].apply(normalize)
data[column] = data.apply(normalize_and_scale_row, axis=1)
return data

def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -288,18 +251,10 @@ def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:
solar_data = solar_data[solar_data["solar"].apply(len) == 96]

if self.normalize:
if self.normalization_method == "group":
valid_stats = self._calculate_and_store_statistics_group(
solar_data, "solar"
)
self.stats["solar"] = valid_stats
solar_data = self._apply_normalization_group(solar_data, "solar")
elif self.normalization_method == "global":
valid_stats = self._calculate_and_store_statistics_global(
solar_data, "solar"
)
self.stats["solar"] = valid_stats
solar_data = self._apply_normalization_global(solar_data, "solar")
self.stats["solar"] = self._calculate_and_store_statistics(
solar_data, "solar"
)
solar_data = self._normalize_and_scale(solar_data, "solar")

return solar_data

@@ -520,18 +475,18 @@ def inverse_transform_column(self, row: pd.Series, colname: str) -> np.ndarray:
raise ValueError(
f"No stats found for {colname} with dataid={row['dataid']}, month={row['month']}, weekday={row['weekday']}"
)
mean = stats["mean"]
std = stats["std"]
elif self.normalization_method == "global":
stats = self.stats[colname]
mean = stats["mean"]
std = stats["std"]
else:
raise ValueError("Invalid normalization_method")

scaled = row[colname]
unscaled = scaled * (std + 1e-8) + mean
return unscaled
mean = stats["mean"]
std = stats["std"]
min = stats["min"]
max = stats["max"]
unscaled = row[colname] * (max - min) + min
unnormalized = unscaled * (std + 1e-8) + mean
return unnormalized

def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
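Taken together, the forward and inverse paths above stay consistent: values are z-scored with the group (or global) mean and std, then shifted and rescaled by the raw-value min and max, and inverse_transform_column undoes the two steps in reverse order. A standalone toy round-trip (values, mean, std, vmin, vmax are stand-ins, not the repository's API):

import numpy as np

values = np.array([0.2, 1.5, 3.1, 0.9])
mean, std = values.mean(), values.std()
vmin, vmax = values.min(), values.max()

# forward, mirroring _normalize_and_scale
normalized = (values - mean) / (std + 1e-8)
scaled = (normalized - vmin) / (vmax - vmin)

# inverse, mirroring inverse_transform_column
unscaled = scaled * (vmax - vmin) + vmin
recovered = unscaled * (std + 1e-8) + mean

assert np.allclose(recovered, values)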
35 changes: 20 additions & 15 deletions eval/evaluator.py
@@ -104,7 +104,7 @@ def run_eval(
syn_user_data, real_user_data = self.generate_dataset_for_eval(
model, dataset.data
)
syn_user_data["dataid"] = real_user_data.reset_index()["dataid"]
syn_user_data.reset_index()["dataid"] = real_user_data.reset_index()["dataid"]

real_user_data_inv = dataset.inverse_transform(real_user_data)
syn_user_data_inv = dataset.inverse_transform(syn_user_data)
Expand Down Expand Up @@ -149,7 +149,7 @@ def evaluate_all_user_models(self):
real_data = []

for user_id in user_ids:
if user_id == 3687:
if user_id == 2318:
syn_user_data, real_user_data = self.evaluate_for_user(user_id)
syn_data.append(syn_user_data)
real_data.append(real_user_data)
@@ -163,20 +163,21 @@ def evaluate_all_user_models(self):
self._log_final_results()

def generate_samples_for_eval(
self, dataid: int, model: Any, num_samples: int
self, dataid: int, model: Any, dataset: Any, num_samples: int
) -> pd.DataFrame:
"""
Generate synthetic samples for evaluation.
Args:
dataid (int): The ID of the data point.
model: The trained model to generate samples.
dataset: The original dataset, used to ensure that only conditioning variable combinations present in the data are generated.
num_samples (int): The number of samples to generate.
Returns:
pd.DataFrame: A DataFrame containing the generated samples.
"""
random_conditioning_vars = model.sample_random_conditioning_vars(1)
random_conditioning_vars = model.sample_random_conditioning_vars(dataset, 1)

for keys, tensor in random_conditioning_vars.items():
random_conditioning_vars[keys] = tensor.repeat(
@@ -199,33 +200,36 @@ def generate_samples_for_eval(
return syn_ts_df

def generate_dataset_for_eval(
self, model: Any, real_user_df: pd.DataFrame, num_samples: int = 100
self,
model: Any,
real_user_df: pd.DataFrame,
) -> pd.DataFrame:
"""
Generate a synthetic dataset for evaluation, using a stratified subset.
Args:
model: The trained model to generate samples.
real_user_df (pd.DataFrame): The real user data.
num_samples (int): The number of samples to generate (default: 1000).
Returns:
pd.DataFrame: A DataFrame containing the generated dataset.
pd.DataFrame: Chosen subset of the real dataset
"""
y = real_user_df["dataid"]

strat_split = StratifiedShuffleSplit(
n_splits=1, test_size=num_samples, random_state=42
)
if y.nunique() > 1: # if there are multiple users, use a stratified subset
num_samples = real_user_df.shape[0] // 10
strat_split = StratifiedShuffleSplit(
n_splits=1, test_size=num_samples, random_state=42
)

for _, subset_index in strat_split.split(real_user_df, y):
subset_real_data = real_user_df.iloc[subset_index]
for _, subset_index in strat_split.split(real_user_df, y):
subset_real_data = real_user_df.iloc[subset_index]

subset_real_data = subset_real_data.reset_index(drop=True)
real_user_df = subset_real_data.reset_index(drop=True)

real_conditioning_vars = {
name: torch.tensor(subset_real_data[name].values, dtype=torch.long)
name: torch.tensor(real_user_df[name].values, dtype=torch.long)
for name in model.opt.categorical_dims.keys()
}

@@ -236,10 +240,10 @@ def generate_dataset_for_eval(
generated_ts.shape[0], -1, generated_ts.shape[1]
)

syn_ts = subset_real_data.copy()
syn_ts = real_user_df.copy()
syn_ts["timeseries"] = list(generated_ts)

return syn_ts, subset_real_data
return syn_ts, real_user_df
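
A standalone toy sketch (assumed data, not the evaluator itself) of the stratified subsetting used in generate_dataset_for_eval: when more than one dataid is present, roughly a tenth of the rows are drawn while preserving each user's share of the data.

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

real_user_df = pd.DataFrame({"dataid": [1] * 50 + [2] * 50, "month": list(range(1, 51)) * 2})
y = real_user_df["dataid"]

if y.nunique() > 1:  # only subsample when several users are mixed together
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=real_user_df.shape[0] // 10, random_state=42)
    for _, subset_index in splitter.split(real_user_df, y):
        real_user_df = real_user_df.iloc[subset_index].reset_index(drop=True)

print(real_user_df["dataid"].value_counts())  # 5 rows for each dataid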

def evaluate_all_pv_users(self):
"""
@@ -450,6 +454,7 @@ def _create_visualizations(
samples = self.generate_samples_for_eval(
real_user_data["dataid"].iloc[0],
model,
dataset,
num_samples=100,
)
samples = dataset.inverse_transform(samples)
9 changes: 5 additions & 4 deletions generator/conditioning.py
@@ -16,14 +16,15 @@ def __init__(self, categorical_dims, embedding_dim, device):
)
total_dim = len(categorical_dims) * embedding_dim
self.mlp = nn.Sequential(
nn.Linear(total_dim, 128), nn.ReLU(), nn.Linear(128, embedding_dim)
# nn.Linear(total_dim, 128), nn.ReLU(), nn.Linear(128, embedding_dim)
nn.Linear(total_dim, embedding_dim)
).to(device)

def forward(self, categorical_vars):
embeddings = []
for name, embedding in self.category_embeddings.items():
var = categorical_vars[name].to(self.device)
embeddings.append(embedding(var))
conditioning_vector = torch.cat(embeddings, dim=1)
conditioning_vector = self.mlp(conditioning_vector)
return conditioning_vector
conditioning_matrix = torch.cat(embeddings, dim=1)
conditioning_matrix = self.mlp(conditioning_matrix)
return conditioning_matrix
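
A standalone sketch (toy dimensions; mirrors the layer shapes above but is not the repository's module) of the conditioning path after this change: one embedding per categorical variable, concatenation, then a single linear projection back to embedding_dim instead of the former two-layer MLP.

import torch
import torch.nn as nn

categorical_dims = {"month": 12, "weekday": 7}  # matches the remaining config entries
embedding_dim = 64

# One embedding table per categorical variable, then a single linear projection.
embeddings = nn.ModuleDict({name: nn.Embedding(n, embedding_dim) for name, n in categorical_dims.items()})
projection = nn.Linear(len(categorical_dims) * embedding_dim, embedding_dim)

batch = {"month": torch.tensor([0, 5]), "weekday": torch.tensor([2, 6])}
concatenated = torch.cat([embeddings[name](batch[name]) for name in categorical_dims], dim=1)  # (2, 128)
conditioning_matrix = projection(concatenated)  # (2, 64)
print(conditioning_matrix.shape)  # torch.Size([2, 64])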
11 changes: 11 additions & 0 deletions generator/diffcharge/diffusion.py
@@ -156,6 +156,17 @@ def sample(self, n_samples, categorical_vars, smooth=True, guidance_scale=1.0):
x[i] = torch.tensor(filtered_x, dtype=torch.float32).to(self.device)
return x

def sample_random_conditioning_vars(self, dataset, batch_size):
sampled_rows = dataset.data.sample(n=batch_size).reset_index(drop=True)

categorical_vars = {}
for var_name in self.categorical_dims.keys():
categorical_vars[var_name] = torch.tensor(
sampled_rows[var_name].values, device=self.device
)

return categorical_vars
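
A standalone toy sketch (hypothetical frame, not the repository's dataset class) of what sample_random_conditioning_vars returns: conditioning variables are drawn jointly from rows that actually occur in the data, so combinations never seen together are never generated.

import pandas as pd
import torch

data = pd.DataFrame({"month": [0, 3, 3, 11], "weekday": [1, 4, 4, 6]})
categorical_dims = {"month": 12, "weekday": 7}

# Draw whole rows, then pull out one tensor per conditioning variable.
sampled_rows = data.sample(n=2).reset_index(drop=True)
categorical_vars = {name: torch.tensor(sampled_rows[name].values) for name in categorical_dims}
# e.g. {'month': tensor([3, 11]), 'weekday': tensor([4, 6])}; each pair comes from a real row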

def generate(self, categorical_vars):
num_samples = categorical_vars[next(iter(categorical_vars))].shape[0]
return self.sample(