changed preprocessing and made eval and model updates
Michael Fuest committed Sep 20, 2024
1 parent e5b73c8 commit f391291
Showing 8 changed files with 116 additions and 134 deletions.
22 changes: 11 additions & 11 deletions config/model_config.yaml
@@ -1,21 +1,21 @@
seq_len: 96 # should not be changed for the current datasets
input_dim: 2 # or 1 depending on user, but is dynamically set
noise_dim: 256
cond_emb_dim: 256
cond_emb_dim: 64
shuffle: True

conditioning_vars: # for each desired conditioning variable, add the name and number of categories
month: 12
weekday: 7
building_type: 3
solar: 2
car1: 2
city: 7
state: 3
# building_type: 3
# solar: 2
# car1: 2
# city: 7
# state: 3

diffcharge:
batch_size: 64
n_epochs: 1
n_epochs: 1000
init_lr: 3e-5
network: cnn # attention
guidance_scale: 1.2
@@ -29,13 +29,13 @@ diffcharge:

diffusion_ts:
batch_size: 64
n_epochs: 1
n_epochs: 1000
n_steps: 1000
base_lr: 1e-4
n_layer_enc: 4
n_layer_dec: 5
d_model: 128
cond_emb_dim: 128
d_model: 256
cond_emb_dim: 256
sampling_timesteps: null
loss_type: l1 #l2
beta_schedule: cosine #linear
@@ -63,7 +63,7 @@ diffusion_ts:

acgan:
batch_size: 16
n_epochs: 1
n_epochs: 200
validate: false # add validation during training
lr_gen: 2e-4
lr_discr: 1e-4
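
For orientation, a minimal standalone sketch (assuming the config is read with PyYAML; not part of the repository) of how these settings look once loaded after this commit: cond_emb_dim drops to 64 and only month and weekday remain as active conditioning variables.

import yaml

# Load the model configuration; key names mirror config/model_config.yaml above.
with open("config/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["cond_emb_dim"])              # 64
print(cfg["conditioning_vars"])         # {'month': 12, 'weekday': 7}
print(cfg["diffusion_ts"]["n_epochs"])  # 1000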
135 changes: 45 additions & 90 deletions datasets/pecanstreet.py
@@ -150,16 +150,10 @@ def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
)

if self.normalize:
if self.normalization_method == "group":
self.stats["grid"] = self._calculate_and_store_statistics_group(
filtered_data, "grid"
)
filtered_data = self._apply_normalization_group(filtered_data, "grid")
elif self.normalization_method == "global":
self.stats["grid"] = self._calculate_and_store_statistics_global(
filtered_data, "grid"
)
filtered_data = self._apply_normalization_global(filtered_data, "grid")
self.stats["grid"] = self._calculate_and_store_statistics(
filtered_data, "grid"
)
filtered_data = self._normalize_and_scale(filtered_data, "grid")

if self.include_generation:
solar_data = self._preprocess_solar(data)
@@ -174,11 +168,9 @@ def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
by=["dataid", "month", "weekday"]
)

def _calculate_and_store_statistics_group(
self, data: pd.DataFrame, column: str
) -> Dict:
def _calculate_and_store_statistics(self, data: pd.DataFrame, column: str) -> Dict:
"""
Calculates and stores statistical data for group normalization, such as mean and standard deviation.
Calculates and stores statistical data for group normalization, such as min, max, mean and standard deviation.
Args:
data (pd.DataFrame): The data on which to calculate statistics.
@@ -192,16 +184,19 @@ def calculate_stats(group):
all_values = np.concatenate(group[column].values)
mean = np.mean(all_values)
std = np.std(all_values)
return pd.Series({"mean": mean, "std": std})
min = np.min(all_values)
max = np.max(all_values)
return pd.Series({"mean": mean, "std": std, "min": min, "max": max})

grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
calculate_stats
)
return grouped_stats.to_dict(orient="index")
if self.normalization_method == "group":
grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
calculate_stats
)
return grouped_stats.to_dict(orient="index")
else:
return calculate_stats(data).to_dict()

def _apply_normalization_group(
self, data: pd.DataFrame, column: str
) -> pd.DataFrame:
def _normalize_and_scale(self, data: pd.DataFrame, column: str) -> pd.DataFrame:
"""
Applies group normalization to the data.
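
For reference, a standalone toy sketch (not the repository's API; names mirror the code above) of the structure _calculate_and_store_statistics produces in the "group" case: a dict keyed by (dataid, month, weekday) tuples, each holding the mean, std, min and max of the concatenated time series values.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "dataid": [1, 1, 2],
    "month": [1, 1, 6],
    "weekday": [0, 0, 3],
    "grid": [np.array([0.1, 0.5]), np.array([0.3, 0.7]), np.array([1.0, 2.0])],
})

def calculate_stats(group):
    # Concatenate all time series values in the group before computing statistics.
    all_values = np.concatenate(group["grid"].values)
    return pd.Series({"mean": all_values.mean(), "std": all_values.std(),
                      "min": all_values.min(), "max": all_values.max()})

stats = df.groupby(["dataid", "month", "weekday"]).apply(calculate_stats).to_dict(orient="index")
# e.g. stats[(1, 1, 0)] -> {'mean': 0.4, 'std': ..., 'min': 0.1, 'max': 0.7}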
@@ -213,60 +208,28 @@ def _apply_normalization_group(
pd.DataFrame: The normalized data.
"""

def normalize(row):
stats = self.stats[column][(row["dataid"], row["month"], row["weekday"])]
mean, std = stats["mean"], stats["std"]
def normalize_and_scale_row(row):
if self.normalization_method == "group":
stats = self.stats[column][
(row["dataid"], row["month"], row["weekday"])
]
else:
stats = self.stats[column]

mean, std, min, max = (
stats["mean"],
stats["std"],
stats["min"],
stats["max"],
)
values = np.array(row[column])
normalized = (values - mean) / (std + 1e-8)
if self.threshold:
normalized = np.clip(normalized, *self.threshold)
return normalized

data[column] = data.apply(normalize, axis=1)
return data

def _calculate_and_store_statistics_global(
self, data: pd.DataFrame, column: str
) -> Dict:
"""
Calculates and stores statistical data for global normalization.
Args:
data (pd.DataFrame): The data on which to calculate statistics.
column (str): The column for which to calculate statistics.
Returns:
Dict: A dictionary containing the global mean and standard deviation.
"""
all_values = np.concatenate(data[column].values)
mean = np.mean(all_values)
std = np.std(all_values)
return {"mean": mean, "std": std}

def _apply_normalization_global(
self, data: pd.DataFrame, column: str
) -> pd.DataFrame:
"""
Applies global normalization to the data.
Args:
data (pd.DataFrame): The data to normalize.
column (str): The column on which normalization is applied.
Returns:
pd.DataFrame: The normalized data.
"""

mean = self.stats[column]["mean"]
std = self.stats[column]["std"]

def normalize(values):
normalized = (values - mean) / (std + 1e-8)
if self.threshold:
normalized = np.clip(normalized, *self.threshold)
return normalized
scaled = (normalized - min) / (max - min)
return scaled

data[column] = data[column].apply(normalize)
data[column] = data.apply(normalize_and_scale_row, axis=1)
return data

def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -288,18 +251,10 @@ def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:
solar_data = solar_data[solar_data["solar"].apply(len) == 96]

if self.normalize:
if self.normalization_method == "group":
valid_stats = self._calculate_and_store_statistics_group(
solar_data, "solar"
)
self.stats["solar"] = valid_stats
solar_data = self._apply_normalization_group(solar_data, "solar")
elif self.normalization_method == "global":
valid_stats = self._calculate_and_store_statistics_global(
solar_data, "solar"
)
self.stats["solar"] = valid_stats
solar_data = self._apply_normalization_global(solar_data, "solar")
self.stats["solar"] = self._calculate_and_store_statistics(
solar_data, "solar"
)
solar_data = self._normalize_and_scale(solar_data, "solar")

return solar_data

@@ -520,18 +475,18 @@ def inverse_transform_column(self, row: pd.Series, colname: str) -> np.ndarray:
raise ValueError(
f"No stats found for {colname} with dataid={row['dataid']}, month={row['month']}, weekday={row['weekday']}"
)
mean = stats["mean"]
std = stats["std"]
elif self.normalization_method == "global":
stats = self.stats[colname]
mean = stats["mean"]
std = stats["std"]
else:
raise ValueError("Invalid normalization_method")

scaled = row[colname]
unscaled = scaled * (std + 1e-8) + mean
return unscaled
mean = stats["mean"]
std = stats["std"]
min = stats["min"]
max = stats["max"]
unscaled = row[colname] * (max - min) + min
unnormalized = unscaled * (std + 1e-8) + mean
return unnormalized

def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
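Taken together, the forward and inverse paths above stay consistent: values are z-scored with the group (or global) mean and std, then shifted and rescaled by the raw-value min and max, and inverse_transform_column undoes the two steps in reverse order. A standalone toy round-trip (values, mean, std, vmin, vmax are stand-ins, not the repository's API):

import numpy as np

values = np.array([0.2, 1.5, 3.1, 0.9])
mean, std = values.mean(), values.std()
vmin, vmax = values.min(), values.max()

# forward, mirroring _normalize_and_scale
normalized = (values - mean) / (std + 1e-8)
scaled = (normalized - vmin) / (vmax - vmin)

# inverse, mirroring inverse_transform_column
unscaled = scaled * (vmax - vmin) + vmin
recovered = unscaled * (std + 1e-8) + mean

assert np.allclose(recovered, values)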
35 changes: 20 additions & 15 deletions eval/evaluator.py
@@ -104,7 +104,7 @@ def run_eval(
syn_user_data, real_user_data = self.generate_dataset_for_eval(
model, dataset.data
)
syn_user_data["dataid"] = real_user_data.reset_index()["dataid"]
syn_user_data.reset_index()["dataid"] = real_user_data.reset_index()["dataid"]

real_user_data_inv = dataset.inverse_transform(real_user_data)
syn_user_data_inv = dataset.inverse_transform(syn_user_data)
Expand Down Expand Up @@ -149,7 +149,7 @@ def evaluate_all_user_models(self):
real_data = []

for user_id in user_ids:
if user_id == 3687:
if user_id == 2318:
syn_user_data, real_user_data = self.evaluate_for_user(user_id)
syn_data.append(syn_user_data)
real_data.append(real_user_data)
@@ -163,20 +163,21 @@ def evaluate_all_user_models(self):
self._log_final_results()

def generate_samples_for_eval(
self, dataid: int, model: Any, num_samples: int
self, dataid: int, model: Any, dataset: Any, num_samples: int
) -> pd.DataFrame:
"""
Generate synthetic samples for evaluation.
Args:
dataid (int): The ID of the data point.
model: The trained model to generate samples.
dataset: The original dataset, used to ensure that only conditioning variable combinations present in the data are generated.
num_samples (int): The number of samples to generate.
Returns:
pd.DataFrame: A DataFrame containing the generated samples.
"""
random_conditioning_vars = model.sample_random_conditioning_vars(1)
random_conditioning_vars = model.sample_random_conditioning_vars(dataset, 1)

for keys, tensor in random_conditioning_vars.items():
random_conditioning_vars[keys] = tensor.repeat(
@@ -199,33 +200,36 @@ def generate_samples_for_eval(
return syn_ts_df

def generate_dataset_for_eval(
self, model: Any, real_user_df: pd.DataFrame, num_samples: int = 100
self,
model: Any,
real_user_df: pd.DataFrame,
) -> pd.DataFrame:
"""
Generate a synthetic dataset for evaluation, using a stratified subset.
Args:
model: The trained model to generate samples.
real_user_df (pd.DataFrame): The real user data.
num_samples (int): The number of samples to generate (default: 1000).
Returns:
pd.DataFrame: A DataFrame containing the generated dataset.
pd.DataFrame: Chosen subset of the real dataset
"""
y = real_user_df["dataid"]

strat_split = StratifiedShuffleSplit(
n_splits=1, test_size=num_samples, random_state=42
)
if y.nunique() > 1: # if there are multiple users, use a stratified subset
num_samples = real_user_df.shape[0] // 10
strat_split = StratifiedShuffleSplit(
n_splits=1, test_size=num_samples, random_state=42
)

for _, subset_index in strat_split.split(real_user_df, y):
subset_real_data = real_user_df.iloc[subset_index]
for _, subset_index in strat_split.split(real_user_df, y):
subset_real_data = real_user_df.iloc[subset_index]

subset_real_data = subset_real_data.reset_index(drop=True)
real_user_df = subset_real_data.reset_index(drop=True)

real_conditioning_vars = {
name: torch.tensor(subset_real_data[name].values, dtype=torch.long)
name: torch.tensor(real_user_df[name].values, dtype=torch.long)
for name in model.opt.categorical_dims.keys()
}

@@ -236,10 +240,10 @@ def generate_dataset_for_eval(
generated_ts.shape[0], -1, generated_ts.shape[1]
)

syn_ts = subset_real_data.copy()
syn_ts = real_user_df.copy()
syn_ts["timeseries"] = list(generated_ts)

return syn_ts, subset_real_data
return syn_ts, real_user_df
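
A standalone toy sketch (assumed data, not the evaluator itself) of the stratified subsetting used in generate_dataset_for_eval: when more than one dataid is present, roughly a tenth of the rows are drawn while preserving each user's share of the data.

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

real_user_df = pd.DataFrame({"dataid": [1] * 50 + [2] * 50, "month": list(range(1, 51)) * 2})
y = real_user_df["dataid"]

if y.nunique() > 1:  # only subsample when several users are mixed together
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=real_user_df.shape[0] // 10, random_state=42)
    for _, subset_index in splitter.split(real_user_df, y):
        real_user_df = real_user_df.iloc[subset_index].reset_index(drop=True)

print(real_user_df["dataid"].value_counts())  # 5 rows for each dataid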

def evaluate_all_pv_users(self):
"""
@@ -450,6 +454,7 @@ def _create_visualizations(
samples = self.generate_samples_for_eval(
real_user_data["dataid"].iloc[0],
model,
dataset,
num_samples=100,
)
samples = dataset.inverse_transform(samples)
9 changes: 5 additions & 4 deletions generator/conditioning.py
@@ -16,14 +16,15 @@ def __init__(self, categorical_dims, embedding_dim, device):
)
total_dim = len(categorical_dims) * embedding_dim
self.mlp = nn.Sequential(
nn.Linear(total_dim, 128), nn.ReLU(), nn.Linear(128, embedding_dim)
# nn.Linear(total_dim, 128), nn.ReLU(), nn.Linear(128, embedding_dim)
nn.Linear(total_dim, embedding_dim)
).to(device)

def forward(self, categorical_vars):
embeddings = []
for name, embedding in self.category_embeddings.items():
var = categorical_vars[name].to(self.device)
embeddings.append(embedding(var))
conditioning_vector = torch.cat(embeddings, dim=1)
conditioning_vector = self.mlp(conditioning_vector)
return conditioning_vector
conditioning_matrix = torch.cat(embeddings, dim=1)
conditioning_matrix = self.mlp(conditioning_matrix)
return conditioning_matrix
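
A standalone sketch (toy dimensions; mirrors the layer shapes above but is not the repository's module) of the conditioning path after this change: one embedding per categorical variable, concatenation, then a single linear projection back to embedding_dim instead of the former two-layer MLP.

import torch
import torch.nn as nn

categorical_dims = {"month": 12, "weekday": 7}  # matches the remaining config entries
embedding_dim = 64

# One embedding table per categorical variable, then a single linear projection.
embeddings = nn.ModuleDict({name: nn.Embedding(n, embedding_dim) for name, n in categorical_dims.items()})
projection = nn.Linear(len(categorical_dims) * embedding_dim, embedding_dim)

batch = {"month": torch.tensor([0, 5]), "weekday": torch.tensor([2, 6])}
concatenated = torch.cat([embeddings[name](batch[name]) for name in categorical_dims], dim=1)  # (2, 128)
conditioning_matrix = projection(concatenated)  # (2, 64)
print(conditioning_matrix.shape)  # torch.Size([2, 64])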
11 changes: 11 additions & 0 deletions generator/diffcharge/diffusion.py
@@ -156,6 +156,17 @@ def sample(self, n_samples, categorical_vars, smooth=True, guidance_scale=1.0):
x[i] = torch.tensor(filtered_x, dtype=torch.float32).to(self.device)
return x

def sample_random_conditioning_vars(self, dataset, batch_size):
sampled_rows = dataset.data.sample(n=batch_size).reset_index(drop=True)

categorical_vars = {}
for var_name in self.categorical_dims.keys():
categorical_vars[var_name] = torch.tensor(
sampled_rows[var_name].values, device=self.device
)

return categorical_vars
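
A standalone toy sketch (hypothetical frame, not the repository's dataset class) of what sample_random_conditioning_vars returns: conditioning variables are drawn jointly from rows that actually occur in the data, so combinations never seen together are never generated.

import pandas as pd
import torch

data = pd.DataFrame({"month": [0, 3, 3, 11], "weekday": [1, 4, 4, 6]})
categorical_dims = {"month": 12, "weekday": 7}

# Draw whole rows, then pull out one tensor per conditioning variable.
sampled_rows = data.sample(n=2).reset_index(drop=True)
categorical_vars = {name: torch.tensor(sampled_rows[name].values) for name in categorical_dims}
# e.g. {'month': tensor([3, 11]), 'weekday': tensor([4, 6])}; each pair comes from a real row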

def generate(self, categorical_vars):
num_samples = categorical_vars[next(iter(categorical_vars))].shape[0]
return self.sample(