From 049dcef69dd0719a22e114d9d88cd85ab656a4ca Mon Sep 17 00:00:00 2001
From: James Fulton
Date: Tue, 3 Oct 2023 12:28:59 +0000
Subject: [PATCH] allow different learning rates for submodules + fixes

---
 configs/model/multimodal.yaml                 | 10 ++-
 pvnet/models/base_model.py                    |  2 +-
 .../multimodal/site_encoders/encoders.py      |  4 +-
 pvnet/optimizers.py                           | 78 +++++++++++++++----
 4 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/configs/model/multimodal.yaml b/configs/model/multimodal.yaml
index c654694a..9581c0da 100644
--- a/configs/model/multimodal.yaml
+++ b/configs/model/multimodal.yaml
@@ -6,7 +6,7 @@ output_quantiles: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
 
 # NWP encoder
 #--------------------------------------------
-nwp_encoder: 
+nwp_encoder:
   _target_: pvnet.models.multimodal.encoders.encoders3d.DefaultPVNet
   _partial_: True
   in_channels: 2
@@ -28,7 +28,7 @@ sat_encoder:
   conv3d_channels: 32
   image_size_pixels: 24
 
-add_image_embedding_channel: True
+add_image_embedding_channel: False
 
 #--------------------------------------------
 # PV encoder settings
@@ -80,8 +80,10 @@ pv_history_minutes: 180
 # ----------------------------------------------
 optimizer:
   _target_: pvnet.optimizers.AdamWReduceLROnPlateau
-  lr: 0.0001
-  weight_decay: 0.25
+  lr:
+    pv_encoder: 0.002
+    default: 0.0001
+  weight_decay: 0.02
   amsgrad: True
   patience: 5
   factor: 0.1
diff --git a/pvnet/models/base_model.py b/pvnet/models/base_model.py
index 74cadd39..7561c893 100644
--- a/pvnet/models/base_model.py
+++ b/pvnet/models/base_model.py
@@ -448,4 +448,4 @@ def configure_optimizers(self):
         if self.lr is not None:
             # Use learning rate found by learning rate finder callback
             self._optimizer.lr = self.lr
-        return self._optimizer(self.parameters())
+        return self._optimizer(self)
diff --git a/pvnet/models/multimodal/site_encoders/encoders.py b/pvnet/models/multimodal/site_encoders/encoders.py
index 71698c41..51549c83 100644
--- a/pvnet/models/multimodal/site_encoders/encoders.py
+++ b/pvnet/models/multimodal/site_encoders/encoders.py
@@ -85,7 +85,7 @@ def _calculate_attention(self, x):
 
     def _encode_value(self, x):
         # Shape: [batch size, sequence length, PV site]
-        pv_site_seqs = x[BatchKey.pv]
+        pv_site_seqs = x[BatchKey.pv].float()
         batch_size = pv_site_seqs.shape[0]
 
         pv_site_seqs = pv_site_seqs.swapaxes(1,2).flatten(0,1)
@@ -97,7 +97,7 @@ def forward(self, x):
         """Run model forward"""
         # Output has shape: [batch size, num_sites, value_dim]
-        encodeded_seqs = self.encode_value(x)
+        encodeded_seqs = self._encode_value(x)
 
         # Calculate learned averaging weights
         attn_avg_weights = self._calculate_attention(x)
 
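Note (not part of the patch): the optimizer classes changed below are consumed
by pytorch_lightning's `configure_optimizers()`. A minimal, hypothetical
LightningModule returning the same `([opt], [sch])` shape these classes
produce, with a ReduceLROnPlateau scheduler keyed to a monitored metric:

    import torch
    import pytorch_lightning as pl

    class TinyModule(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(4, 1)

        def configure_optimizers(self):
            opt = torch.optim.AdamW(self.parameters(), lr=1e-4)
            sch = {
                # Scheduler steps when the monitored metric plateaus
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
                    opt, factor=0.1, patience=5
                ),
                "monitor": "MAE/train",  # must match a logged metric name
            }
            return [opt], [sch]
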
diff --git a/pvnet/optimizers.py b/pvnet/optimizers.py
index 54d64af7..bf370ef7 100644
--- a/pvnet/optimizers.py
+++ b/pvnet/optimizers.py
@@ -11,7 +11,7 @@ class AbstractOptimizer(ABC):
 
     Optimizer classes will be used by model like:
     > OptimizerGenerator = AbstractOptimizer()
-    > optimizer = OptimizerGenerator(model.parameters())
+    > optimizer = OptimizerGenerator(model)
     The returned object `optimizer` must be something that may be returned by
     `pytorch_lightning`'s `configure_optimizers()` method.
     See :
@@ -33,9 +33,9 @@ def __init__(self, lr=0.0005, **kwargs):
         self.lr = lr
         self.kwargs = kwargs
 
-    def __call__(self, model_parameters):
+    def __call__(self, model):
         """Return optimizer"""
-        return torch.optim.Adam(model_parameters, lr=self.lr, **self.kwargs)
+        return torch.optim.Adam(model.parameters(), lr=self.lr, **self.kwargs)
 
 
 class AdamW(AbstractOptimizer):
@@ -46,9 +46,9 @@ def __init__(self, lr=0.0005, **kwargs):
         self.lr = lr
         self.kwargs = kwargs
 
-    def __call__(self, model_parameters):
+    def __call__(self, model):
         """Return optimizer"""
-        return torch.optim.AdamW(model_parameters, lr=self.lr, **self.kwargs)
+        return torch.optim.AdamW(model.parameters(), lr=self.lr, **self.kwargs)
 
 
 class AdamWReduceLROnPlateau(AbstractOptimizer):
@@ -56,20 +56,66 @@ class AdamWReduceLROnPlateau(AbstractOptimizer):
 
     def __init__(self, lr=0.0005, patience=3, factor=0.5, threshold=2e-4, **opt_kwargs):
         """AdamW optimizer and reduce on plateau scheduler"""
-        self.lr = lr
+        self._lr = lr
         self.patience = patience
         self.factor = factor
         self.threshold = threshold
         self.opt_kwargs = opt_kwargs
-
-    def __call__(self, model_parameters):
-        """Return optimizer"""
-        opt = torch.optim.AdamW(model_parameters, lr=self.lr, **self.opt_kwargs)
-        sch = torch.optim.lr_scheduler.ReduceLROnPlateau(
-            opt,
-            factor=self.factor,
-            patience=self.patience,
-            threshold=self.threshold,
+
+    def _call_multi(self, model):
+        """Return optimizer with separate learning rates for named submodules"""
+        remaining_params = {k: p for k, p in model.named_parameters()}
+
+        group_args = []
+
+        for key in self._lr.keys():
+            if key == "default":
+                continue
+
+            # Pop this submodule's parameters into their own group, matching
+            # them by the prefix of their parameter names
+            submodule_params = []
+            for param_name in list(remaining_params.keys()):
+                if param_name.startswith(key):
+                    submodule_params += [remaining_params.pop(param_name)]
+
+            group_args += [{"params": submodule_params, "lr": self._lr[key]}]
+
+        # All remaining parameters fall back to the default learning rate
+        remaining_params = [p for k, p in remaining_params.items()]
+        group_args += [{"params": remaining_params}]
+
+        opt = torch.optim.AdamW(
+            group_args,
+            lr=self._lr["default"] if model.lr is None else model.lr,
+            **self.opt_kwargs
         )
-        sch = {"scheduler": sch, "monitor": "MAE/train"}
+        sch = {
+            "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(
+                opt,
+                factor=self.factor,
+                patience=self.patience,
+                threshold=self.threshold,
+            ),
+            "monitor": "quantile_loss/train" if model.use_quantile_regression else "MAE/train",
+        }
         return [opt], [sch]
+
+    def __call__(self, model):
+        """Return optimizer"""
+        if not isinstance(self._lr, float):
+            return self._call_multi(model)
+        else:
+            default_lr = self._lr if model.lr is None else model.lr
+            opt = torch.optim.AdamW(model.parameters(), lr=default_lr, **self.opt_kwargs)
+            sch = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                opt,
+                factor=self.factor,
+                patience=self.patience,
+                threshold=self.threshold,
+            )
+            sch = {
+                "scheduler": sch,
+                "monitor": "quantile_loss/train" if model.use_quantile_regression else "MAE/train",
+            }
+            return [opt], [sch]
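
Note (not part of the patch): a minimal sketch of how the dict-valued `lr`
from multimodal.yaml is consumed by `_call_multi` above. The model below is a
hypothetical stand-in; parameters are grouped by name prefix, and anything
left unmatched goes into the default group:

    import torch

    # Hypothetical stand-in for the multimodal model's submodules
    model = torch.nn.ModuleDict({
        "pv_encoder": torch.nn.Linear(8, 4),
        "output_network": torch.nn.Linear(4, 1),
    })

    lr = {"pv_encoder": 0.002, "default": 0.0001}

    remaining = dict(model.named_parameters())
    groups = []
    for key in lr:
        if key == "default":
            continue
        # Names look like "pv_encoder.weight", so prefix matching
        # selects the whole submodule
        names = [n for n in list(remaining) if n.startswith(key)]
        groups.append({"params": [remaining.pop(n) for n in names], "lr": lr[key]})
    groups.append({"params": list(remaining.values())})  # default group

    opt = torch.optim.AdamW(groups, lr=lr["default"])
    print([g["lr"] for g in opt.param_groups])  # [0.002, 0.0001]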