Camera Ready check

vita-epfl · Jun 3, 2024 · 6635984 · 6635984
1 parent 48599ec
commit 6635984
Show file tree

Hide file tree

Showing 28 changed files with 203 additions and 2,598 deletions.
diff --git a/HumanPose/code/config.py b/HumanPose/code/config.py
@@ -23,6 +23,7 @@ def __init__(self) -> None:
 
         # 2. Initializing ParseConfig object --------------------------------------------------------------------------
         self.trials = conf_yml['trials']
+        self.model_name = conf_yml['model']
         self.dataset = conf_yml['dataset']
         self.experiment_settings = conf_yml['experiment_settings']
         self.architecture = conf_yml['architecture']

diff --git a/HumanPose/code/configuration.yml b/HumanPose/code/configuration.yml
@@ -1,7 +1,9 @@
-experiment_name: "TIC-TAC"
+experiment_name: "None" # A folder with this name will be created in save_path
 
 trials: 1
-use_hessian: False
+model: "ViTPose" # One of ['Hourglass', 'ViTPose']
+
+use_hessian: False # Default: False, can also be True
 
 save_path: "None"
 
@@ -12,11 +14,12 @@ dataset: {
 }
 
 experiment_settings: {
-  epochs: 100,       # Default: 100
-  lr: 0.01,          # Default: 1e-2
+  epochs: 150,       # Default: 150
+  lr: 0.001,         # Default: {1e-2: Hourglass, 1e-3: ViTPose}
   batch_size: 32,    # Default: 32
 }
 
+# ViTPose configuration is in models/vit_pose/vitpose_config.py
 architecture: {
   hourglass: {nstack: 2, channels: 64},
   aux_net: {fc: [64, 64, 64]}

diff --git a/HumanPose/code/loss.py b/HumanPose/code/loss.py
@@ -1,15 +1,18 @@
 import torch
+from typing import Union
+
 
 from utils.tic import get_positive_definite_matrix, get_tic_covariance
+from models.vit_pose.ViTPose import ViTPose
 from models.stacked_hourglass.StackedHourglass import PoseNet as Hourglass
 
 
-def mse_gradient(means: torch.Tensor) -> torch.Tensor:
+def mse_loss(means: torch.Tensor) -> torch.Tensor:
     loss = (means ** 2).sum(dim=1)
     return loss.mean()
 
 
-def nll_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
+def nll_loss(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
     precision_hat = get_positive_definite_matrix(matrix, dim)
 
     loss = -torch.logdet(precision_hat) + torch.matmul(
@@ -19,13 +22,13 @@ def nll_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.T
     return loss.mean()
 
 
-def diagonal_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
+def diagonal_loss(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
     var_hat = matrix[:, :dim] ** 2
     loss = torch.log(var_hat) + ((means ** 2) / var_hat)
     return loss.mean()
 
 
-def beta_nll_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
+def beta_nll_loss(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
     var_hat = matrix[:, :dim] ** 2
     loss = torch.log(var_hat) + ((means ** 2) / var_hat)
     scaling = torch.clone(var_hat).detach() ** 0.5
@@ -34,7 +37,7 @@ def beta_nll_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> to
     return loss.mean()
 
 
-def faithful_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
+def faithful_loss(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> torch.Tensor:
     precision_hat = get_positive_definite_matrix(matrix, dim)
 
     # This trains the mean square error module independent of NLL
@@ -51,12 +54,12 @@ def faithful_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int) -> to
     return loss.mean()
 
 
-def tic_gradient(means: torch.Tensor, matrix: torch.Tensor, dim: int,
-                 pose_net: Hourglass, pose_encodings: dict, use_hessian: bool) -> torch.Tensor:
+def tic_loss(means: torch.Tensor, matrix: torch.Tensor, dim: int, pose_net: Union[ViTPose, Hourglass],
+             pose_encodings: dict, use_hessian: bool, model_name: str, imgs: torch.Tensor) -> torch.Tensor:
 
     psd_matrix = get_positive_definite_matrix(matrix, dim)
     covariance_hat = get_tic_covariance(
-        pose_net, pose_encodings, matrix, psd_matrix, use_hessian)
+        pose_net, pose_encodings, matrix, psd_matrix, use_hessian, model_name, imgs)
 
     precision_hat = torch.linalg.inv(covariance_hat)
 

diff --git a/HumanPose/code/main.py b/HumanPose/code/main.py
@@ -3,6 +3,7 @@
 import copy
 import logging
 from tqdm import tqdm
+from typing import Union
 
 
 # Science-y imports
@@ -15,18 +16,20 @@
 from config import ParseConfig
 from dataloader import load_hp_dataset, HumanPoseDataLoader
 
-from models.auxiliary.AuxiliaryNet import AuxNet
+from models.auxiliary.AuxiliaryNet import AuxNet_HG, AuxNet_ViTPose
 from models.stacked_hourglass.StackedHourglass import PoseNet as Hourglass
+from models.vit_pose import vitpose_config
+from models.vit_pose.ViTPose import ViTPose
 
 from utils.pose import fast_argmax, soft_argmax
 from utils.pose import heatmap_loss, count_parameters
 
 from utils.tic import get_positive_definite_matrix, get_tic_covariance
 from utils.tic import calculate_tac, calculate_ll
 
-from loss import mse_gradient, nll_gradient, diagonal_gradient
-from loss import beta_nll_gradient, faithful_gradient
-from loss import tic_gradient
+from loss import mse_loss, nll_loss, diagonal_loss
+from loss import beta_nll_loss, faithful_loss
+from loss import tic_loss
 
 # Global declarations
 logging.getLogger().setLevel(logging.INFO)
@@ -42,7 +45,7 @@ def __init__(self, sampler: HumanPoseDataLoader, models: tuple, conf: ParseConfi
         """
         Train and compare various covariance methods.
         :param sampler: Instance of HumanPoseDataLoader, samples from MPII + LSP
-        :param models: Contains (Hourglass, AuxNet) models
+        :param models: Contains (Hourglass or ViTPose, AuxNet) models
         :param conf: Stores the configuration for the experiment
         :param training_pkg: Dictionary which will hold models, optimizers, schedulers etc.
         :param trial: Which trial is ongoing
@@ -115,7 +118,7 @@ def train_model(self) -> dict:
 
                     loss_covariance = self.covariance_estimation(
                         aux_net=aux_net, pose_net=net, pose_encodings=pose_features,
-                        pred=pred_uv, gt=gt_uv, name=method)
+                        pred=pred_uv, gt=gt_uv, name=method, imgs=images)
 
                     # Weight update
                     (loss_covariance + torch.mean(hm_loss)).backward()
@@ -170,9 +173,10 @@ def validation(self, e: int) -> None:
                 self.training_pkg[method]['scheduler'].step(
                     self.training_pkg[method]['loss'][self.trial][e])
 
-
-    def covariance_estimation(self, aux_net: AuxNet, pose_net: Hourglass, pose_encodings: dict, 
-                              pred: torch.Tensor, gt: torch.Tensor, name: str) -> torch.Tensor:
+
+    def covariance_estimation(self, aux_net: Union[AuxNet_HG, AuxNet_ViTPose],
+                              pose_net: Union[Hourglass, ViTPose], pose_encodings: dict, 
+                              pred: torch.Tensor, gt: torch.Tensor, name: str, imgs: torch.Tensor) -> torch.Tensor:
         """
         Computing the full covariance matrix
 
@@ -194,40 +198,44 @@ def covariance_estimation(self, aux_net: AuxNet, pose_net: Hourglass, pose_encod
 
         # Various covariance implentations ------------------------------------------------------------
         if name == 'MSE':
-            loss = mse_gradient(means)
+            loss = mse_loss(means)
 
         elif name == 'NLL':
-            loss = nll_gradient(means, matrix, out_dim)
+            loss = nll_loss(means, matrix, out_dim)
 
         elif name == 'Diagonal':
-            loss = diagonal_gradient(means, matrix, out_dim)
+            loss = diagonal_loss(means, matrix, out_dim)
 
         elif name == 'Beta-NLL':
-            loss = beta_nll_gradient(means, matrix, out_dim)
+            loss = beta_nll_loss(means, matrix, out_dim)
 
         elif name == 'Faithful':
-            loss = faithful_gradient(means, matrix, out_dim)
+            loss = faithful_loss(means, matrix, out_dim)
 
         elif name == 'TIC':
-            loss = tic_gradient(means, matrix, out_dim, pose_net, pose_encodings, self.conf.use_hessian)
+            loss = tic_loss(means, matrix, out_dim, pose_net, pose_encodings,
+                            self.conf.use_hessian, self.conf.model_name, imgs)
 
         else:
             raise NotImplementedError
 
         return loss
 
 
-    def _aux_net_inference(self, pose_features: dict, aux_net: AuxNet) -> torch.Tensor:
+    def _aux_net_inference(self, pose_features: dict, aux_net: Union[AuxNet_HG, AuxNet_ViTPose]) -> torch.Tensor:
         """
         Obtaining the flattened matrix from the aux net inference module
         """
-        with torch.no_grad():
-            depth = len(self.conf.architecture['aux_net']['spatial_dim'])
-            encodings = torch.cat(
-                [pose_features['feature_{}'.format(i)].reshape(
-                    self.batch_size, pose_features['feature_{}'.format(i)].shape[1], -1) \
-                    for i in range(depth, 0, -1)],
-                dim=2)
+        if self.conf.model_name == 'Hourglass':
+            with torch.no_grad():
+                depth = len(self.conf.architecture['aux_net']['spatial_dim'])
+                encodings = torch.cat(
+                    [pose_features['feature_{}'.format(i)].reshape(
+                        self.batch_size, pose_features['feature_{}'.format(i)].shape[1], -1) \
+                        for i in range(depth, 0, -1)],
+                    dim=2)
+        else:
+            encodings = pose_features
 
         aux_out = aux_net(encodings)
         return aux_out
@@ -284,7 +292,7 @@ def calculate_metric(self, metric: str) -> None:
                         outputs.shape[0], self.num_hm * 2)
 
                     matrix = self._aux_net_inference(pose_features, aux_net)
-                    covariance = self._get_covariance(method, matrix, net, pose_features)
+                    covariance = self._get_covariance(method, matrix, net, pose_features, images)
                     precision = torch.linalg.inv(covariance)
 
                     if metric == 'tac':
@@ -321,8 +329,8 @@ def calculate_metric(self, metric: str) -> None:
                 os.path.join(self.conf.save_path, "training_pkg_{}.pt".format(self.trial)))
 
 
-    def _get_covariance(self, name: str, matrix: torch.Tensor,
-                        pose_net: Hourglass, pose_encodings: dict) -> torch.Tensor:
+    def _get_covariance(self, name: str, matrix: torch.Tensor, pose_net: Union[Hourglass, ViTPose],
+                        pose_encodings: dict, imgs: torch.Tensor) -> torch.Tensor:
 
         out_dim = 2 * self.num_hm
 
@@ -341,44 +349,55 @@ def _get_covariance(self, name: str, matrix: torch.Tensor,
         elif name in ['TIC']:
             psd_matrix = get_positive_definite_matrix(matrix, out_dim)
             covariance_hat = get_tic_covariance(
-                pose_net, pose_encodings, matrix, psd_matrix, self.conf.use_hessian)
+                pose_net, pose_encodings, matrix, psd_matrix, self.conf.use_hessian, self.conf.model_name, imgs)
+
             return covariance_hat
 
         else:
             raise NotImplementedError
 
 
-    def _aux_net_inference(self, pose_features: dict, aux_net: AuxNet) -> torch.Tensor:
+    def _aux_net_inference(self, pose_features: dict,
+                           aux_net: Union[AuxNet_HG, AuxNet_ViTPose]) -> torch.Tensor:
         """
         Obtaining the flattened matrix from the aux net inference module
         """
-        with torch.no_grad():
-            depth = len(self.conf.architecture['aux_net']['spatial_dim'])
-            encodings = torch.cat([
-                pose_features['feature_{}'.format(i)].reshape(
-                    self.batch_size, pose_features['feature_{}'.format(i)].shape[1], -1) 
-                    for i in range(depth, 0, -1)
-                ], dim=2)
+        if self.conf.model_name == 'Hourglass':
+            with torch.no_grad():
+                depth = len(self.conf.architecture['aux_net']['spatial_dim'])
+                encodings = torch.cat(
+                    [pose_features['feature_{}'.format(i)].reshape(
+                        self.batch_size, pose_features['feature_{}'.format(i)].shape[1], -1) \
+                        for i in range(depth, 0, -1)],
+                    dim=2)
+        else:
+            encodings = pose_features
 
         aux_out = aux_net(encodings)
         return aux_out
 
 
-def init_models(conf: ParseConfig) -> (Hourglass, AuxNet):
+def init_models(conf: ParseConfig) -> tuple:
     """
     Initializes and returns Hourglass and AuxNet models
     """
 
     logging.info('Initializing Auxiliary Network')
-    aux_net = AuxNet(arch=conf.architecture['aux_net'])
 
-    logging.info('Initializing Hourglass Network')
-    pose_net = Hourglass(arch=conf.architecture['hourglass'])
-    print('Number of parameters (Hourglass): {}\n'.format(count_parameters(pose_net)))
 
-    # CUDA support: Single/Multi-GPU
-    # Hourglass net has CUDA definitions inside __init__(), specify only for aux_net
-    aux_net.cuda(torch.device('cuda:{}'.format(torch.cuda.device_count()-1)))
+    if conf.model_name == 'ViTPose':
+        logging.info('Initializing ViTPose Network')
+        pose_net = ViTPose(vitpose_config.model).cuda()
+        aux_net = AuxNet_ViTPose(arch=conf.architecture['aux_net'])
+        aux_net.cuda(torch.device('cuda:{}'.format(torch.cuda.device_count()-1)))
+        print('Number of parameters (ViTPose): {}\n'.format(count_parameters(pose_net)))
+
+    else:
+        logging.info('Initializing Hourglass Network')
+        pose_net = Hourglass(arch=conf.architecture['hourglass'])
+        aux_net = AuxNet_HG(arch=conf.architecture['aux_net'])
+        aux_net.cuda(torch.device('cuda:{}'.format(torch.cuda.device_count()-1)))
+        print('Number of parameters (Hourglass): {}\n'.format(count_parameters(pose_net)))
 
     logging.info('Successful: Model transferred to GPUs.\n')
 
@@ -405,7 +424,7 @@ def main() -> None:
         training_pkg[method]['ll'] = torch.zeros(trials, dtype=torch.float32, device='cuda')
         training_pkg[method]['loss'] = torch.zeros((trials, epochs), device='cuda')
     training_pkg['training_methods'] = training_methods 
-
+    
 
     # 2. Loading datasets -----------------------------------------------------------------------------------
     logging.info('Loading pose dataset(s)\n')

diff --git a/HumanPose/code/models/auxiliary/AuxiliaryNet.py b/HumanPose/code/models/auxiliary/AuxiliaryNet.py
@@ -6,12 +6,12 @@
 from torch.nn.parameter import Parameter
 
 
-class AuxNet(nn.Module):
+class AuxNet_HG(nn.Module):
     def __init__(self, arch):
         """
         Auxiliary network which predicts flattened matrix using intermediate outputs of the Hourglass
         """
-        super(AuxNet, self).__init__()
+        super(AuxNet_HG, self).__init__()
 
         self.fc_arch = arch['fc']
 
@@ -62,6 +62,7 @@ def forward(self, x):
 
         return x
 
+
 class ConvolutionFeatureExtractor(nn.Module):
     def __init__(self, channels):
         super(ConvolutionFeatureExtractor, self).__init__()
@@ -93,3 +94,39 @@ def forward(self, x):
             x_ = x[i+1] + x_
 
         return x_.squeeze()
+
+
+class AuxNet_ViTPose(nn.Module):
+    def __init__(self, arch):
+        """
+        Auxiliary network which predicts the flattened matrix from ViTPose features using fully connected layers
+        """
+        super(AuxNet_ViTPose, self).__init__()
+
+        self.fc_arch = arch['fc']
+
+        # Layers are collected in a plain list first and wrapped in nn.Sequential at the end
+        self.pytorch_layers = []
+
+        # Input width of the first layer is fixed at 64; NOTE(review): presumably the ViTPose feature size - confirm
+        input_nodes_fc_network = 64
+
+        in_feat = input_nodes_fc_network
+        for out_feat in self.fc_arch:
+            self.pytorch_layers.append(nn.Linear(in_features=in_feat, out_features=out_feat))
+            self.pytorch_layers.append(nn.ReLU())
+            in_feat = out_feat
+
+        self.pytorch_layers = self.pytorch_layers[:-1]  # Removing the ReLU after the output layer
+        self.pytorch_layers = nn.Sequential(*self.pytorch_layers)
+
+
+    def forward(self, x):
+        """
+        Squeezes the input and passes it through the fully connected layers.
+        :param x: Input tensor; after squeeze() its last dimension must be 64 to match the first linear layer
+        :return: Output of the last linear layer (no ReLU is applied after it)
+        """
+        x = x.squeeze()  # NOTE(review): drops every size-1 dim, including a batch dim of size 1 - confirm intended
+        x = self.pytorch_layers(x)
+        return x
diff --git a/ViTPose/code/models/vit_pose/ViTPose.py → HumanPose/code/models/vit_pose/ViTPose.py b/ViTPose/code/models/vit_pose/ViTPose.py → HumanPose/code/models/vit_pose/ViTPose.py
diff --git a/ViTPose/code/models/vit_pose/__init__.py → HumanPose/code/models/vit_pose/__init__.py b/ViTPose/code/models/vit_pose/__init__.py → HumanPose/code/models/vit_pose/__init__.py
diff --git a/ViTPose/code/models/vit_pose/backbone/vit.py → ...Pose/code/models/vit_pose/backbone/vit.py b/ViTPose/code/models/vit_pose/backbone/vit.py → ...Pose/code/models/vit_pose/backbone/vit.py
diff --git a/...it_pose/head/topdown_heatmap_base_head.py → ...it_pose/head/topdown_heatmap_base_head.py b/...it_pose/head/topdown_heatmap_base_head.py → ...it_pose/head/topdown_heatmap_base_head.py
diff --git a/..._pose/head/topdown_heatmap_simple_head.py → ..._pose/head/topdown_heatmap_simple_head.py b/..._pose/head/topdown_heatmap_simple_head.py → ..._pose/head/topdown_heatmap_simple_head.py
diff --git a/ViTPose/code/models/vit_pose/vit_utils.py → HumanPose/code/models/vit_pose/vit_utils.py b/ViTPose/code/models/vit_pose/vit_utils.py → HumanPose/code/models/vit_pose/vit_utils.py
diff --git a/...se/code/models/vit_pose/vitpose_config.py → ...se/code/models/vit_pose/vitpose_config.py b/...se/code/models/vit_pose/vitpose_config.py → ...se/code/models/vit_pose/vitpose_config.py