diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/CPG_cifar100_main_finetune_sparse.py b/CPG_cifar100_main_finetune_sparse.py deleted file mode 100644 index 60bdbd0..0000000 --- a/CPG_cifar100_main_finetune_sparse.py +++ /dev/null @@ -1,587 +0,0 @@ -"""Main entry point for doing all stuff.""" -from __future__ import division, print_function - -import argparse -import json -import warnings - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.backends.cudnn as cudnn -from torch.nn.parameter import Parameter - -import UTILS.utils as utils -import pdb -import os -import math -from tqdm import tqdm -import sys -import numpy as np -from pprint import pprint - -import models.layers as nl -import models -from UTILS.manager import Manager -import dataset -import logging -import shutil - -# To prevent PIL warnings. -warnings.filterwarnings("ignore") - -parser = argparse.ArgumentParser() -parser.add_argument('--arch', type=str, default='resnet50', - help='Architectures') -parser.add_argument('--num_classes', type=int, default=-1, - help='Num outputs for dataset') -# Optimization options. -parser.add_argument('--lr', type=float, default=0.1, - help='Learning rate for parameters, used for baselines') -parser.add_argument('--lr_mask', type=float, default=1e-4, - help='Learning rate for mask') -parser.add_argument('--lr_mask_decay_every', type=int, - help='Step decay every this many epochs') - -# parser.add_argument('--lr_classifier', type=float, -# help='Learning rate for classifier') -# parser.add_argument('--lr_classifier_decay_every', type=int, -# help='Step decay every this many epochs') - -parser.add_argument('--batch_size', type=int, default=32, - help='input batch size for training') -parser.add_argument('--val_batch_size', type=int, default=100, - help='input batch size for validation') -parser.add_argument('--workers', type=int, default=24, help='') -parser.add_argument('--weight_decay', type=float, default=0.0, - help='Weight decay') -# Masking options. -parser.add_argument('--mask_init', default='1s', - choices=['1s', 'uniform', 'weight_based_1s'], - help='Type of mask init') -parser.add_argument('--mask_scale', type=float, default=1e-2, - help='Mask initialization scaling') -parser.add_argument('--mask_scale_gradients', type=str, default='none', - choices=['none', 'average', 'individual'], - help='Scale mask gradients by weights') -parser.add_argument('--threshold_fn', - choices=['binarizer', 'ternarizer'], - help='Type of thresholding function') -parser.add_argument('--threshold', type=float, default=2e-3, help='') -# Paths. -parser.add_argument('--dataset', type=str, default='', - help='Name of dataset') -parser.add_argument('--train_path', type=str, default='', - help='Location of train data') -parser.add_argument('--val_path', type=str, default='', - help='Location of test data') -parser.add_argument('--save_prefix', type=str, default='checkpoints/', - help='Location to save model') -# Other. 
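The `--mask_init`, `--mask_scale`, `--threshold_fn`, and `--threshold` options above configure the Piggyback-style learned binary masks implemented in `models/layers.py`; that file is not part of this diff, so the following is only a sketch of the usual technique (a straight-through binarizer), with all names hypothetical:

```python
import torch
from torch.autograd import Function

class Binarizer(Function):
    """Forward: threshold a real-valued mask to {0, 1}.
    Backward: pass gradients straight through to the real-valued mask."""

    @staticmethod
    def forward(ctx, mask, threshold):
        return (mask > threshold).float()

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator; no gradient for the threshold itself.
        return grad_output, None

# Inside a hypothetical SharableConv2d.forward, the piggymask would gate
# which shared weights the current task may reuse:
#   effective_weight = self.weight * Binarizer.apply(self.piggymask, 2e-3)
```

The default `--threshold 2e-3` is the cut-off applied in that forward pass.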
-parser.add_argument('--cuda', action='store_true', default=True, - help='use CUDA') -# parser.add_argument('--no_mask', action='store_true', default=False, -# help='Used for running baselines, does not use any masking') - -parser.add_argument('--seed', type=int, default=1, help='random seed') - -parser.add_argument('--checkpoint_format', type=str, - default='./{save_folder}/checkpoint-{epoch}.pth.tar', - help='checkpoint file format') - -parser.add_argument('--epochs', type=int, default=160, - help='number of epochs to train') -parser.add_argument('--restore_epoch', type=int, default=0, help='') -parser.add_argument('--image_size', type=int, default=32, help='') -parser.add_argument('--save_folder', type=str, - help='folder name inside one_check folder') -parser.add_argument('--load_folder', default='', help='') - -# parser.add_argument('--datadir', default='/home/ivclab/decathlon-1.0/', -# help='folder containing data folder') -# parser.add_argument('--imdbdir', default='/home/ivclab/decathlon-1.0/annotations', -# help='annotation folder') - -# parser.add_argument('--train_weight', action='store_true', default=False, help='') -# parser.add_argument('--train_mask', action='store_true', default=False, help='') -# parser.add_argument('--train_classifier', action='store_true', default=False, help='') - -parser.add_argument('--pruning_interval', type=int, default=100, help='') -parser.add_argument('--pruning_frequency', type=int, default=10, help='') -parser.add_argument('--initial_sparsity', type=float, default=0.0, help='') -parser.add_argument('--target_sparsity', type=float, default=0.1, help='') - -parser.add_argument('--mode', - choices=['finetune', 'prune', 'inference'], - help='Run mode') - -parser.add_argument('--baseline_acc_file', type=str, help='file to restore baseline validation accuracy') -parser.add_argument('--network_width_multiplier', type=float, default=1.0, help='the multiplier to scale up the channel width') -parser.add_argument('--test_piggymask', action='store_true', default=False, help='') -parser.add_argument('--pruning_ratio_to_acc_record_file', type=str, help='') -parser.add_argument('--allow_acc_diff', type=float, help='') -parser.add_argument('--finetune_again', action='store_true', default=False, help='') -parser.add_argument('--max_allowed_network_width_multiplier', type=float, help='') -parser.add_argument('--log_path', type=str, help='') -parser.add_argument('--total_num_tasks', type=int, help='') -parser.add_argument('--initial_from_previous_task', action='store_true', default=False, help='') - -class Optimizers(object): - def __init__(self): - self.optimizers = [] - self.lrs = [] - # self.args = args - - def add(self, optimizer, lr): - self.optimizers.append(optimizer) - self.lrs.append(lr) - - def step(self): - for optimizer in self.optimizers: - # if isinstance(optimizer, torch.optim.Adam): - # pdb.set_trace() - optimizer.step() - - def zero_grad(self): - for optimizer in self.optimizers: - optimizer.zero_grad() - - def __getitem__(self, index): - return self.optimizers[index] - - def __setitem__(self, index, value): - self.optimizers[index] = value - -def set_logger(filepath): - global logger - logger = logging.getLogger('') - logger.setLevel(logging.INFO) - fh = logging.FileHandler(filepath) - fh.setLevel(logging.INFO) - ch = logging.StreamHandler(sys.stdout) - ch.setLevel(logging.INFO) - - _format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - fh.setFormatter(_format) - ch.setFormatter(_format) - - logger.addHandler(fh) - 
logger.addHandler(ch) - return - -def main(): - """Do stuff.""" - args = parser.parse_args() - # don't use this, neither set learning rate as a linear function - # of the count of gpus, it will make accuracy lower - # args.batch_size = args.batch_size * torch.cuda.device_count() - args.network_width_multiplier = math.sqrt(args.network_width_multiplier) - args.max_allowed_network_width_multiplier = math.sqrt(args.max_allowed_network_width_multiplier) - if args.mode == 'prune': - args.save_folder = os.path.join(args.save_folder, str(args.target_sparsity)) - if args.initial_sparsity != 0.0: - args.load_folder = os.path.join(args.load_folder, str(args.initial_sparsity)) - - if args.save_folder and not os.path.isdir(args.save_folder): - os.makedirs(args.save_folder) - - if args.log_path: - set_logger(args.log_path) - - if args.pruning_ratio_to_acc_record_file and not os.path.isdir(args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0]): - os.makedirs(args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0]) - - if not torch.cuda.is_available(): - logging.info('no gpu device available') - args.cuda = False - - torch.manual_seed(args.seed) - if args.cuda: - torch.cuda.manual_seed(args.seed) - - cudnn.benchmark = True - - # If set > 0, will resume training from a given checkpoint. - resume_from_epoch = 0 - resume_folder = args.load_folder - for try_epoch in range(200, 0, -1): - if os.path.exists(args.checkpoint_format.format( - save_folder=resume_folder, epoch=try_epoch)): - resume_from_epoch = try_epoch - break - - if args.restore_epoch: - resume_from_epoch = args.restore_epoch - - # Set default train and test path if not provided as input. - utils.set_dataset_paths(args) - - if resume_from_epoch and not args.initial_from_previous_task: - filepath = args.checkpoint_format.format(save_folder=resume_folder, epoch=resume_from_epoch) - checkpoint = torch.load(filepath) - checkpoint_keys = checkpoint.keys() - dataset_history = checkpoint['dataset_history'] - dataset2num_classes = checkpoint['dataset2num_classes'] - masks = checkpoint['masks'] - shared_layer_info = checkpoint['shared_layer_info'] - - if 'num_for_construct' in checkpoint_keys: - num_for_construct = checkpoint['num_for_construct'] - if args.mode == 'inference' and 'network_width_multiplier' in shared_layer_info[args.dataset]: # TODO, temporary solution - args.network_width_multiplier = shared_layer_info[args.dataset]['network_width_multiplier'] - else: - dataset_history = [] - dataset2num_classes = {} - masks = {} - shared_layer_info = {} - - if args.mode == 'prune' and not args.pruning_ratio_to_acc_record_file: - sys.exit(-1) - - if args.arch == 'resnet50': - num_for_construct = [64, 64, 64*4, 128, 128*4, 256, 256*4, 512, 512*4] - model = models.__dict__[args.arch](pretrained=True, num_for_construct=num_for_construct, threshold=args.threshold) - elif 'vgg' in args.arch: - custom_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] - model = models.__dict__[args.arch](custom_cfg, dataset_history=dataset_history, dataset2num_classes=dataset2num_classes, - network_width_multiplier=args.network_width_multiplier, shared_layer_info=shared_layer_info) - else: - print('Error!') - sys.exit(1) - - # Add and set the model dataset. 
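One line in `main()` above is easy to misread: `args.network_width_multiplier = math.sqrt(args.network_width_multiplier)`. Widening a conv layer scales both its input and output channels, so its weight count grows roughly quadratically in the multiplier; taking the square root first (presumably the intent here) makes the parameter budget grow about linearly with the value passed on the command line. A quick check with made-up layer sizes:

```python
import math

def conv_weight_count(c_in, c_out, k=3, m=1.0):
    # Both channel dimensions scale with m, so weights scale ~ m**2.
    return int(c_in * m) * int(c_out * m) * k * k

base = conv_weight_count(64, 128)                 # 73728 weights
grown = conv_weight_count(64, 128, m=math.sqrt(2.0))
print(grown / base)                               # ~2.0: linear in the CLI value
```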
- model.add_dataset(args.dataset, args.num_classes) - model.set_dataset(args.dataset) - - model = nn.DataParallel(model) - model = model.cuda() - - - if resume_from_epoch and args.initial_from_previous_task: - filepath = args.checkpoint_format.format(save_folder=resume_folder, epoch=resume_from_epoch) - checkpoint = torch.load(filepath) - state_dict = checkpoint['model_state_dict'] - curr_model_state_dict = model.module.state_dict() - for name, param in state_dict.items(): - if 'num_batches_tracked' in name: - continue - try: - curr_model_state_dict[name][:].copy_(param) - except: - pdb.set_trace() - print('here') - - if not masks: - for name, module in model.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - masks[name] = mask - else: - # when we expand network, we need to allocate new masks - NEED_ADJUST_MASK = False - for name, module in model.named_modules(): - if isinstance(module, nl.SharableConv2d): - if masks[name].size(1) < module.weight.data.size(1): - assert args.mode == 'finetune' - NEED_ADJUST_MASK = True - elif masks[name].size(1) > module.weight.data.size(1): - assert args.mode == 'inference' - NEED_ADJUST_MASK = True - - - if NEED_ADJUST_MASK: - if args.mode == 'finetune': - for name, module in model.named_modules(): - if isinstance(module, nl.SharableConv2d): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:masks[name].size(0), :masks[name].size(1), :, :].copy_(masks[name]) - masks[name] = mask - elif isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:masks[name].size(0), :masks[name].size(1)].copy_(masks[name]) - masks[name] = mask - elif args.mode == 'inference': - for name, module in model.named_modules(): - if isinstance(module, nl.SharableConv2d): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:, :, :, :].copy_(masks[name][:mask.size(0), :mask.size(1), :, :]) - masks[name] = mask - elif isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:, :].copy_(masks[name][:mask.size(0), :mask.size(1)]) - masks[name] = mask - - if args.dataset not in shared_layer_info: - - shared_layer_info[args.dataset] = { - 'bias': {}, - 'bn_layer_running_mean': {}, - 'bn_layer_running_var': {}, - 'bn_layer_weight': {}, - 'bn_layer_bias': {}, - 'piggymask': {} - } - - piggymasks = {} - task_id = model.module.datasets.index(args.dataset) + 1 - if task_id > 1: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - piggymasks[name] = torch.zeros_like(masks['module.' + name], dtype=torch.float32) - piggymasks[name].fill_(0.01) - piggymasks[name] = Parameter(piggymasks[name]) - module.piggymask = piggymasks[name] - #elif args.finetune_again: - # # reinitialize piggymask - # piggymasks = {} - # for name, module in model.module.named_modules(): - # if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - # piggymasks[name] = torch.zeros_like(masks['module.' 
+ name], dtype=torch.float32) - # piggymasks[name].fill_(0.01) - # piggymasks[name] = Parameter(piggymasks[name]) - # module.piggymask = piggymasks[name] - else: - try: - piggymasks = shared_layer_info[args.dataset]['piggymask'] - except: - piggymasks = {} - task_id = model.module.datasets.index(args.dataset) + 1 - if task_id > 1: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - module.piggymask = piggymasks[name] - shared_layer_info[args.dataset]['network_width_multiplier'] = args.network_width_multiplier - - if args.num_classes == 2: - train_loader = dataset.cifar100_train_loader_two_class(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader_two_class(args.dataset, args.val_batch_size) - elif args.num_classes == 5: - train_loader = dataset.cifar100_train_loader(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader(args.dataset, args.val_batch_size) - else: - print("num_classes should be either 2 or 5") - sys.exit(1) - - # if we are going to save checkpoint in other folder, then we recalculate the starting epoch - if args.save_folder != args.load_folder: - start_epoch = 0 - else: - start_epoch = resume_from_epoch - - curr_prune_step = begin_prune_step = start_epoch * len(train_loader) - end_prune_step = curr_prune_step + args.pruning_interval * len(train_loader) - - manager = Manager(args, model, shared_layer_info, masks, train_loader, val_loader, begin_prune_step, end_prune_step) - if args.mode == 'inference': - manager.load_checkpoint_only_for_evaluate(resume_from_epoch, resume_folder) - manager.validate(resume_from_epoch-1) - return - - lr = args.lr - lr_mask = args.lr_mask - # update all layers - named_params = dict(model.named_parameters()) - params_to_optimize_via_SGD = [] - named_of_params_to_optimize_via_SGD = [] - masks_to_optimize_via_Adam = [] - named_of_masks_to_optimize_via_Adam = [] - - for name, param in named_params.items(): - if 'classifiers' in name: - if '.{}.'.format(model.module.datasets.index(args.dataset)) in name: - params_to_optimize_via_SGD.append(param) - named_of_params_to_optimize_via_SGD.append(name) - continue - elif 'piggymask' in name: - masks_to_optimize_via_Adam.append(param) - named_of_masks_to_optimize_via_Adam.append(name) - else: - params_to_optimize_via_SGD.append(param) - named_of_params_to_optimize_via_SGD.append(name) - - optimizer_network = optim.SGD(params_to_optimize_via_SGD, lr=lr, - weight_decay=0.0, momentum=0.9, nesterov=True) - optimizers = Optimizers() - optimizers.add(optimizer_network, lr) - - if masks_to_optimize_via_Adam: - optimizer_mask = optim.Adam(masks_to_optimize_via_Adam, lr=lr_mask) - optimizers.add(optimizer_mask, lr_mask) - - manager.load_checkpoint(optimizers, resume_from_epoch, resume_folder) - - """Performs training.""" - curr_lrs = [] - for optimizer in optimizers: - for param_group in optimizer.param_groups: - curr_lrs.append(param_group['lr']) - break - - if args.mode == 'prune': - if 'gradual_prune' in args.load_folder and args.save_folder == args.load_folder: - args.epochs = 20 + resume_from_epoch - logging.info('') - logging.info('Before pruning: ') - logging.info('Sparsity range: {} -> {}'.format(args.initial_sparsity, args.target_sparsity)) - - must_pruning_ratio_for_curr_task = 0.0 - - json_data = {} - if os.path.isfile(args.pruning_ratio_to_acc_record_file): - with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file: - json_data = json.load(json_file) - - - #if 
args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc: - # # if we reach the upperbound and still do not get the accuracy over our target on curr task, we still do pruning - # logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task') - # remain_num_tasks = args.total_num_tasks - len(dataset_history) - # logging.info('remain_num_tasks: {}'.format(remain_num_tasks)) - # ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1) - # logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task)) - # must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task - # if args.initial_sparsity >= must_pruning_ratio_for_curr_task: - # sys.exit(6) - - - manager.validate(start_epoch-1) - logging.info('') - elif args.mode == 'finetune': - if not args.finetune_again: - # manager.pruner.make_finetuning_mask() - logging.info('Finetune stage...') - manager.pruner.current_dataset_idx += 1 - for name, module in model.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - mask = masks[name] - mask[mask.eq(0)] = manager.pruner.current_dataset_idx - else: - logging.info('Piggymask Retrain...') - history_best_avg_val_acc_when_retraining = manager.validate(start_epoch-1) - num_epochs_that_criterion_does_not_get_better = 0 - - stop_lr_mask = True - if manager.pruner.calculate_curr_task_ratio() == 0.0: - logging.info('There is no left space in convolutional layer for curr task' - ', we will try to use prior experience as long as possible') - stop_lr_mask = False - - for epoch_idx in range(start_epoch, args.epochs): - avg_train_acc, curr_prune_step = manager.train(optimizers, epoch_idx, curr_lrs, curr_prune_step) - - avg_val_acc = manager.validate(epoch_idx) - - # if args.mode == 'prune' and (epoch_idx+1) >= (args.pruning_interval + start_epoch) and ( - # avg_val_acc > history_best_avg_val_acc_when_prune): - # pass - if args.finetune_again: - if avg_val_acc > history_best_avg_val_acc_when_retraining: - history_best_avg_val_acc_when_retraining = avg_val_acc - - num_epochs_that_criterion_does_not_get_better = 0 - if args.save_folder is not None: - for path in os.listdir(args.save_folder): - if '.pth.tar' in path: - os.remove(os.path.join(args.save_folder, path)) - else: - print('Something is wrong! Block the program with pdb') - pdb.set_trace() - - history_best_avg_val_acc = avg_val_acc - manager.save_checkpoint(optimizers, epoch_idx, args.save_folder) - else: - num_epochs_that_criterion_does_not_get_better += 1 - - if args.finetune_again and num_epochs_that_criterion_does_not_get_better == 5: - logging.info("stop retraining") - sys.exit(0) - - if args.mode == 'finetune': - if epoch_idx + 1 == 50 or epoch_idx + 1 == 80: - for param_group in optimizers[0].param_groups: - param_group['lr'] *= 0.1 - curr_lrs[0] = param_group['lr'] - if len(optimizers.lrs) == 2: - if epoch_idx + 1 == 50: - for param_group in optimizers[1].param_groups: - param_group['lr'] *= 0.2 - if stop_lr_mask and epoch_idx + 1 == 70: - for param_group in optimizers[1].param_groups: - param_group['lr'] *= 0.0 - - curr_lrs[1] = param_group['lr'] - - if args.save_folder is not None: - pass - # paths = os.listdir(args.save_folder) - # if paths and '.pth.tar' in paths[0]: - # for checkpoint_file in paths: - # os.remove(os.path.join(args.save_folder, checkpoint_file)) - else: - print('Something is wrong! 
Block the program with pdb') - pdb.set_trace() - - if avg_train_acc > 0.95: - manager.save_checkpoint(optimizers, epoch_idx, args.save_folder) - - logging.info('-' * 16) - - if args.pruning_ratio_to_acc_record_file: - json_data = {} - if os.path.isfile(args.pruning_ratio_to_acc_record_file): - with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file: - json_data = json.load(json_file) - if args.mode == 'finetune' and not args.test_piggymask: - json_data[0.0] = round(avg_val_acc, 4) - - if args.baseline_acc_file: - baseline_json_data = {} - if os.path.isfile(args.baseline_acc_file): - with open(args.baseline_acc_file) as json_file: - baseline_json_data = json.load(json_file) - baseline_json_data[args.dataset] = '{:.4f}'.format(avg_val_acc) - with open(args.baseline_acc_file, 'w') as json_file: - json.dump(baseline_json_data, json_file) - - with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file: - json.dump(json_data, json_file) - if avg_train_acc > 0.95:# and avg_val_acc >= baseline_acc: - pass - - if manager.pruner.calculate_curr_task_ratio() == 0.0: - logging.info('There is no left space in convolutional layer for curr task, so needless to prune') - sys.exit(5) - - elif args.mode == 'prune': - if avg_train_acc > 0.95: - json_data[args.target_sparsity] = round(avg_val_acc, 4) - with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file: - json.dump(json_data, json_file) - else: - sys.exit(6) - - must_pruning_ratio_for_curr_task = 0.0 - - #if args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc: - # # if we reach the upperbound and still do not get the accuracy over our target on curr task, we still do pruning - # logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task') - # remain_num_tasks = args.total_num_tasks - len(dataset_history) - # logging.info('remain_num_tasks: {}'.format(remain_num_tasks)) - # ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1) - # logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task)) - # must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task - # if args.target_sparsity >= must_pruning_ratio_for_curr_task: - # sys.exit(6) - -if __name__ == '__main__': - main() diff --git a/CPG_cifar100_with_one_mask.py b/CPG_cifar100_with_one_mask.py deleted file mode 100644 index bc71b4b..0000000 --- a/CPG_cifar100_with_one_mask.py +++ /dev/null @@ -1,656 +0,0 @@ -"""Main entry point for doing all stuff.""" -from __future__ import division, print_function - -import argparse -import json -import warnings - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.backends.cudnn as cudnn -from torch.nn.parameter import Parameter - -import UTILS.utils as utils -import pdb -import os -import math -from tqdm import tqdm -import sys -import numpy as np -from pprint import pprint - -import models_with_one_mask.layers as nl -import models_with_one_mask as models -from UTILS.manager_with_one_mask import Manager -import UTILS.dataset as dataset -import logging -import shutil - -# To prevent PIL warnings. -warnings.filterwarnings("ignore") - -parser = argparse.ArgumentParser() -parser.add_argument('--arch', type=str, default='resnet50', - help='Architectures') -parser.add_argument('--num_classes', type=int, default=-1, - help='Num outputs for dataset') -# Optimization options. 
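The pruning bookkeeping in the file above (`begin_prune_step = start_epoch * len(train_loader)` and `end_prune_step = begin_prune_step + args.pruning_interval * len(train_loader)`, together with `--initial_sparsity`, `--target_sparsity`, and `--pruning_frequency`) matches the gradual-pruning recipe of Zhu and Gupta (2017). The schedule itself lives in the pruner under `UTILS/`, which this diff does not include; a sketch of the cubic ramp that recipe uses:

```python
def sparsity_at(step, begin_step, end_step, s_init, s_target):
    """Cubic sparsity ramp from s_init to s_target over [begin_step, end_step],
    as in Zhu & Gupta, "To prune, or not to prune" (2017)."""
    if step <= begin_step:
        return s_init
    if step >= end_step:
        return s_target
    progress = (step - begin_step) / float(end_step - begin_step)
    return s_target + (s_init - s_target) * (1.0 - progress) ** 3

# A pruner would re-threshold the smallest-magnitude weights every
# --pruning_frequency steps to hit sparsity_at(curr_prune_step, ...).
```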
-parser.add_argument('--lr', type=float, default=0.1, - help='Learning rate for parameters, used for baselines') -parser.add_argument('--lr_mask', type=float, default=1e-4, - help='Learning rate for mask') -parser.add_argument('--lr_mask_decay_every', type=int, - help='Step decay every this many epochs') - -# parser.add_argument('--lr_classifier', type=float, -# help='Learning rate for classifier') -# parser.add_argument('--lr_classifier_decay_every', type=int, -# help='Step decay every this many epochs') - -parser.add_argument('--batch_size', type=int, default=32, - help='input batch size for training') -parser.add_argument('--val_batch_size', type=int, default=100, - help='input batch size for validation') -parser.add_argument('--workers', type=int, default=24, help='') -parser.add_argument('--weight_decay', type=float, default=0.0, - help='Weight decay') -# Masking options. -parser.add_argument('--mask_init', default='1s', - choices=['1s', 'uniform', 'weight_based_1s'], - help='Type of mask init') -parser.add_argument('--mask_scale', type=float, default=1e-2, - help='Mask initialization scaling') -parser.add_argument('--mask_scale_gradients', type=str, default='none', - choices=['none', 'average', 'individual'], - help='Scale mask gradients by weights') -parser.add_argument('--threshold_fn', - choices=['binarizer', 'ternarizer'], - help='Type of thresholding function') -parser.add_argument('--threshold', type=float, default=2e-3, help='') -# Paths. -parser.add_argument('--dataset', type=str, default='', - help='Name of dataset') -parser.add_argument('--train_path', type=str, default='', - help='Location of train data') -parser.add_argument('--val_path', type=str, default='', - help='Location of test data') -parser.add_argument('--save_prefix', type=str, default='checkpoints/', - help='Location to save model') -# Other. 
-parser.add_argument('--cuda', action='store_true', default=True, - help='use CUDA') -# parser.add_argument('--no_mask', action='store_true', default=False, -# help='Used for running baselines, does not use any masking') - -parser.add_argument('--seed', type=int, default=1, help='random seed') - -parser.add_argument('--checkpoint_format', type=str, - default='./{save_folder}/checkpoint-{epoch}.pth.tar', - help='checkpoint file format') - -parser.add_argument('--epochs', type=int, default=160, - help='number of epochs to train') -parser.add_argument('--restore_epoch', type=int, default=0, help='') -parser.add_argument('--image_size', type=int, default=32, help='') -parser.add_argument('--save_folder', type=str, - help='folder name inside one_check folder') -parser.add_argument('--load_folder', default='', help='') - -# parser.add_argument('--datadir', default='/home/ivclab/decathlon-1.0/', -# help='folder containing data folder') -# parser.add_argument('--imdbdir', default='/home/ivclab/decathlon-1.0/annotations', -# help='annotation folder') - -# parser.add_argument('--train_weight', action='store_true', default=False, help='') -# parser.add_argument('--train_mask', action='store_true', default=False, help='') -# parser.add_argument('--train_classifier', action='store_true', default=False, help='') - -parser.add_argument('--pruning_interval', type=int, default=100, help='') -parser.add_argument('--pruning_frequency', type=int, default=10, help='') -parser.add_argument('--initial_sparsity', type=float, default=0.0, help='') -parser.add_argument('--target_sparsity', type=float, default=0.1, help='') - -parser.add_argument('--mode', - choices=['finetune', 'prune', 'inference'], - help='Run mode') - -parser.add_argument('--baseline_acc_file', type=str, help='file to restore baseline validation accuracy') -parser.add_argument('--network_width_multiplier', type=float, default=1.0, help='the multiplier to scale up the channel width') -parser.add_argument('--test_piggymask', action='store_true', default=False, help='') -parser.add_argument('--pruning_ratio_to_acc_record_file', type=str, help='') -parser.add_argument('--allow_acc_diff', type=float, help='') -parser.add_argument('--finetune_again', action='store_true', default=False, help='') -parser.add_argument('--max_allowed_network_width_multiplier', type=float, help='') -parser.add_argument('--log_path', type=str, help='') -parser.add_argument('--total_num_tasks', type=int, help='') - -class Optimizers(object): - def __init__(self): - self.optimizers = [] - self.lrs = [] - # self.args = args - - def add(self, optimizer, lr): - self.optimizers.append(optimizer) - self.lrs.append(lr) - - def step(self): - for optimizer in self.optimizers: - # if isinstance(optimizer, torch.optim.Adam): - # pdb.set_trace() - optimizer.step() - - def zero_grad(self): - for optimizer in self.optimizers: - optimizer.zero_grad() - - def __getitem__(self, index): - return self.optimizers[index] - - def __setitem__(self, index, value): - self.optimizers[index] = value - -def set_logger(filepath): - global logger - logger = logging.getLogger('') - logger.setLevel(logging.INFO) - fh = logging.FileHandler(filepath) - fh.setLevel(logging.INFO) - ch = logging.StreamHandler(sys.stdout) - ch.setLevel(logging.INFO) - - _format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - fh.setFormatter(_format) - ch.setFormatter(_format) - - logger.addHandler(fh) - logger.addHandler(ch) - return - -def main(): - """Do stuff.""" - args = parser.parse_args() - # 
don't use this, neither set learning rate as a linear function - # of the count of gpus, it will make accuracy lower - # args.batch_size = args.batch_size * torch.cuda.device_count() - - if args.mode == 'prune': - args.save_folder = os.path.join(args.save_folder, str(args.target_sparsity)) - if args.initial_sparsity != 0.0: - args.load_folder = os.path.join(args.load_folder, str(args.initial_sparsity)) - - if args.save_folder and not os.path.isdir(args.save_folder): - os.makedirs(args.save_folder) - - if args.log_path: - set_logger(args.log_path) - - if args.pruning_ratio_to_acc_record_file and not os.path.isdir(args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0]): - os.makedirs(args.pruning_ratio_to_acc_record_file.rsplit('/', 1)[0]) - - if not torch.cuda.is_available(): - logging.info('no gpu device available') - args.cuda = False - - torch.manual_seed(args.seed) - if args.cuda: - torch.cuda.manual_seed(args.seed) - - cudnn.benchmark = True - - # If set > 0, will resume training from a given checkpoint. - resume_from_epoch = 0 - resume_folder = args.load_folder - for try_epoch in range(200, 0, -1): - if os.path.exists(args.checkpoint_format.format( - save_folder=resume_folder, epoch=try_epoch)): - resume_from_epoch = try_epoch - break - - if args.restore_epoch: - resume_from_epoch = args.restore_epoch - - # Set default train and test path if not provided as input. - utils.set_dataset_paths(args) - if resume_from_epoch: - filepath = args.checkpoint_format.format(save_folder=resume_folder, epoch=resume_from_epoch) - checkpoint = torch.load(filepath) - checkpoint_keys = checkpoint.keys() - dataset_history = checkpoint['dataset_history'] - dataset2num_classes = checkpoint['dataset2num_classes'] - masks = checkpoint['masks'] - shared_layer_info = checkpoint['shared_layer_info'] - piggymask_floats = checkpoint['piggymask_floats'] - piggymask_task_tags = checkpoint['piggymask_task_tags'] - - if 'num_for_construct' in checkpoint_keys: - num_for_construct = checkpoint['num_for_construct'] - if args.mode == 'inference' and 'network_width_multiplier' in shared_layer_info[args.dataset]: # TODO, temporary solution - args.network_width_multiplier = shared_layer_info[args.dataset]['network_width_multiplier'] - else: - dataset_history = [] - dataset2num_classes = {} - masks = {} - shared_layer_info = {} - piggymask_floats = {} - piggymask_task_tags = {} - - if args.baseline_acc_file is None or not os.path.isfile(args.baseline_acc_file): - sys.exit(3) - with open(args.baseline_acc_file, 'r') as jsonfile: - json_data = json.load(jsonfile) - baseline_acc = float(json_data[args.dataset]) - - if args.mode == 'prune' and not args.pruning_ratio_to_acc_record_file: - sys.exit(-1) - - if args.arch == 'resnet50': - num_for_construct = [64, 64, 64*4, 128, 128*4, 256, 256*4, 512, 512*4] - model = models.__dict__[args.arch](pretrained=True, num_for_construct=num_for_construct, threshold=args.threshold) - elif 'vgg' in args.arch: - custom_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] - model = models.__dict__[args.arch](custom_cfg, dataset_history=dataset_history, dataset2num_classes=dataset2num_classes, - network_width_multiplier=args.network_width_multiplier, shared_layer_info=shared_layer_info, groups=int(args.network_width_multiplier)) - else: - print('Error!') - sys.exit(1) - - # Add and set the model dataset. 
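As the resume path above shows, a checkpoint in this variant is a plain dict, not a bare `state_dict`: besides the weights it carries the task bookkeeping (`dataset_history`, `dataset2num_classes`), the integer ownership `masks`, the per-task `shared_layer_info`, and the one-mask extras `piggymask_floats` and `piggymask_task_tags`. A minimal sketch of writing that layout (key names taken from the loading code above; the save itself happens inside the Manager, not shown here):

```python
import torch

def save_cpg_checkpoint(path, model, masks, dataset_history, dataset2num_classes,
                        shared_layer_info, piggymask_floats, piggymask_task_tags):
    torch.save({
        'model_state_dict': model.module.state_dict(),  # DataParallel-wrapped
        'dataset_history': dataset_history,             # order tasks were learned
        'dataset2num_classes': dataset2num_classes,     # per-task head sizes
        'masks': masks,                                 # weight -> owning task id
        'shared_layer_info': shared_layer_info,         # per-task bias/BN copies
        'piggymask_floats': piggymask_floats,
        'piggymask_task_tags': piggymask_task_tags,
    }, path)
```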
- model.add_dataset(args.dataset, args.num_classes) - model.set_dataset(args.dataset) - - model = nn.DataParallel(model) - model = model.cuda() - - NEED_ADJUST_MASK = False - task_id = model.module.datasets.index(args.dataset) + 1 - if not masks: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - masks[name] = mask - module.packnet_mask = mask - else: - # when we expand network, we need to allocate new masks - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - if masks[name].size(0) < module.weight.data.size(0): - assert args.mode == 'finetune' - NEED_ADJUST_MASK = True - elif masks[name].size(0) > module.weight.data.size(0): - assert args.mode == 'inference' - NEED_ADJUST_MASK = True - - - if NEED_ADJUST_MASK: - if args.mode == 'finetune': - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:masks[name].size(0), :, :, :].copy_(masks[name]) - masks[name] = mask - elif isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:masks[name].size(0), :masks[name].size(1)].copy_(masks[name]) - masks[name] = mask - elif args.mode == 'inference': - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:, :, :, :].copy_(masks[name][:mask.size(0), :, :, :]) - masks[name] = mask - elif isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(task_id) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() - mask[:, :].copy_(masks[name][:mask.size(0), :mask.size(1)]) - masks[name] = mask - - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - module.packnet_mask = masks[name] - - - if args.dataset not in shared_layer_info: - - shared_layer_info[args.dataset] = { - 'bias': {}, - 'bn_layer_running_mean': {}, - 'bn_layer_running_var': {}, - 'bn_layer_weight': {}, - 'bn_layer_bias': {} - } - - NEED_ADJUST_MASK = False - if task_id == 1: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - module.inference_task_id = task_id - - elif task_id == 2 and not piggymask_floats: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - piggymask_floats[name] = torch.zeros_like(masks[name], dtype=torch.float32) - piggymask_task_tags[name] = torch.zeros_like(masks[name]) - piggymask_floats[name] = torch.where(masks[name] != 0, - torch.full_like(piggymask_floats[name], 0.01), piggymask_floats[name]) - piggymask_task_tags[name] = torch.where(masks[name] != 0, - torch.full_like(piggymask_task_tags[name], task_id), piggymask_task_tags[name]) - piggymask_floats[name] = Parameter(piggymask_floats[name]) - module.piggymask_float = piggymask_floats[name] - module.piggymask_task_tag = 
piggymask_task_tags[name] - module.inference_task_id = task_id - elif task_id >= 2: - # when we expand network, we need to allocate new piggymasks - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - if piggymask_floats[name].size(0) < module.weight.data.size(0): - assert args.mode == 'finetune' - NEED_ADJUST_MASK = True - elif piggymask_floats[name].size(0) > module.weight.data.size(0): - assert args.mode == 'inference' - NEED_ADJUST_MASK = True - - if NEED_ADJUST_MASK: - if args.mode == 'finetune': - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32) - piggymask_task_tag = torch.zeros_like(masks[name]) - piggymask_float[:piggymask_floats[name].size(0), :, :, :].copy_(piggymask_floats[name]) - piggymask_task_tag[:piggymask_task_tags[name].size(0), :, :, :].copy_( - piggymask_task_tags[name]) - piggymask_floats[name] = Parameter(piggymask_float) - piggymask_task_tags[name] = piggymask_task_tag - elif isinstance(module, nl.SharableLinear): - piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32) - piggymask_task_tag = torch.zeros_like(masks[name]) - piggymask_float[:piggymask_floats[name].size(0), :piggymask_floats[name].size(1)].copy_( - piggymask_floats[name]) - piggymask_task_tag[:piggymask_task_tags[name].size(0), :piggymask_task_tags[name].size(1)].copy_( - piggymask_task_tags[name]) - piggymask_floats[name] = Parameter(piggymask_float) - piggymask_task_tags[name] = piggymask_task_tag - elif args.mode == 'inference': - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d): - piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32) - piggymask_task_tag = torch.zeros_like(masks[name]) - piggymask_float[:, :, :, :].copy_( - piggymask_floats[name][:piggymask_float.size(0), :, :, :]) - piggymask_floats[name] = Parameter(piggymask_float) - piggymask_task_tag[:, :, :, :].copy_( - piggymask_task_tags[name][:piggymask_task_tag.size(0), :, :, :]) - piggymask_task_tags[name] = piggymask_task_tag - elif isinstance(module, nl.SharableLinear): - piggymask_float = torch.zeros_like(masks[name], dtype=torch.float32) - piggymask_task_tag = torch.zeros_like(masks[name]) - piggymask_float[:, :].copy_( - piggymask_floats[name][:piggymask_float.size(0), :piggymask_float.size(1)]) - piggymask_floats[name] = Parameter(piggymask_float) - piggymask_task_tag[:, :].copy_( - piggymask_task_tags[name][:piggymask_task_tag.size(0), :piggymask_task_tag.size(1)]) - piggymask_task_tags[name] = piggymask_task_tag - - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - if args.mode == 'finetune' and not args.finetune_again: - piggymask_task_tags[name].data[piggymask_task_tags[name].data.eq(0) & (masks[name] != 0)] = task_id - piggymask_floats[name].data[piggymask_task_tags[name].data.eq(task_id)] = 0.01 - - module.piggymask_float = piggymask_floats[name] - module.piggymask_task_tag = piggymask_task_tags[name] - module.inference_task_id = task_id - - shared_layer_info[args.dataset]['network_width_multiplier'] = args.network_width_multiplier - - if args.num_classes == 2: - train_loader = dataset.cifar100_train_loader_two_class(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader_two_class(args.dataset, args.val_batch_size) - elif args.num_classes == 5: - train_loader = 
dataset.cifar100_train_loader(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader(args.dataset, args.val_batch_size) - else: - print("num_classes should be either 2 or 5") - sys.exit(1) - - # if we are going to save checkpoint in other folder, then we recalculate the starting epoch - if args.save_folder != args.load_folder: - start_epoch = 0 - else: - start_epoch = resume_from_epoch - - curr_prune_step = begin_prune_step = start_epoch * len(train_loader) - end_prune_step = curr_prune_step + args.pruning_interval * len(train_loader) - manager = Manager(args, model, shared_layer_info, masks, train_loader, val_loader, begin_prune_step, end_prune_step) - if args.mode == 'inference': - manager.load_checkpoint_only_for_evaluate(resume_from_epoch, resume_folder) - manager.validate(resume_from_epoch-1) - return - - # manager.inference_dataset_idx - lr = args.lr - lr_mask = args.lr_mask - # update all layers - named_params = dict(model.named_parameters()) - params_to_optimize_via_SGD = [] - named_of_params_to_optimize_via_SGD = [] - masks_to_optimize_via_Adam = [] - named_of_masks_to_optimize_via_Adam = [] - - for name, param in named_params.items(): - if 'classifiers' in name: - if '.{}.'.format(model.module.datasets.index(args.dataset)) in name: - params_to_optimize_via_SGD.append(param) - named_of_params_to_optimize_via_SGD.append(name) - continue - elif 'piggymask' in name: - masks_to_optimize_via_Adam.append(param) - named_of_masks_to_optimize_via_Adam.append(name) - else: - params_to_optimize_via_SGD.append(param) - named_of_params_to_optimize_via_SGD.append(name) - - optimizer_network = optim.SGD(params_to_optimize_via_SGD, lr=lr, - weight_decay=0.0, momentum=0.9, nesterov=True) - optimizers = Optimizers() - optimizers.add(optimizer_network, lr) - - if masks_to_optimize_via_Adam: - optimizer_mask = optim.Adam(masks_to_optimize_via_Adam, lr=lr_mask) - optimizers.add(optimizer_mask, lr_mask) - - manager.load_checkpoint(optimizers, resume_from_epoch, resume_folder, NEED_ADJUST_MASK) - -# k = int(args.network_width_multiplier) -# assert k >= 2 -# for name, module in model.module.named_modules(): -# if isinstance(module, nl.SharableConv2d): -# n = len(module.weight) -# n = int((n // k * (k-1)) * 0.1) -# # module.weight.data[:n, :, :, :] = 0.0 -# module.packnet_mask[:n, :, :, :] = 255 - -# if isinstance(module, nl.SharableLinear): -# n = len(module.bias) -# n = int((n // k * (k-1)) * 0.1) -# # module.weight.data[:n, :] = 0.0 -# # module.bias.data[:n] = 0.0 -# module.packnet_mask[:n, :] = 255 - - -# if isinstance(module, nn.BatchNorm2d): -# n = len(module.weight) -# n = int((n // k * (k-1)) * 0.1) -# # module.weight.data[:n] = 0.0 - - """Performs training.""" - curr_lrs = [] - for optimizer in optimizers: - for param_group in optimizer.param_groups: - curr_lrs.append(param_group['lr']) - break - - if args.mode == 'prune': - if 'gradual_prune' in args.load_folder and args.save_folder == args.load_folder: - args.epochs = 20 + resume_from_epoch - logging.info('') - logging.info('Before pruning: ') - logging.info('Sparsity range: {} -> {}'.format(args.initial_sparsity, args.target_sparsity)) - - must_pruning_ratio_for_curr_task = 0.0 - - json_data = {} - if os.path.isfile(args.pruning_ratio_to_acc_record_file): - with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file: - json_data = json.load(json_file) - - - if args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc: - # if we reach the upperbound and still do not 
get the accuracy over our target on curr task, we still do pruning - logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task') - remain_num_tasks = args.total_num_tasks - len(dataset_history) - logging.info('remain_num_tasks: {}'.format(remain_num_tasks)) - ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1) - logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task)) - must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task - if args.initial_sparsity >= must_pruning_ratio_for_curr_task: - sys.exit(6) - - - manager.validate(start_epoch-1) - logging.info('') - elif args.mode == 'finetune': - if not args.finetune_again: - manager.pruner.make_finetuning_mask() - logging.info('Finetune stage...') - else: - logging.info('Piggymask Retrain...') - history_best_avg_val_acc_when_retraining = manager.validate(start_epoch-1) - num_epochs_that_criterion_does_not_get_better = 0 - - stop_lr_mask = True - if manager.pruner.calculate_curr_task_ratio() == 0.0: - logging.info('There is no left space in convolutional layer for curr task' - ', we will try to use prior experience as long as possible') - stop_lr_mask = False - - for epoch_idx in range(start_epoch, args.epochs): - avg_train_acc, curr_prune_step = manager.train(optimizers, epoch_idx, curr_lrs, curr_prune_step) - - avg_val_acc = manager.validate(epoch_idx) - - # if args.mode == 'prune' and (epoch_idx+1) >= (args.pruning_interval + start_epoch) and ( - # avg_val_acc > history_best_avg_val_acc_when_prune): - # pass - if args.finetune_again: - if avg_val_acc > history_best_avg_val_acc_when_retraining: - history_best_avg_val_acc_when_retraining = avg_val_acc - - num_epochs_that_criterion_does_not_get_better = 0 - if args.save_folder is not None: - for path in os.listdir(args.save_folder): - if '.pth.tar' in path: - os.remove(os.path.join(args.save_folder, path)) - else: - print('Something is wrong! Block the program with pdb') - pdb.set_trace() - - history_best_avg_val_acc = avg_val_acc - manager.save_checkpoint(optimizers, epoch_idx, args.save_folder) - else: - num_epochs_that_criterion_does_not_get_better += 1 - - if args.finetune_again and num_epochs_that_criterion_does_not_get_better == 5: - logging.info("stop retraining") - sys.exit(0) - - if args.mode == 'finetune': - if epoch_idx + 1 == 50 or epoch_idx + 1 == 80: - for param_group in optimizers[0].param_groups: - param_group['lr'] *= 0.1 - curr_lrs[0] = param_group['lr'] - if len(optimizers.lrs) == 2: - if epoch_idx + 1 == 50: - for param_group in optimizers[1].param_groups: - param_group['lr'] *= 0.2 - if stop_lr_mask and epoch_idx + 1 == 70: - for param_group in optimizers[1].param_groups: - param_group['lr'] *= 0.0 - - curr_lrs[1] = param_group['lr'] - - if args.save_folder is not None: - pass - # paths = os.listdir(args.save_folder) - # if paths and '.pth.tar' in paths[0]: - # for checkpoint_file in paths: - # os.remove(os.path.join(args.save_folder, checkpoint_file)) - else: - print('Something is wrong! 
Block the program with pdb') - - - if task_id >= 2: - for name, module in model.module.named_modules(): - if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - if args.mode == 'finetune': - module.piggymask_task_tag[module.piggymask_float.le(0.005)] = 0 - - if avg_train_acc > 0.95: - manager.save_checkpoint(optimizers, epoch_idx, args.save_folder) - - logging.info('-' * 16) - - if args.pruning_ratio_to_acc_record_file: - json_data = {} - if os.path.isfile(args.pruning_ratio_to_acc_record_file): - with open(args.pruning_ratio_to_acc_record_file, 'r') as json_file: - json_data = json.load(json_file) - - if args.mode == 'finetune' and not args.test_piggymask: - json_data[0.0] = round(avg_val_acc, 4) - with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file: - json.dump(json_data, json_file) - if avg_train_acc > 0.95 and avg_val_acc >= baseline_acc: - pass - else: - logging.info("It's time to expand the Network") - logging.info('Auto expand network') - sys.exit(2) - - if manager.pruner.calculate_curr_task_ratio() == 0.0: - logging.info('There is no left space in convolutional layer for curr task, so needless to prune') - sys.exit(5) - - elif args.mode == 'prune': - if avg_train_acc > 0.95: - json_data[args.target_sparsity] = round(avg_val_acc, 4) - with open(args.pruning_ratio_to_acc_record_file, 'w') as json_file: - json.dump(json_data, json_file) - else: - sys.exit(6) - - must_pruning_ratio_for_curr_task = 0.0 - - if args.network_width_multiplier == args.max_allowed_network_width_multiplier and json_data['0.0'] < baseline_acc: - # if we reach the upperbound and still do not get the accuracy over our target on curr task, we still do pruning - logging.info('we reach the upperbound and still do not get the accuracy over our target on curr task') - remain_num_tasks = args.total_num_tasks - len(dataset_history) - logging.info('remain_num_tasks: {}'.format(remain_num_tasks)) - ratio_allow_for_curr_task = round(1.0 / (remain_num_tasks + 1), 1) - logging.info('ratio_allow_for_curr_task: {:.4f}'.format(ratio_allow_for_curr_task)) - must_pruning_ratio_for_curr_task = 1.0 - ratio_allow_for_curr_task - if args.target_sparsity >= must_pruning_ratio_for_curr_task: - sys.exit(6) - -if __name__ == '__main__': - main() diff --git a/CPG_cifar100_main.py b/CPG_face_main.py similarity index 59% rename from CPG_cifar100_main.py rename to CPG_face_main.py index daa27f8..9fad275 100644 --- a/CPG_cifar100_main.py +++ b/CPG_face_main.py @@ -1,19 +1,19 @@ """Main entry point for doing all stuff.""" -# always choose the best checkpoint - from __future__ import division, print_function import argparse import json import warnings +import logging import torch import torch.nn as nn import torch.optim as optim import torch.backends.cudnn as cudnn from torch.nn.parameter import Parameter +import torchvision.transforms as transforms -import UTILS.utils as utils +import FACE_UTILS as utils import pdb import os import math @@ -24,18 +24,24 @@ import models.layers as nl import models -from UTILS.manager import Manager -import UTILS.dataset as dataset -import logging +from FACE_UTILS.manager import Manager +import FACE_UTILS.dataset as dataset +from FACE_UTILS.LFWDataset import LFWDataset + + +#{{{ Arguments +INIT_WEIGHT_PATH = 'common_data/face_weight.pth' +LFW_PAIRS_PATH = 'common_data/lfw_pairs.txt' # To prevent PIL warnings. 
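The `sys.exit` codes scattered through the two deleted drivers above form a small contract with the outer training scripts (not part of this diff): 2 means "validation accuracy fell below baseline, expand the network and rerun", 5 means "no free weights remain for this task, skip pruning", and 6 means "training accuracy did not survive this sparsity, stop pruning". A hypothetical driver loop honoring those codes:

```python
import subprocess
import sys

EXPAND_NETWORK = 2   # below-baseline accuracy: widen the network and retry
NO_SPACE_LEFT = 5    # current task owns no free weights: skip the prune stage
PRUNE_FAILED = 6     # accuracy lost at this sparsity: keep the previous ratio

def run_finetune(width, task):
    # Illustrative invocation; the real scripts pass many more flags.
    return subprocess.call([
        sys.executable, 'CPG_cifar100_with_one_mask.py',
        '--mode', 'finetune', '--dataset', task,
        '--network_width_multiplier', str(width),
    ])

width = 1.0
while run_finetune(width, 'aquatic_mammals') == EXPAND_NETWORK:
    width += 0.5  # grow the network, then try finetuning again
```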
warnings.filterwarnings("ignore") parser = argparse.ArgumentParser() -parser.add_argument('--arch', type=str, default='resnet50', +parser.add_argument('--arch', type=str, default='spherenet20', help='Architectures') parser.add_argument('--num_classes', type=int, default=-1, help='Num outputs for dataset') + # Optimization options. parser.add_argument('--lr', type=float, default=0.1, help='Learning rate for parameters, used for baselines') @@ -43,19 +49,14 @@ help='Learning rate for mask') parser.add_argument('--lr_mask_decay_every', type=int, help='Step decay every this many epochs') - -# parser.add_argument('--lr_classifier', type=float, -# help='Learning rate for classifier') -# parser.add_argument('--lr_classifier_decay_every', type=int, -# help='Step decay every this many epochs') - parser.add_argument('--batch_size', type=int, default=32, help='input batch size for training') -parser.add_argument('--val_batch_size', type=int, default=100, +parser.add_argument('--val_batch_size', type=int, default=1, help='input batch size for validation') parser.add_argument('--workers', type=int, default=24, help='') parser.add_argument('--weight_decay', type=float, default=0.0, help='Weight decay') + # Masking options. parser.add_argument('--mask_init', default='1s', choices=['1s', 'uniform', 'weight_based_1s'], @@ -69,62 +70,59 @@ choices=['binarizer', 'ternarizer'], help='Type of thresholding function') parser.add_argument('--threshold', type=float, default=2e-3, help='') + + # Paths. parser.add_argument('--dataset', type=str, default='', - help='Name of dataset') + help='Name of dataset') parser.add_argument('--train_path', type=str, default='', - help='Location of train data') + help='Location of train data') parser.add_argument('--val_path', type=str, default='', - help='Location of test data') + help='Location of test data') parser.add_argument('--save_prefix', type=str, default='checkpoints/', - help='Location to save model') + help='Location to save model') +parser.add_argument('--log_path', type=str, default='run.log', + help='') + + # Other. 
parser.add_argument('--cuda', action='store_true', default=True, help='use CUDA') -# parser.add_argument('--no_mask', action='store_true', default=False, -# help='Used for running baselines, does not use any masking') - -parser.add_argument('--seed', type=int, default=1, help='random seed') - -parser.add_argument('--checkpoint_format', type=str, - default='./{save_folder}/checkpoint-{epoch}.pth.tar', - help='checkpoint file format') - +parser.add_argument('--seed', type=int, default=1, + help='random seed') +parser.add_argument('--checkpoint_format', type=str, + default='./{save_folder}/checkpoint-{epoch}.pth.tar', + help='checkpoint file format') parser.add_argument('--epochs', type=int, default=160, help='number of epochs to train') -parser.add_argument('--restore_epoch', type=int, default=0, help='') -parser.add_argument('--image_size', type=int, default=32, help='') +parser.add_argument('--restore_epoch', type=int, default=0, + help='') parser.add_argument('--save_folder', type=str, help='folder name inside one_check folder') -parser.add_argument('--load_folder', default='', help='') - -# parser.add_argument('--datadir', default='/home/ivclab/decathlon-1.0/', -# help='folder containing data folder') -# parser.add_argument('--imdbdir', default='/home/ivclab/decathlon-1.0/annotations', -# help='annotation folder') - -# parser.add_argument('--train_weight', action='store_true', default=False, help='') -# parser.add_argument('--train_mask', action='store_true', default=False, help='') -# parser.add_argument('--train_classifier', action='store_true', default=False, help='') - -parser.add_argument('--pruning_interval', type=int, default=100, help='') -parser.add_argument('--pruning_frequency', type=int, default=10, help='') -parser.add_argument('--initial_sparsity', type=float, default=0.0, help='') -parser.add_argument('--target_sparsity', type=float, default=0.1, help='') - -parser.add_argument('--mode', - choices=['finetune', 'prune', 'inference'], +parser.add_argument('--load_folder', default='', + help='') +parser.add_argument('--pruning_interval', type=int, default=100, help='') +parser.add_argument('--pruning_frequency', type=int, default=10, help='') +parser.add_argument('--initial_sparsity', type=float, default=0.0, help='') +parser.add_argument('--target_sparsity', type=float, default=0.1, help='') +parser.add_argument('--mode', choices=['finetune', 'prune', 'inference'], help='Run mode') - -parser.add_argument('--jsonfile', type=str, help='file to restore baseline validation accuracy') -parser.add_argument('--network_width_multiplier', type=float, default=1.0, help='the multiplier to scale up the channel width') -# parser.add_argument('--tmp_benchmark_file', type=str, default='tmp_benchmark_file.txt', help='') -parser.add_argument('--test_piggymask', action='store_true', default=False, help='') +parser.add_argument('--jsonfile', type=str, + help='file to restore baseline validation accuracy') +parser.add_argument('--network_width_multiplier', type=float, default=1.0, + help='the multiplier to scale up the channel width') +parser.add_argument('--use_vgg_pretrained', action='store_true', default=False, + help='') +parser.add_argument('--acc_margin', type=float, default=0.01, + help='') +#}}} + + +#{{{ Multiple optimizers class Optimizers(object): def __init__(self): self.optimizers = [] self.lrs = [] - # self.args = args def add(self, optimizer, lr): self.optimizers.append(optimizer) @@ -132,8 +130,6 @@ def add(self, optimizer, lr): def step(self): for optimizer in self.optimizers: 
- # if isinstance(optimizer, torch.optim.Adam): - # pdb.set_trace() optimizer.step() def zero_grad(self): @@ -145,13 +141,32 @@ def __getitem__(self, index): def __setitem__(self, index, value): self.optimizers[index] = value +#}}} + + +def set_logger(filepath): + global logger + logger = logging.getLogger('') + logger.setLevel(logging.INFO) + fh = logging.FileHandler(filepath) + fh.setLevel(logging.INFO) + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + + _format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + fh.setFormatter(_format) + ch.setFormatter(_format) + + logger.addHandler(fh) + logger.addHandler(ch) + return + def main(): """Do stuff.""" + #{{{ Setting arguments, resume epochs and datasets args = parser.parse_args() - # don't use this, neither set learning rate as a linear function - # of the count of gpus, it will make accuracy lower - # args.batch_size = args.batch_size * torch.cuda.device_count() + set_logger(args.log_path) args.network_width_multiplier = math.sqrt(args.network_width_multiplier) if args.save_folder and not os.path.isdir(args.save_folder): @@ -160,7 +175,7 @@ def main(): if not torch.cuda.is_available(): logging.info('no gpu device available') args.cuda = False - + torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) @@ -181,11 +196,11 @@ def main(): # Set default train and test path if not provided as input. utils.set_dataset_paths(args) - + if resume_from_epoch: filepath = args.checkpoint_format.format(save_folder=resume_folder, epoch=resume_from_epoch) checkpoint = torch.load(filepath) - checkpoint_keys = checkpoint.keys() + checkpoint_keys = checkpoint.keys() dataset_history = checkpoint['dataset_history'] dataset2num_classes = checkpoint['dataset2num_classes'] masks = checkpoint['masks'] @@ -201,30 +216,44 @@ def main(): masks = {} shared_layer_info = {} - if args.arch == 'resnet50': - num_for_construct = [64, 64, 64*4, 128, 128*4, 256, 256*4, 512, 512*4] - model = models.__dict__[args.arch](pretrained=True, num_for_construct=num_for_construct, threshold=args.threshold) - elif 'vgg' in args.arch: - custom_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] - model = models.__dict__[args.arch](custom_cfg, dataset_history=dataset_history, dataset2num_classes=dataset2num_classes, + if args.arch == 'spherenet20': + model = models.__dict__[args.arch](dataset_history=dataset_history, dataset2num_classes=dataset2num_classes, network_width_multiplier=args.network_width_multiplier, shared_layer_info=shared_layer_info) else: print('Error!') sys.exit(1) - # Add and set the model dataset. + # Add and set the model dataset model.add_dataset(args.dataset, args.num_classes) model.set_dataset(args.dataset) - model = nn.DataParallel(model) - model = model.cuda() + if args.cuda: + # Move model to GPU + model = nn.DataParallel(model) + model = model.cuda() + #}}} + + if args.use_vgg_pretrained and model.module.datasets.index(args.dataset) == 0: + logging.info('Initialize vgg face') + curr_model_state_dict = model.state_dict() + state_dict = torch.load(INIT_WEIGHT_PATH) + if args.arch == 'spherenet20': + for name, param in state_dict.items(): + if 'fc' not in name: + curr_model_state_dict['module.' 
+ name].copy_(param) + curr_model_state_dict['module.classifiers.0.0.weight'].copy_(state_dict['fc5.weight']) + curr_model_state_dict['module.classifiers.0.0.bias'].copy_(state_dict['fc5.bias']) + curr_model_state_dict['module.classifiers.0.1.weight'].copy_(state_dict['fc6.weight']) + else: + logging.info("Currently, we didn't define the mapping of {} between vgg pretrained weight and our model".format(args.arch)) + sys.exit(5) + #{{{ Initializing mask if not masks: for name, module in model.named_modules(): if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() + mask = mask.cuda() masks[name] = mask else: # when we expand network, we need to allocate new masks @@ -237,58 +266,55 @@ def main(): elif masks[name].size(1) > module.weight.data.size(1): assert args.mode == 'inference' NEED_ADJUST_MASK = True - + if NEED_ADJUST_MASK: if args.mode == 'finetune': for name, module in model.named_modules(): if isinstance(module, nl.SharableConv2d): mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() + mask = mask.cuda() mask[:masks[name].size(0), :masks[name].size(1), :, :].copy_(masks[name]) masks[name] = mask elif isinstance(module, nl.SharableLinear): - mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() + mask = torch.ByteTensor(module.weight.data.size()).fill_(0) + mask = mask.cuda() mask[:masks[name].size(0), :masks[name].size(1)].copy_(masks[name]) masks[name] = mask - elif args.mode == 'inference': + elif args.mode == 'inference': for name, module in model.named_modules(): if isinstance(module, nl.SharableConv2d): mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() + mask = mask.cuda() mask[:, :, :, :].copy_(masks[name][:mask.size(0), :mask.size(1), :, :]) masks[name] = mask elif isinstance(module, nl.SharableLinear): mask = torch.ByteTensor(module.weight.data.size()).fill_(0) - if 'cuda' in module.weight.data.type(): - mask = mask.cuda() + mask = mask.cuda() mask[:, :].copy_(masks[name][:mask.size(0), :mask.size(1)]) masks[name] = mask + #}}} + #{{{ Setting shared layer info and piggymask if args.dataset not in shared_layer_info: - shared_layer_info[args.dataset] = { 'bias': {}, 'bn_layer_running_mean': {}, 'bn_layer_running_var': {}, 'bn_layer_weight': {}, 'bn_layer_bias': {}, + 'prelu_layer_weight': {}, 'piggymask': {} } - piggymasks = {} task_id = model.module.datasets.index(args.dataset) + 1 if task_id > 1: - for name, module in model.module.named_modules(): + for name, module in model.named_modules(): if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): - piggymasks[name] = torch.zeros_like(masks['module.' 
+ name], dtype=torch.float32) + piggymasks[name] = torch.zeros_like(masks[name], dtype=torch.float32) piggymasks[name].fill_(0.01) piggymasks[name] = Parameter(piggymasks[name]) - module.piggymask = piggymasks[name] + module.piggymask = piggymasks[name] else: piggymasks = shared_layer_info[args.dataset]['piggymask'] task_id = model.module.datasets.index(args.dataset) + 1 @@ -298,17 +324,24 @@ def main(): module.piggymask = piggymasks[name] shared_layer_info[args.dataset]['network_width_multiplier'] = args.network_width_multiplier - - if args.num_classes == 2: - train_loader = dataset.cifar100_train_loader_two_class(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader_two_class(args.dataset, args.val_batch_size) - elif args.num_classes == 5: - train_loader = dataset.cifar100_train_loader(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader(args.dataset, args.val_batch_size) + #}}} + + #{{{ Data loader + train_loader = dataset.train_loader(args.train_path, args.batch_size, num_workers=args.workers) + if args.dataset == 'face_verification': + kwargs = {'num_workers': 2, 'pin_memory': True} if torch.cuda.is_available() else {} + val_loader = torch.utils.data.DataLoader( + LFWDataset(dir=args.val_path, pairs_path=LFW_PAIRS_PATH, + transform=transforms.Compose([ + transforms.Resize(112), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], + std= [0.5, 0.5, 0.5])])), + batch_size=args.val_batch_size, shuffle=False, **kwargs) else: - print("num_classes should be either 2 or 5") - sys.exit(1) - + val_loader = dataset.val_loader(args.val_path, args.val_batch_size) + #}}} + # if we are going to save checkpoint in other folder, then we recalculate the starting epoch if args.save_folder != args.load_folder: start_epoch = 0 @@ -316,14 +349,18 @@ def main(): start_epoch = resume_from_epoch curr_prune_step = begin_prune_step = start_epoch * len(train_loader) - end_prune_step = curr_prune_step + args.pruning_interval * len(train_loader) + end_prune_step = curr_prune_step + args.pruning_interval * len(train_loader) manager = Manager(args, model, shared_layer_info, masks, train_loader, val_loader, begin_prune_step, end_prune_step) if args.mode == 'inference': manager.load_checkpoint_only_for_evaluate(resume_from_epoch, resume_folder) - manager.validate(resume_from_epoch-1) + if args.dataset == 'face_verification': + manager.evalLFW(resume_from_epoch-1) + else: + manager.validate(resume_from_epoch-1) return + #{{{ Setting optimizers lr = args.lr lr_mask = args.lr_mask # update all layers @@ -337,7 +374,7 @@ def main(): if 'classifiers' in name: if '.{}.'.format(model.module.datasets.index(args.dataset)) in name: params_to_optimize_via_SGD.append(param) - named_of_params_to_optimize_via_SGD.append(name) + named_of_params_to_optimize_via_SGD.append(name) continue elif 'piggymask' in name: masks_to_optimize_via_Adam.append(param) @@ -354,30 +391,10 @@ def main(): if masks_to_optimize_via_Adam: optimizer_mask = optim.Adam(masks_to_optimize_via_Adam, lr=lr_mask) optimizers.add(optimizer_mask, lr_mask) + #}}} manager.load_checkpoint(optimizers, resume_from_epoch, resume_folder) - # total_elements = 0 - # total_zeros_elements = 0 - # for name, module in model.named_modules(): - # if isinstance(module, nl.SharableConv2d): - # zero_channels = module.piggymask.le(args.threshold).sum() - # zero_elements = module.weight.data.numel()/module.piggymask.size(0)*zero_channels - # total_zeros_elements += zero_elements - # total_elements += module.weight.data.numel() - 
# print('{}: channel level: num_zeros {}, total {}; ' - # 'element level: num_zeros {}, total {}'.format( - # name, zero_channels, module.piggymask.size(0), - # zero_elements, module.weight.data.numel())) - - # # zero_elements = module.piggymask.le(args.threshold).sum() - # # total_zeros_elements += zero_elements - # # total_elements += module.weight.data.numel() - # # print('{}: element level: num_zeros {}, total {}'.format( - # # name, zero_elements, module.piggymask.numel())) - # print('pruning ratio: {}'.format(float(total_zeros_elements)/total_elements)) - # pdb.set_trace() - """Performs training.""" curr_lrs = [] for optimizer in optimizers: @@ -386,46 +403,68 @@ def main(): break if start_epoch != 0: - curr_best_accuracy = manager.validate(start_epoch-1) + if args.dataset == 'face_verification': + curr_best_accuracy = manager.evalLFW(start_epoch-1) + else: + curr_best_accuracy = manager.validate(start_epoch-1) else: curr_best_accuracy = 0.0 if args.jsonfile is None or not os.path.isfile(args.jsonfile): - sys.exit(3) - with open(args.jsonfile, 'r') as jsonfile: + sys.exit(3) ## NO baseline_face_acc.txt founded + + with open(args.jsonfile, 'r') as jsonfile: json_data = json.load(jsonfile) baseline_acc = float(json_data[args.dataset]) - + if args.mode == 'prune': - # with open(os.path.join(os.getcwd(), args.tmp_benchmark_file), 'r') as jsonfile: - # json_data = json.load(jsonfile) - # acc_before_prune = float(json_data['acc_before_prune']) - - # if acc_before_prune - baseline_acc > 0.01: - # history_best_avg_val_acc_when_prune = acc_before_prune - 0.015 - # else: - # history_best_avg_val_acc_when_prune = acc_before_prune - 0.01 - history_best_avg_val_acc_when_prune = baseline_acc - 0.01 + if args.dataset != 'face_verification': + history_best_avg_val_acc_when_prune = baseline_acc - args.acc_margin + else: + if 'spherenet20' in args.arch: + baseline_acc = 0.9942 + history_best_avg_val_acc_when_prune = baseline_acc - args.acc_margin + else: + logging.info('Something is wrong') + exit(1) stop_prune = True if 'gradual_prune' in args.load_folder and args.save_folder == args.load_folder: - args.epochs = 20 + resume_from_epoch - print() - print('Before pruning: ') - print('Sparsity range: {} -> {}'.format(args.initial_sparsity, args.target_sparsity)) - curr_best_accuracy = manager.validate(start_epoch-1) - print() + if args.dataset == 'face_verification': + args.epochs = 10 + resume_from_epoch + else: + args.epochs = 20 + resume_from_epoch + logging.info('\n') + logging.info('Before pruning: ') + logging.info('Sparsity range: {} -> {}'.format(args.initial_sparsity, args.target_sparsity)) + if args.dataset == 'face_verification': + curr_best_accuracy = manager.evalLFW(start_epoch-1) + else: + curr_best_accuracy = manager.validate(start_epoch-1) + logging.info('\n') elif args.mode == 'finetune': manager.pruner.make_finetuning_mask() + if args.dataset == 'face_verification': + manager.evalLFW(0) + manager.save_checkpoint(optimizers, 0, args.save_folder) + return + history_best_avg_val_acc = 0.0 + num_epochs_that_criterion_does_not_get_better = 0 + times_of_decaying_learning_rate = 0 + #{{{ Training Loop for epoch_idx in range(start_epoch, args.epochs): avg_train_acc, curr_prune_step = manager.train(optimizers, epoch_idx, curr_lrs, curr_prune_step) - avg_val_acc = manager.validate(epoch_idx) + if args.dataset == 'face_verification': + avg_val_acc = manager.evalLFW(epoch_idx) + else: + avg_val_acc = manager.validate(epoch_idx) + #{{{ Train for pruning if args.mode == 'prune' and (epoch_idx+1) >= 
(args.pruning_interval + start_epoch) and (
+                avg_val_acc > history_best_avg_val_acc_when_prune):
+            stop_prune = False
@@ -437,51 +476,68 @@ def main():
                         os.remove(os.path.join(args.save_folder, checkpoint_file))
                 else:
                     print('Something is wrong! Block the program with pdb')
-                    pdb.set_trace()
+                    pdb.set_trace()
             manager.save_checkpoint(optimizers, epoch_idx, args.save_folder)
+            #}}}
-        if args.mode == 'finetune':
-            if epoch_idx + 1 == 50 or epoch_idx + 1 == 80:
-                for param_group in optimizers[0].param_groups:
-                    param_group['lr'] *= 0.1
-                curr_lrs[0] = param_group['lr']
-            if len(optimizers.lrs) == 2 and epoch_idx + 1 == 50:
-                for param_group in optimizers[1].param_groups:
-                    param_group['lr'] *= 0.2
-                curr_lrs[1] = param_group['lr']
-
+        #{{{ Train for finetuning
+        if args.mode == 'finetune':
             if avg_val_acc > history_best_avg_val_acc:
                 if args.save_folder is not None:
+                    num_epochs_that_criterion_does_not_get_better = 0
                     paths = os.listdir(args.save_folder)
                     if paths and '.pth.tar' in paths[0]:
                         for checkpoint_file in paths:
                             os.remove(os.path.join(args.save_folder, checkpoint_file))
                 else:
                     print('Something is wrong! Block the program with pdb')
-                    pdb.set_trace()
+                    pdb.set_trace()
                 history_best_avg_val_acc = avg_val_acc
                 manager.save_checkpoint(optimizers, epoch_idx, args.save_folder)
+            else:
+                num_epochs_that_criterion_does_not_get_better += 1
+
+            if times_of_decaying_learning_rate >= 3:
+                logging.info('\n')
+                logging.info("times_of_decaying_learning_rate reached {}, stop training".format(
+                              times_of_decaying_learning_rate))
+                break
-        print('-' * 16)
+            if num_epochs_that_criterion_does_not_get_better >= 10:
+                times_of_decaying_learning_rate += 1
+                logging.info('\n')
+                logging.info("no higher accuracy for {} consecutive epochs, "
+                             "decay learning rate by multiplying 0.1".format(
+                              num_epochs_that_criterion_does_not_get_better))
+                num_epochs_that_criterion_does_not_get_better = 0
+                for param_group in optimizers[0].param_groups:
+                    param_group['lr'] *= 0.1
+                curr_lrs[0] = param_group['lr']
+
+                if times_of_decaying_learning_rate == 1 and len(optimizers.lrs) == 2:
+                    for param_group in optimizers[1].param_groups:
+                        param_group['lr'] *= 0.2
+                    curr_lrs[1] = param_group['lr']
+        #}}}
-        if args.mode == 'finetune' and not args.test_piggymask:
-            if avg_train_acc > 0.95 and (history_best_avg_val_acc - baseline_acc) > -0.01:
-                # json_data = {}
-                # json_data['acc_before_prune'] = '{:.4f}'.format(history_best_avg_val_acc)
-                # with open(args.tmp_benchmark_file, 'w') as jsonfile:
-                #     json.dump(json_data, jsonfile)
+
+        logging.info('-' * 16)
+
+    if args.mode == 'finetune':
+        if history_best_avg_val_acc - baseline_acc > -args.acc_margin:
             pass
         else:
-            print("It's time to expand the Network")
-            print('Auto expand network')
+            logging.info("It's time to expand the Network")
+            logging.info('Auto expand network')
             sys.exit(2)
     elif args.mode == 'prune' and stop_prune:
-        print('Acc too low, stop pruning.')
+        logging.info('Acc too low, stop pruning.')
         sys.exit(4)
+    #}}}
-if __name__ == '__main__':
-    main()
+if __name__ == '__main__':
+    main()
diff --git a/FACE_UTILS/LFWDataset.py b/FACE_UTILS/LFWDataset.py
new file mode 100755
index 0000000..4564a86
--- /dev/null
+++ b/FACE_UTILS/LFWDataset.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+import torchvision.datasets as datasets
+import os
+import numpy as np
+
+class LFWDataset(datasets.ImageFolder):
+    '''LFW face-verification pairs: yields (img1, img2, issame) triplets.
+    '''
+    def __init__(self, dir, pairs_path, file_ext='jpg', transform=None):
+
+        super(LFWDataset, self).__init__(dir,transform)
+        self.pairs_path = pairs_path
+        # LFW dir contains 2 folders:
faces and lists + self.validation_images = self.get_lfw_paths(dir,file_ext=file_ext) + + def read_lfw_pairs(self,pairs_filename): + pairs = [] + with open(pairs_filename, 'r') as f: + for line in f.readlines()[1:]: + pair = line.strip().split() + pairs.append(pair) + return np.array(pairs) + # !!!!!!!!!!!!!!!!!!!!!!!!!!!NOTICE YOUR FILE_EXTENSION!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + def get_lfw_paths(self,lfw_dir,file_ext="jpg"): + pairs = self.read_lfw_pairs(self.pairs_path) + nrof_skipped_pairs = 0 + path_list = [] + issame_list = [] + for i in range(len(pairs)): + pair = pairs[i] + if len(pair) == 3: + path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) + path1 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[2])+'.'+file_ext) + issame = True + elif len(pair) == 4: + path0 = os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])+'.'+file_ext) + path1 = os.path.join(lfw_dir, pair[2], pair[2] + '_' + '%04d' % int(pair[3])+'.'+file_ext) + issame = False + if os.path.exists(path0) and os.path.exists(path1): # Only add the pair if both paths exist + path_list.append((path0,path1,issame)) + issame_list.append(issame) + else: + nrof_skipped_pairs += 1 + if nrof_skipped_pairs>0: + print('Skipped %d image pairs' % nrof_skipped_pairs) + return path_list + + def __getitem__(self, index): + ''' + + Args: + index: Index of the triplet or the matches - not of a single image + + Returns: + + ''' + def transform(img_path): + """Convert image into numpy array and apply transformation + Doing this so that it is consistent with all other datasets + to return a PIL Image. + """ + img = self.loader(img_path) + return self.transform(img) + (path_1,path_2,issame) = self.validation_images[index] + img1, img2 = transform(path_1), transform(path_2) + return img1, img2, issame + + def __len__(self): + return len(self.validation_images) diff --git a/FACE_UTILS/__init__.py b/FACE_UTILS/__init__.py new file mode 100644 index 0000000..d145110 --- /dev/null +++ b/FACE_UTILS/__init__.py @@ -0,0 +1,9 @@ +"""Contains a bunch of utility functions.""" +import numpy as np +import pdb + + +def set_dataset_paths(args): + """Set default train and test path if not provided as input.""" + args.train_path = 'data/%s/train' % (args.dataset) + args.val_path = 'data/%s/val' % (args.dataset) diff --git a/FACE_UTILS/dataset.py b/FACE_UTILS/dataset.py new file mode 100644 index 0000000..e730626 --- /dev/null +++ b/FACE_UTILS/dataset.py @@ -0,0 +1,55 @@ +import collections +import glob +import os + +import numpy as np +from PIL import Image + +import torch +import torch.backends.cudnn as cudnn +import torch.nn as nn +import torch.optim as optim +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +import pdb + +VGGFACE_MEAN = [0.5, 0.5, 0.5] +VGGFACE_STD = [0.5, 0.5, 0.5] + + +def train_loader(path, train_batch_size, num_workers=4, pin_memory=False, normalize=None): + if normalize is None: + normalize = transforms.Normalize( + mean=VGGFACE_MEAN, std=VGGFACE_STD) + + train_transform = transforms.Compose([ + transforms.Resize(112), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ]) + + train_dataset = datasets.ImageFolder(path, train_transform) + + return torch.utils.data.DataLoader(train_dataset, + batch_size=train_batch_size, shuffle=True, sampler=None, + num_workers=num_workers, pin_memory=pin_memory) + + +def val_loader(path, val_batch_size, num_workers=4, 
pin_memory=False, normalize=None): + if normalize is None: + normalize = transforms.Normalize( + mean=VGGFACE_MEAN, std=VGGFACE_STD) + + val_transform = transforms.Compose([ + transforms.Resize(112), + transforms.ToTensor(), + normalize, + ]) + + val_dataset = datasets.ImageFolder(path, val_transform) + + return torch.utils.data.DataLoader(val_dataset, + batch_size=val_batch_size, shuffle=False, sampler=None, + num_workers=num_workers, pin_memory=pin_memory) diff --git a/FACE_UTILS/manager.py b/FACE_UTILS/manager.py new file mode 100644 index 0000000..4a57c74 --- /dev/null +++ b/FACE_UTILS/manager.py @@ -0,0 +1,312 @@ +import logging +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import numpy as np +from .prune import SparsePruner +from tqdm import tqdm +import pdb +from pprint import pprint +import os +import math +from datetime import datetime +import models.layers as nl +# import imdbfolder_coco as imdbfolder +import sys +from torch.autograd import Variable +from .metrics import fv_evaluate +from models import AngleLoss + +class Metric(object): + def __init__(self, name): + self.name = name + self.sum = torch.tensor(0.) + self.n = torch.tensor(0.) + + def update(self, val): + self.sum += val + self.n += 1 + + @property + def avg(self): + return self.sum / self.n + +def accuracy(output, target): + # get the index of the max log-probability + pred = output.max(1, keepdim=True)[1] + return pred.eq(target.view_as(pred)).cpu().float().mean() + + +class Manager(object): + """Handles training and pruning.""" + + def __init__(self, args, model, shared_layer_info, masks, train_loader, val_loader, begin_prune_step, end_prune_step): + self.args = args + self.model = model + self.shared_layer_info = shared_layer_info + self.inference_dataset_idx = self.model.module.datasets.index(args.dataset) + 1 + self.pruner = SparsePruner(self.model, masks, self.args, begin_prune_step, end_prune_step, self.inference_dataset_idx) + + self.train_loader = train_loader + self.val_loader = val_loader + + if args.dataset == 'face_verification': + self.criterion = AngleLoss() + elif args.dataset == 'emotion': + class_counts = torch.from_numpy(np.array([74874, 134415, 25459, 14090, 6378, 3803, 24882]).astype(np.float32)) + class_weights = (torch.sum(class_counts) - class_counts) / class_counts + self.criterion = nn.CrossEntropyLoss(weight=class_weights.cuda()) + else: + self.criterion = nn.CrossEntropyLoss() + + def train(self, optimizers, epoch_idx, curr_lrs, curr_prune_step): + # Set model to training mode + self.model.train() + + train_loss = Metric('train_loss') + train_accuracy = Metric('train_accuracy') + + with tqdm(total=len(self.train_loader), + desc='Train Ep. #{}: '.format(epoch_idx + 1), + disable=False, + ascii=True) as t: + for batch_idx, (data, target) in enumerate(self.train_loader): + if self.args.cuda: + data, target = data.cuda(), target.cuda() + + optimizers.zero_grad() + # Do forward-backward. + output = self.model(data) + + if self.args.dataset != 'face_verification': + train_accuracy.update(accuracy(output, target)) + + loss = self.criterion(output, target) + train_loss.update(loss) + loss.backward() + + # Set fixed param grads to 0. + self.pruner.do_weight_decay_and_make_grads_zero() + + # Gradient is applied across all ranks + optimizers.step() + + # Set pruned weights to 0. 
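+                # Pruning is driven by the global step, not the epoch. Rough
+                # sketch of the schedule below (assumed numbers:
+                # begin_prune_step=0, end_prune_step=1000,
+                # pruning_frequency=10): every 10th step re-prunes to the
+                # sparsity given by the pruner's schedule, which ramps from
+                # initial_sparsity at step 0 to target_sparsity at step 1000;
+                # in between, gradually_prune() is a no-op.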
+ if self.args.mode == 'prune': + self.pruner.gradually_prune(curr_prune_step) + curr_prune_step += 1 + + if self.inference_dataset_idx == 1: + t.set_postfix({'loss': train_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. * train_accuracy.avg.item()), + 'lr': curr_lrs[0], + 'sparsity': self.pruner.calculate_sparsity(), + 'network_width_mpl': self.args.network_width_multiplier}) + else: + t.set_postfix({'loss': train_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. * train_accuracy.avg.item()), + 'lr': curr_lrs[0], + 'sparsity': self.pruner.calculate_sparsity(), + 'network_width_mpl': self.args.network_width_multiplier}) + t.update(1) + + summary = {'loss': '{:.3f}'.format(train_loss.avg.item()), + 'accuracy': '{:.2f}'.format(100. * train_accuracy.avg.item()), + 'lr': curr_lrs[0], + 'sparsity': '{:.3f}'.format(self.pruner.calculate_sparsity()), + 'network_width_mpl': self.args.network_width_multiplier} + logging.info(('In train()-> Train Ep. #{} '.format(epoch_idx + 1) + + ', '.join(['{}: {}'.format(k, v) for k, v in summary.items()]))) + return train_accuracy.avg.item(), curr_prune_step + + #{{{ Evaluate classification + def validate(self, epoch_idx, biases=None): + """Performs evaluation.""" + self.pruner.apply_mask() + self.model.eval() + val_loss = Metric('val_loss') + val_accuracy = Metric('val_accuracy') + + with tqdm(total=len(self.val_loader), + desc='Val Ep. #{}: '.format(epoch_idx + 1), #, datetime.strftime(datetime.now(), '%Y/%m/%d-%H:%M:%S')) + ascii=True) as t: + with torch.no_grad(): + for data, target in self.val_loader: + if self.args.cuda: + data, target = data.cuda(), target.cuda() + + output = self.model(data) + + val_loss.update(self.criterion(output, target)) + val_accuracy.update(accuracy(output, target)) + + if self.inference_dataset_idx == 1: + t.set_postfix({'loss': val_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. * val_accuracy.avg.item()), + 'sparsity': self.pruner.calculate_sparsity(), + 'task{} ratio'.format(self.inference_dataset_idx): '{:.2f}'.format(self.pruner.calculate_curr_task_ratio()), + 'zero ratio': self.pruner.calculate_zero_ratio()}) + else: + t.set_postfix({'loss': val_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. * val_accuracy.avg.item()), + 'sparsity': self.pruner.calculate_sparsity(), + 'task{} ratio'.format(self.inference_dataset_idx): '{:.2f}'.format(self.pruner.calculate_curr_task_ratio()), + 'shared_ratio': self.pruner.calculate_shared_part_ratio(), + 'zero ratio': self.pruner.calculate_zero_ratio()}) + t.update(1) + summary = {'loss': '{:.3f}'.format(val_loss.avg.item()), + 'accuracy': '{:.2f}'.format(100. * val_accuracy.avg.item()), + 'sparsity': '{:.3f}'.format(self.pruner.calculate_sparsity()), + 'task{} ratio'.format(self.inference_dataset_idx): '{:.3f}'.format(self.pruner.calculate_curr_task_ratio()), + 'zero ratio': '{:.3f}'.format(self.pruner.calculate_zero_ratio())} + if self.inference_dataset_idx != 1: + summary['shared_ratio'] = '{:.3f}'.format(self.pruner.calculate_shared_part_ratio()) + logging.info(('In validate()-> Val Ep. 
#{} '.format(epoch_idx + 1) + + ', '.join(['{}: {}'.format(k, v) for k, v in summary.items()]))) + return val_accuracy.avg.item() + #}}} + + #{{{ Evaluate LFW + def evalLFW(self, epoch_idx): + distance_metric = True + subtract_mean = False + self.pruner.apply_mask() + self.model.eval() # switch to evaluate mode + labels, embedding_list_a, embedding_list_b = [], [], [] + with torch.no_grad(): + with tqdm(total=len(self.val_loader), + desc='Validate Epoch #{}: '.format(epoch_idx + 1), + ascii=True) as t: + for batch_idx, (data_a, data_p, label) in enumerate(self.val_loader): + data_a, data_p = data_a.cuda(), data_p.cuda() + data_a, data_p, label = Variable(data_a, volatile=True), \ + Variable(data_p, volatile=True), Variable(label) + # ==== compute output ==== + out_a = self.model.module.forward_to_embeddings(data_a) + out_p = self.model.module.forward_to_embeddings(data_p) + # do L2 normalization for features + if not distance_metric: + out_a = F.normalize(out_a, p=2, dim=1) + out_p = F.normalize(out_p, p=2, dim=1) + out_a = out_a.data.cpu().numpy() + out_p = out_p.data.cpu().numpy() + + embedding_list_a.append(out_a) + embedding_list_b.append(out_p) + # ======================== + labels.append(label.data.cpu().numpy()) + t.update(1) + + labels = np.array([sublabel for label in labels for sublabel in label]) + embedding_list_a = np.array([item for embedding in embedding_list_a for item in embedding]) + embedding_list_b = np.array([item for embedding in embedding_list_b for item in embedding]) + tpr, fpr, accuracy, val, val_std, far = fv_evaluate(embedding_list_a, embedding_list_b, labels, + distance_metric=distance_metric, subtract_mean=subtract_mean) + print('In evalLFW(): Test set: Accuracy: {:.5f}+-{:.5f}'.format(np.mean(accuracy),np.std(accuracy))) + logging.info(('In evalLFW()-> Validate Epoch #{} '.format(epoch_idx + 1) + + 'Test set: Accuracy: {:.5f}+-{:.5f}, '.format(np.mean(accuracy),np.std(accuracy)) + + 'task_ratio: {:.2f}'.format(self.pruner.calculate_curr_task_ratio()))) + return np.mean(accuracy) + #}}} + + def save_checkpoint(self, optimizers, epoch_idx, save_folder): + """Saves model to file.""" + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=epoch_idx + 1) + + for name, module in self.model.module.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + if module.bias is not None: + self.shared_layer_info[self.args.dataset][ + 'bias'][name] = module.bias + if module.piggymask is not None: + self.shared_layer_info[self.args.dataset][ + 'piggymask'][name] = module.piggymask + elif isinstance(module, nn.BatchNorm2d): + self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_mean'][name] = module.running_mean + self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_var'][name] = module.running_var + self.shared_layer_info[self.args.dataset][ + 'bn_layer_weight'][name] = module.weight + self.shared_layer_info[self.args.dataset][ + 'bn_layer_bias'][name] = module.bias + elif isinstance(module, nn.PReLU): + self.shared_layer_info[self.args.dataset][ + 'prelu_layer_weight'][name] = module.weight + + checkpoint = { + 'model_state_dict': self.model.module.state_dict(), + 'dataset_history': self.model.module.datasets, + 'dataset2num_classes': self.model.module.dataset2num_classes, + 'masks': self.pruner.masks, + 'shared_layer_info': self.shared_layer_info + } + torch.save(checkpoint, filepath) + + def load_checkpoint(self, optimizers, resume_from_epoch, save_folder, restore_optimizers=True): + 
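+        # When the network was widened, saved tensors can be smaller than the
+        # current ones, so each saved tensor is copied into the "top-left"
+        # slice of the current tensor. Sketch with assumed sizes: a saved conv
+        # weight of shape (64, 64, 3, 3) loaded into a current weight of shape
+        # (80, 80, 3, 3) fills curr[:64, :64, :, :] and leaves the expanded
+        # channels at their initialization.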
if resume_from_epoch > 0: + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=resume_from_epoch) + checkpoint = torch.load(filepath) + checkpoint_keys = checkpoint.keys() + state_dict = checkpoint['model_state_dict'] + curr_model_state_dict = self.model.module.state_dict() + for name, param in state_dict.items(): + if 'piggymask' in name or name == 'classifier.weight' or name == 'classifier.bias' or (name == 'classifier.0.weight' or name == 'classifier.0.bias' or name == 'classifier.1.weight'): + # I DONT WANT TO DO THIS! QQ That last 3 exprs are for anglelinear and embeddings + continue + elif len(curr_model_state_dict[name].size()) == 4: + # Conv layer + curr_model_state_dict[name][:param.size(0), :param.size(1), :, :].copy_(param) + elif len(curr_model_state_dict[name].size()) == 2 and 'features' in name: + # FC conv (feature layer) + curr_model_state_dict[name][:param.size(0), :param.size(1)].copy_(param) + elif len(curr_model_state_dict[name].size()) == 1: + # bn and prelu layer + curr_model_state_dict[name][:param.size(0)].copy_(param) + else: + curr_model_state_dict[name].copy_(param) + + def load_checkpoint_only_for_evaluate(self, resume_from_epoch, save_folder): + if resume_from_epoch > 0: + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=resume_from_epoch) + checkpoint = torch.load(filepath) + checkpoint_keys = checkpoint.keys() + state_dict = checkpoint['model_state_dict'] + + curr_model_state_dict = self.model.module.state_dict() + for name, param in state_dict.items(): + if 'piggymask' in name: # we load piggymask value in main.py + continue + if name == 'classifier.weight' or name == 'classifier.bias' or (name == 'classifier.0.weight' or name == 'classifier.0.bias' or name == 'classifier.1.weight'): + # I DONT WANT TO DO THIS! 
The last 3 names are for the AngleLinear and embedding layers
+                    continue
+                elif len(curr_model_state_dict[name].size()) == 4:
+                    # Conv layer
+                    curr_model_state_dict[name].copy_(param[:curr_model_state_dict[name].size(0), :curr_model_state_dict[name].size(1), :, :])
+                elif len(curr_model_state_dict[name].size()) == 2 and 'features' in name:
+                    # FC conv (feature layer)
+                    curr_model_state_dict[name].copy_(param[:curr_model_state_dict[name].size(0), :curr_model_state_dict[name].size(1)])
+                elif len(curr_model_state_dict[name].size()) == 1:
+                    # bn and prelu layer
+                    curr_model_state_dict[name].copy_(param[:curr_model_state_dict[name].size(0)])
+                else:
+                    curr_model_state_dict[name].copy_(param)
+
+        # Load the batch norm params and convolution biases corresponding to the current dataset
+        for name, module in self.model.module.named_modules():
+            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
+                if module.bias is not None:
+                    module.bias = self.shared_layer_info[self.args.dataset]['bias'][name]
+            elif isinstance(module, nn.BatchNorm2d):
+                module.running_mean = self.shared_layer_info[self.args.dataset][
+                    'bn_layer_running_mean'][name]
+                module.running_var = self.shared_layer_info[self.args.dataset][
+                    'bn_layer_running_var'][name]
+                module.weight = self.shared_layer_info[self.args.dataset][
+                    'bn_layer_weight'][name]
+                module.bias = self.shared_layer_info[self.args.dataset][
+                    'bn_layer_bias'][name]
+            elif isinstance(module, nn.PReLU):
+                module.weight = self.shared_layer_info[self.args.dataset][
+                    'prelu_layer_weight'][name]
diff --git a/FACE_UTILS/metrics.py b/FACE_UTILS/metrics.py
new file mode 100755
index 0000000..3185135
--- /dev/null
+++ b/FACE_UTILS/metrics.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+# The source code is modified from the official facenet GitHub repository.
+import os
+import operator
+import numpy as np
+from sklearn.model_selection import KFold
+from scipy import interpolate
+import math
+
+
+def distance(embeddings1, embeddings2, distance_metric=0):
+    if distance_metric==0:
+        # Euclidean distance
+        diff = np.subtract(embeddings1, embeddings2)
+        dist = np.sum(np.square(diff),1)
+    elif distance_metric==1:
+        # Distance based on cosine similarity
+        dot = np.sum(np.multiply(embeddings1, embeddings2), axis=1)
+        norm = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
+        similarity = dot / norm
+        dist = np.arccos(np.clip(similarity,0,1))*4 / math.pi  # NOTE:
+        # np.arccos(similarity) returns values in the range [0, pi]
+        # similarity must be between [-1.0, 1.0]
+        # https://github.com/davidsandberg/facenet/issues/692
+    else:
+        raise ValueError('Undefined distance metric %d' % distance_metric)
+    return dist
+
+def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10,
+                  distance_metric=0, subtract_mean=False):
+    assert(embeddings1.shape[0] == embeddings2.shape[0])
+    assert(embeddings1.shape[1] == embeddings2.shape[1])
+    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+    nrof_thresholds = len(thresholds)
+    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+    tprs = np.zeros((nrof_folds,nrof_thresholds))
+    fprs = np.zeros((nrof_folds,nrof_thresholds))
+    accuracy = np.zeros((nrof_folds))
+
+    indices = np.arange(nrof_pairs)
+
+    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+
+        if subtract_mean:
+            mean = np.mean(np.concatenate([embeddings1[train_set],embeddings2[train_set]]), axis=0)
+        else:
+            mean = 0.0
+        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+        # Find the best threshold for the fold
+        acc_train = np.zeros((nrof_thresholds))
+        for threshold_idx, threshold in enumerate(thresholds):
+            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
+        best_threshold_index = np.argmax(acc_train)
+        for threshold_idx, threshold in enumerate(thresholds):
+            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
+        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
+
+    tpr = np.mean(tprs,0)
+    fpr = np.mean(fprs,0)
+    return tpr, fpr, accuracy
+
+def calculate_accuracy(threshold, dist, actual_issame):
+    predict_issame = np.less(dist, threshold)
+    tp = np.sum(np.logical_and(predict_issame, actual_issame))
+    fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+    tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
+    fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
+
+    tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn)
+    fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn)
+    acc = float(tp+tn)/dist.size
+    return tpr, fpr, acc
+
+
+
+def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
+    assert(embeddings1.shape[0] == embeddings2.shape[0])
+    assert(embeddings1.shape[1] == embeddings2.shape[1])
+    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+    nrof_thresholds = len(thresholds)
+    k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+    val = np.zeros(nrof_folds)
+    far = np.zeros(nrof_folds)
+
+    indices = np.arange(nrof_pairs)
+
+    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+        if subtract_mean:
+            mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
+        else:
+            mean = 0.0
+        dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+
+        # Find the threshold that gives FAR = far_target
+        far_train = np.zeros(nrof_thresholds)
+        for threshold_idx, threshold in enumerate(thresholds):
+            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
+        if np.max(far_train)>=far_target:
+            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
+            threshold = f(far_target)
+        else:
+            threshold = 0.0
+
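+        # Sketch of the interpolation above, with assumed numbers: if
+        # far_train = [0.0, 0.0005, 0.002] at thresholds [0.5, 1.0, 1.5],
+        # then f(1e-3) returns the threshold (~1.25) whose training-set FAR
+        # matches far_target, and that threshold is scored on the test fold.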
+        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
+
+    val_mean = np.mean(val)
+    far_mean = np.mean(far)
+    val_std = np.std(val)
+    return val_mean, val_std, far_mean
+
+
+def calculate_val_far(threshold, dist, actual_issame):
+    predict_issame = np.less(dist, threshold)
+    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
+    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+    n_same = np.sum(actual_issame)
+    n_diff = np.sum(np.logical_not(actual_issame))
+    val = float(true_accept) / float(n_same)
+    far = float(false_accept) / float(n_diff)
+    return val, far
+
+
+def fv_evaluate(embeddings1, embeddings2, labels,
+                nrof_folds=10, distance_metric=0, subtract_mean=False):
+    """
+    Notice
+    1. At the evaluation stage we want all features to be normalized because we calculate the distance between them.
+       If we normalize them using the L2 norm, the maximum distance is 4.
+       When you remove the normalization, the maximum distance between features is unknown (it could be 100k),
+       so the L2 norm during evaluation is useful because we can test a range of thresholds between 0 and 4.
+       E.g.
+       embeddings1 = np.asarray([[1,0],[0,1]])
+       embeddings2 = np.asarray([[-1,0],[0,-1]])
+       diff = np.subtract(embeddings1, embeddings2)
+       dist = np.sum(np.square(diff),1)
+       print(dist) # [4,4]
+
+    2. When you apply the normalization during training, the input features to the 'Classifier' are normalized.
+       I think that this should not hurt the performance a lot.
+       (An L2-normalization op in the training phase leaves the performance slightly degraded!)
+    """
+    # Calculate evaluation metrics
+    thresholds = np.arange(0, 4, 0.01)
+    tpr, fpr, accuracy = calculate_roc(thresholds, embeddings1, embeddings2, labels,
+                                       nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+    thresholds = np.arange(0, 4, 0.001)
+    val, val_std, far = calculate_val(thresholds, embeddings1, embeddings2, labels, 1e-3,
+                                      nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+    return tpr, fpr, accuracy, val, val_std, far
diff --git a/FACE_UTILS/packnet_manager.py b/FACE_UTILS/packnet_manager.py
new file mode 100644
index 0000000..0fc4bb3
--- /dev/null
+++ b/FACE_UTILS/packnet_manager.py
@@ -0,0 +1,267 @@
+from . import dataset
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import numpy as np
+from .packnet_prune import SparsePruner
+from tqdm import tqdm
+import pdb
+from pprint import pprint
+import os
+import math
+from datetime import datetime
+from torch.autograd import Variable
+from .metrics import fv_evaluate
+from packnet_models import AngleLoss
+
+
+class Metric(object):
+    def __init__(self, name):
+        self.name = name
+        self.sum = torch.tensor(0.)
+        self.n = torch.tensor(0.)
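+    # Minimal usage sketch: Metric keeps a running average of everything
+    # passed to update(), e.g.
+    #   m = Metric('train_loss')
+    #   m.update(torch.tensor(2.)); m.update(torch.tensor(4.))
+    #   m.avg  # -> tensor(3.)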
+ + def update(self, val): + self.sum += val + self.n += 1 + + @property + def avg(self): + return self.sum / self.n + +def accuracy(output, target): + # get the index of the max log-probability + pred = output.max(1, keepdim=True)[1] + return pred.eq(target.view_as(pred)).cpu().float().mean() + + +class Manager(object): + """Handles training and pruning.""" + + def __init__(self, args, model, shared_layer_info, masks, train_loader, val_loader): + self.args = args + self.model = model + self.shared_layer_info = shared_layer_info + self.inference_dataset_idx = self.model.module.datasets.index(args.dataset) + 1 + self.pruner = SparsePruner(self.model, masks, self.args, None, None, self.inference_dataset_idx) + + self.train_loader = train_loader + self.val_loader = val_loader + + if args.dataset == 'face_verification': + self.criterion = AngleLoss() + elif args.dataset == 'emotion': + class_counts = torch.from_numpy(np.array([74874, 134415, 25459, 14090, 6378, 3803, 24882]).astype(np.float32)) + class_weights = (torch.sum(class_counts) - class_counts) / class_counts + self.criterion = nn.CrossEntropyLoss(weight=class_weights.cuda()) + else: + self.criterion = nn.CrossEntropyLoss() + + def train(self, optimizers, epoch_idx, curr_lrs): + # Set model to training mode + self.model.train() + + train_loss = Metric('train_loss') + train_accuracy = Metric('train_accuracy') + + with tqdm(total=len(self.train_loader), + desc='Train Epoch #{}: '.format(epoch_idx + 1), + disable=False, + ascii=True) as t: + for batch_idx, (data, target) in enumerate(self.train_loader): + if self.args.cuda: + data, target = data.cuda(), target.cuda() + + optimizers.zero_grad() + # Do forward-backward. + output = self.model(data) + + if self.args.dataset != 'face_verification': + train_accuracy.update(accuracy(output, target)) + + loss = self.criterion(output, target) + train_loss.update(loss) + loss.backward() + + # Set fixed param grads to 0. + self.pruner.do_weight_decay_and_make_grads_zero() + + # Gradient is applied across all ranks + optimizers.step() + + # Set pruned weights to 0. + self.pruner.make_pruned_zero() + + t.set_postfix({'loss': train_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. * train_accuracy.avg.item()), + 'lr': curr_lrs[0], + 'sparsity': self.pruner.calculate_sparsity()}) + + t.update(1) + + return train_accuracy.avg.item() + + #{{{ Evaluate classification + def validate(self, epoch_idx, biases=None): + """Performs evaluation.""" + self.pruner.apply_mask() + self.model.eval() + val_loss = Metric('val_loss') + val_accuracy = Metric('val_accuracy') + + with tqdm(total=len(self.val_loader), + desc='Validate Epoch #{}: '.format(epoch_idx + 1), + ascii=True) as t: + with torch.no_grad(): + for data, target in self.val_loader: + if self.args.cuda: + data, target = data.cuda(), target.cuda() + + output = self.model(data) + + val_loss.update(self.criterion(output, target)) + val_accuracy.update(accuracy(output, target)) + + t.set_postfix({'loss': val_loss.avg.item(), + 'accuracy': '{:.2f}'.format(100. 
* val_accuracy.avg.item()), + 'sparsity': self.pruner.calculate_sparsity(), + 'task{} ratio'.format(self.inference_dataset_idx): self.pruner.calculate_curr_task_ratio(), + 'zero ratio': self.pruner.calculate_zero_ratio()}) + t.update(1) + + return val_accuracy.avg.item() + #}}} + + #{{{ Evaluate LFW + def evalLFW(self, epoch_idx): + distance_metric = True + subtract_mean = False + self.model.eval() # switch to evaluate mode + labels, embedding_list_a, embedding_list_b = [], [], [] + with torch.no_grad(): + with tqdm(total=len(self.val_loader), + desc='Validate Epoch #{}: '.format(epoch_idx + 1), + ascii=True) as t: + for batch_idx, (data_a, data_p, label) in enumerate(self.val_loader): + data_a, data_p = data_a.cuda(), data_p.cuda() + data_a, data_p, label = Variable(data_a, volatile=True), \ + Variable(data_p, volatile=True), Variable(label) + # ==== compute output ==== + out_a = self.model.module.forward_to_embeddings(data_a) + out_p = self.model.module.forward_to_embeddings(data_p) + # do L2 normalization for features + if not distance_metric: + out_a = F.normalize(out_a, p=2, dim=1) + out_p = F.normalize(out_p, p=2, dim=1) + out_a = out_a.data.cpu().numpy() + out_p = out_p.data.cpu().numpy() + + embedding_list_a.append(out_a) + embedding_list_b.append(out_p) + # ======================== + labels.append(label.data.cpu().numpy()) + t.update(1) + + labels = np.array([sublabel for label in labels for sublabel in label]) + embedding_list_a = np.array([item for embedding in embedding_list_a for item in embedding]) + embedding_list_b = np.array([item for embedding in embedding_list_b for item in embedding]) + tpr, fpr, accuracy, val, val_std, far = fv_evaluate(embedding_list_a, embedding_list_b, labels, + distance_metric=distance_metric, subtract_mean=subtract_mean) + print('\33[91mTest set: Accuracy: {:.5f}+-{:.5f}\n\33[0m'.format(np.mean(accuracy),np.std(accuracy))) + print('\33[91mTest set: Validation rate: {:.5f}+-{:.5f} @ FAR={:.5f}\n\33[0m'.format(val,val_std, far)) + return np.mean(accuracy) + #}}} + + def one_shot_prune(self, one_shot_prune_perc): + self.pruner.one_shot_prune(one_shot_prune_perc) + + def save_checkpoint(self, optimizers, epoch_idx, save_folder): + """Saves model to file.""" + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=epoch_idx + 1) + + for name, module in self.model.module.named_modules(): + if isinstance(module, nn.Conv2d): + if module.bias is not None: + self.shared_layer_info[self.args.dataset][ + 'conv_bias'][name] = module.bias + elif isinstance(module, nn.BatchNorm2d): + self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_mean'][name] = module.running_mean + self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_var'][name] = module.running_var + self.shared_layer_info[self.args.dataset][ + 'bn_layer_weight'][name] = module.weight + self.shared_layer_info[self.args.dataset][ + 'bn_layer_bias'][name] = module.bias + elif isinstance(module, nn.Linear) and 'features' in name: + self.shared_layer_info[self.args.dataset]['fc_bias'][name] = module.bias + elif isinstance(module, nn.PReLU): + self.shared_layer_info[self.args.dataset][ + 'prelu_layer_weight'][name] = module.weight + + checkpoint = { + 'model_state_dict': self.model.module.state_dict(), + 'optimizer_network_state_dict': optimizers[0].state_dict(), + 'dataset_history': self.model.module.datasets, + 'dataset2num_classes': self.model.module.dataset2num_classes, + 'masks': self.pruner.masks, + 'shared_layer_info': self.shared_layer_info, + } + + 
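+        # The checkpoint is a plain dict, so it can be inspected offline, e.g.
+        # (the path and layer name here are hypothetical):
+        #   ckpt = torch.load('checkpoints/checkpoint-1.pth.tar')
+        #   ckpt['dataset_history']        # list of tasks seen so far
+        #   ckpt['masks']['conv1'].max()   # highest task index owning weights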
torch.save(checkpoint, filepath) + + def load_checkpoint(self, optimizers, resume_from_epoch, save_folder, restore_optimizers=True): + # Restore from a previous checkpoint, if initial_epoch is specified. + # Horovod: restore on the first worker which will broadcast weights to other workers. + if resume_from_epoch > 0: + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=resume_from_epoch) + checkpoint = torch.load(filepath) + checkpoint_keys = checkpoint.keys() + state_dict = checkpoint['model_state_dict'] + curr_model_state_dict = self.model.module.state_dict() + for name, param in state_dict.items(): + if name == 'classifier.weight' or name == 'classifier.bias' or (name == 'classifier.0.weight' or name == 'classifier.0.bias' or name == 'classifier.1.weight'): + # I DONT WANT TO DO THIS! QQ That last 3 exprs are for anglelinear and embeddings + continue + else: + curr_model_state_dict[name].copy_(param) + + # if restore_optimizers: + # if 'optimizer_network_state_dict' in checkpoint_keys: + # optimizers[0].load_state_dict(checkpoint['optimizer_network_state_dict']) + + def load_checkpoint_for_inference(self, resume_from_epoch, save_folder): + if resume_from_epoch > 0: + filepath = self.args.checkpoint_format.format(save_folder=save_folder, epoch=resume_from_epoch) + checkpoint = torch.load(filepath) + checkpoint_keys = checkpoint.keys() + state_dict = checkpoint['model_state_dict'] + curr_model_state_dict = self.model.module.state_dict() + + for name, param in state_dict.items(): + if name == 'classifier.weight' or name == 'classifier.bias' or (name == 'classifier.0.weight' or name == 'classifier.0.bias' or name == 'classifier.1.weight'): + # I DONT WANT TO DO THIS! QQ That last 3 exprs are for anglelinear and embeddings + continue + else: + curr_model_state_dict[name].copy_(param) + + # load the batch norm params and bias in convolution in correspond to curr dataset + for name, module in self.model.module.named_modules(): + if isinstance(module, nn.Conv2d): + if module.bias is not None: + module.bias = self.shared_layer_info[self.args.dataset]['conv_bias'][name] + elif isinstance(module, nn.BatchNorm2d): + module.running_mean = self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_mean'][name] + module.running_var = self.shared_layer_info[self.args.dataset][ + 'bn_layer_running_var'][name] + module.weight = self.shared_layer_info[self.args.dataset][ + 'bn_layer_weight'][name] + module.bias = self.shared_layer_info[self.args.dataset][ + 'bn_layer_bias'][name] + elif isinstance(module, nn.Linear) and 'features' in name: + module.bias = self.shared_layer_info[self.args.dataset]['fc_bias'][name] + elif isinstance(module, nn.PReLU): + module.weight = self.shared_layer_info[self.args.dataset][ + 'prelu_layer_weight'][name] diff --git a/FACE_UTILS/packnet_prune.py b/FACE_UTILS/packnet_prune.py new file mode 100644 index 0000000..61415af --- /dev/null +++ b/FACE_UTILS/packnet_prune.py @@ -0,0 +1,205 @@ +"""Handles all the pruning-related stuff.""" +from __future__ import print_function + +import collections + +import numpy as np + +import torch +import torch.nn as nn +import pdb + +class SparsePruner(object): + """Performs pruning on the given model.""" + + def __init__(self, model, masks, args, begin_prune_step, end_prune_step, inference_dataset_idx): + self.model = model + self.args = args + self.sparsity_func_exponent = 3 + self.begin_prune_step = begin_prune_step + self.end_prune_step = end_prune_step + self.last_prune_step = begin_prune_step + + self.masks 
= masks + valid_key = list(masks.keys())[0] + self.current_dataset_idx = masks[valid_key].max() + self.inference_dataset_idx = inference_dataset_idx + + def _pruning_mask(self, weights, mask, layer_name, pruning_ratio): + """Ranks weights by magnitude. Sets all below kth to 0. + Returns pruned mask. + """ + # Select all prunable weights, ie. belonging to current dataset. + tensor = weights[mask.eq(self.current_dataset_idx) | mask.eq(0)] # This will flatten weights + abs_tensor = tensor.abs() + cutoff_rank = round(pruning_ratio * tensor.numel()) + cutoff_value = abs_tensor.cpu().kthvalue(cutoff_rank)[0].cuda() # value at cutoff rank + + # Remove those weights which are below cutoff and belong to current + # dataset that we are training for. + remove_mask = weights.abs().le(cutoff_value) * mask.eq(self.current_dataset_idx) + + # mask = 1 - remove_mask + mask[remove_mask.eq(1)] = 0 + # print('Layer {}, pruned {}/{} ({:.2f}%)'.format( + # layer_name, mask.eq(0).sum(), tensor.numel(), + # float(100 * mask.eq(0).sum()) / tensor.numel())) + + return mask + + def _adjust_sparsity(self, curr_prune_step): + + p = min(1.0, + max(0.0, + ((curr_prune_step - self.begin_prune_step) + / (self.end_prune_step - self.begin_prune_step)) + )) + + sparsity = self.args.target_sparsity + \ + (self.args.initial_sparsity - self.args.target_sparsity) * pow(1-p, self.sparsity_func_exponent) + + return sparsity + + def _time_to_update_masks(self, curr_prune_step): + is_step_within_pruning_range = \ + (curr_prune_step >= self.begin_prune_step) and \ + (curr_prune_step <= self.end_prune_step) + + is_pruning_step = ( + self.last_prune_step + self.args.pruning_frequency) <= curr_prune_step + + return is_step_within_pruning_range and is_pruning_step + + def gradually_prune(self, curr_prune_step): + if self._time_to_update_masks(curr_prune_step): + self.last_prune_step = curr_prune_step + curr_pruning_ratio = self._adjust_sparsity(curr_prune_step) + # print('Pruning for dataset idx: %d' % (self.current_dataset_idx)) + # print('Pruning each layer by removing {:.2f}% of values'.format(100 * curr_pruning_ratio)) + + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + mask = self._pruning_mask(module.weight.data, self.masks[name], name, pruning_ratio=curr_pruning_ratio) + self.masks[name] = mask + module.weight.data[self.masks[name].eq(0)] = 0.0 + else: + curr_pruning_ratio = self._adjust_sparsity(self.last_prune_step) + + + return curr_pruning_ratio + + def one_shot_prune(self, one_shot_prune_perc): + """Gets pruning mask for each layer, based on previous_masks. + Sets the self.current_masks to the computed pruning masks. + """ + print('Pruning for dataset idx: %d' % (self.current_dataset_idx)) + print('Pruning each layer by removing %.2f%% of values' % (100 * one_shot_prune_perc)) + + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + mask = self._pruning_mask( + module.weight.data, self.masks[name], name, pruning_ratio=one_shot_prune_perc) + self.masks[name] = mask + + # Set pruned weights to 0. 
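+                # Tiny worked example with assumed values: if the current
+                # task owns weights [0.9, -0.1, 0.5, -0.05] and
+                # one_shot_prune_perc=0.5, the cutoff is the 2nd smallest
+                # magnitude (0.1), so -0.1 and -0.05 get mask 0 and are
+                # zeroed below.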
+ module.weight.data[self.masks[name].eq(0)] = 0.0 + + + def calculate_sparsity(self): + total_elem = 0 + zero_elem = 0 + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + mask = self.masks[name] + # total_elem += torch.sum(mask.eq(self.inference_dataset_idx) | mask.eq(0)) + # zero_elem += torch.sum(mask.eq(0)) + total_elem += torch.sum(mask.ge(self.inference_dataset_idx)) + zero_elem += torch.sum(mask.gt(self.inference_dataset_idx)) + break # because every layer has the same pruning ratio, + # so we are able to see only one layer for getting the sparsity + + if total_elem.cpu() != 0.0: + return float(zero_elem.cpu()) / float(total_elem.cpu()) + else: + return 0.0 + + def calculate_curr_task_ratio(self): + total_elem = 0 + curr_task_elem = 0 + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + mask = self.masks[name] + total_elem += mask.numel() + curr_task_elem += torch.sum(mask.eq(self.inference_dataset_idx)) + break # because every layer has the same pruning ratio, + # so we are able to see only one layer for getting the sparsity + + return float(curr_task_elem.cpu()) / total_elem + + def calculate_zero_ratio(self): + total_elem = 0 + zero_elem = 0 + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + mask = self.masks[name] + total_elem += mask.numel() + zero_elem += torch.sum(mask.eq(0)) + break # because every layer has the same pruning ratio, + # so we are able to see only one layer for getting the sparsity + + return float(zero_elem.cpu()) / total_elem + + + def do_weight_decay_and_make_grads_zero(self): + """Sets grads of fixed weights to 0.""" + assert self.masks + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + mask = self.masks[name] + # Set grads of all weights not belonging to current dataset to 0. + if module.weight.grad is not None: + module.weight.grad.data.add_(self.args.weight_decay, module.weight.data) + module.weight.grad.data[mask.ne( + self.current_dataset_idx)] = 0 + + def make_pruned_zero(self): + """Makes pruned weights 0.""" + assert self.masks + + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + layer_mask = self.masks[name] + module.weight.data[layer_mask.eq(0)] = 0.0 + + def apply_mask(self): + """To be done to retrieve weights just for a particular dataset.""" + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + weight = module.weight.data + mask = self.masks[name].cuda() + weight[mask.eq(0)] = 0.0 + weight[mask.gt(self.inference_dataset_idx)] = 0.0 + + def make_finetuning_mask(self): + """Turns previously pruned weights into trainable weights for + current dataset. 
+ """ + assert self.masks + self.current_dataset_idx += 1 + + for name, module in self.model.named_modules(): + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): + if 'classifiers' in name: + continue + mask = self.masks[name] + mask[mask.eq(0)] = self.current_dataset_idx diff --git a/FACE_UTILS/prune.py b/FACE_UTILS/prune.py new file mode 100644 index 0000000..cbedc0b --- /dev/null +++ b/FACE_UTILS/prune.py @@ -0,0 +1,254 @@ +"""Handles all the pruning-related stuff.""" +from __future__ import print_function + +import collections + +import numpy as np + +import torch +import torch.nn as nn +import pdb +import models.layers as nl +import sys + +class SparsePruner(object): + """Performs pruning on the given model.""" + + def __init__(self, model, masks, args, begin_prune_step, end_prune_step, inference_dataset_idx): + self.model = model + self.args = args + self.sparsity_func_exponent = 3 + self.begin_prune_step = begin_prune_step + self.end_prune_step = end_prune_step + self.last_prune_step = begin_prune_step + + self.masks = masks + if args.mode == 'prune' or args.mode == 'inference': + self.current_dataset_idx = self.model.module.datasets.index(args.dataset) + 1 + elif args.mode == 'finetune': + self.current_dataset_idx = len(self.model.module.datasets) - 1 + else: + print('Dont support mode: \'{}\''.format(args.mode)) + sys.exit(-1) + + # valid_key = list(masks.keys())[0] + # print(valid_key) ## TODO HERE + # self.current_dataset_idx = max(len(self.model.module.datasets) - 1, self.masks[valid_key].max()) ## TODO HERE + # print('==>', self.current_dataset_idx, len(self.model.module.datasets), self.masks[valid_key].max()) ## TODO HERE + self.inference_dataset_idx = inference_dataset_idx + + def _pruning_mask(self, weights, mask, layer_name, pruning_ratio): + """Ranks weights by magnitude. Sets all below kth to 0. + Returns pruned mask. + """ + # Select all prunable weights, ie. belonging to current dataset. + tensor = weights[mask.eq(self.current_dataset_idx) | mask.eq(0)] # This will flatten weights + abs_tensor = tensor.abs() + cutoff_rank = round(pruning_ratio * tensor.numel()) + try: + cutoff_value = abs_tensor.cpu().kthvalue(cutoff_rank)[0].cuda() # value at cutoff rank + except: + print("Not enough weights for pruning, that is to say, too little space for new task, need expand the network.") + sys.exit(2) + # Remove those weights which are below cutoff and belong to current + # dataset that we are training for. 
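+        # The mask stores task ownership rather than a 0/1 flag: entry k
+        # means "claimed by task k", and 0 means free or pruned. Sketch with
+        # assumed values and current_dataset_idx == 2:
+        #   mask    = [1, 2, 0, 2]
+        #   weights = [0.9, 0.01, 0.3, 0.8], cutoff_value = 0.05
+        #   remove_mask = (|w| <= 0.05) & (mask == 2) -> [0, 1, 0, 0]
+        # i.e. only the current task's small weights are released back to 0;
+        # weights owned by earlier tasks are never pruned here.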
+ remove_mask = weights.abs().le(cutoff_value) * mask.eq(self.current_dataset_idx) + + # mask = 1 - remove_mask + mask[remove_mask.eq(1)] = 0 + # print('Layer {}, pruned {}/{} ({:.2f}%)'.format( + # layer_name, mask.eq(0).sum(), tensor.numel(), + # float(100 * mask.eq(0).sum()) / tensor.numel())) + + return mask + + def _adjust_sparsity(self, curr_prune_step): + + p = min(1.0, + max(0.0, + ((curr_prune_step - self.begin_prune_step) + / (self.end_prune_step - self.begin_prune_step)) + )) + + sparsity = self.args.target_sparsity + \ + (self.args.initial_sparsity - self.args.target_sparsity) * pow(1-p, self.sparsity_func_exponent) + + return sparsity + + def _time_to_update_masks(self, curr_prune_step): + is_step_within_pruning_range = \ + (curr_prune_step >= self.begin_prune_step) and \ + (curr_prune_step <= self.end_prune_step) + + is_pruning_step = ( + self.last_prune_step + self.args.pruning_frequency) <= curr_prune_step + + return is_step_within_pruning_range and is_pruning_step + + def gradually_prune(self, curr_prune_step): + if self._time_to_update_masks(curr_prune_step): + self.last_prune_step = curr_prune_step + curr_pruning_ratio = self._adjust_sparsity(curr_prune_step) + # print('Pruning for dataset idx: %d' % (self.current_dataset_idx)) + # print('Pruning each layer by removing {:.2f}% of values'.format(100 * curr_pruning_ratio)) + + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + mask = self._pruning_mask(module.weight.data, self.masks[name], name, pruning_ratio=curr_pruning_ratio) + self.masks[name] = mask + module.weight.data[self.masks[name].eq(0)] = 0.0 + else: + curr_pruning_ratio = self._adjust_sparsity(self.last_prune_step) + + + return curr_pruning_ratio + + def one_shot_prune(self, one_shot_prune_perc): + """Gets pruning mask for each layer, based on previous_masks. + Sets the self.current_masks to the computed pruning masks. + """ + print('Pruning for dataset idx: %d' % (self.current_dataset_idx)) + print('Pruning each layer by removing %.2f%% of values' % (100 * one_shot_prune_perc)) + + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + mask = self._pruning_mask( + module.weight.data, self.masks[name], name, pruning_ratio=one_shot_prune_perc) + self.masks[name] = mask + + # Set pruned weights to 0. 
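+                # After re-ranking, every weight whose mask entry is 0 (never
+                # claimed, or just released above) is zeroed so the forward
+                # pass immediately reflects the new sparsity.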
+ module.weight.data[self.masks[name].eq(0)] = 0.0 + + + def calculate_sparsity(self): + total_elem = 0 + zero_elem = 0 + is_first_conv = True + + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + ## UNCOMMENT + # if is_first_conv: + # is_first_conv = False + # continue + ## UNCOMMENT + mask = self.masks[name] + total_elem += torch.sum(mask.eq(self.inference_dataset_idx) | mask.eq(0)) + zero_elem += torch.sum(mask.eq(0)) + # total_elem += torch.sum(mask.ge(self.inference_dataset_idx) | mask.eq(0)) + # zero_elem += torch.sum(mask.eq(self.inference_dataset_idx)) + ## UNCOMMENT + # break # because every layer has the same pruning ratio, + # # so we are able to see only one layer for getting the sparsity + ## UNCOMMENT + + if total_elem.cpu() != 0.0: + return float(zero_elem.cpu()) / float(total_elem.cpu()) + else: + return 0.0 + + def calculate_curr_task_ratio(self): + total_elem = 0 + curr_task_elem = 0 + is_first_conv = True + + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + ## UNCOMMENT + # if is_first_conv: + # is_first_conv = False + # continue + ## UNCOMMENT + mask = self.masks[name] + total_elem += mask.numel() + curr_task_elem += torch.sum(mask.eq(self.inference_dataset_idx)) + ## UNCOMMENT + # break # because every layer has the same pruning ratio, + # # so we are able to see only one layer for getting the sparsity + ## UNCOMMENT + + return float(curr_task_elem.cpu()) / total_elem + + def calculate_zero_ratio(self): + total_elem = 0 + zero_elem = 0 + is_first_conv = True + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + ## UNCOMMENT + # if is_first_conv: + # is_first_conv = False + # continue + ## UNCOMMENT + mask = self.masks[name] + total_elem += mask.numel() + zero_elem += torch.sum(mask.eq(0)) + ## UNCOMMENT + # break # because every layer has the same pruning ratio, + # # so we are able to see only one layer for getting the sparsity + ## UNCOMMENT + + return float(zero_elem.cpu()) / total_elem + + def calculate_shared_part_ratio(self): + total_elem = 0 + shared_elem = 0 + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + mask = self.masks[name] + total_elem += torch.sum(mask.gt(0) & mask.lt(self.inference_dataset_idx)) + shared_elem += torch.sum(torch.where(mask.gt(0) & mask.lt(self.inference_dataset_idx) & module.piggymask.gt(0.005), + torch.tensor(1).cuda(), torch.tensor(0).cuda())) + + if total_elem.cpu() != 0.0: + return float(shared_elem.cpu()) / float(total_elem.cpu()) + else: + return 0.0 + + def do_weight_decay_and_make_grads_zero(self): + """Sets grads of fixed weights to 0.""" + assert self.masks + for name, module in self.model.named_modules(): + if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear): + mask = self.masks[name] + # Set grads of all weights not belonging to current dataset to 0. 
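Throughout this class, one integer mask per layer records ownership: 0 marks a free (pruned) slot and k > 0 marks a weight frozen for task k. The gradient masking that follows uses it to train only the current task's weights, as in this toy illustration (values hypothetical):

    import torch

    mask = torch.tensor([1, 2, 0, 2, 3])   # per-weight owner: 0 = free, k = task k
    grad = torch.ones(5)
    current_dataset_idx = 2                # task currently being trained

    # only weights owned by the current task keep their gradient
    grad[mask.ne(current_dataset_idx)] = 0
    print(grad)                            # tensor([0., 1., 0., 1., 0.])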
+                if module.weight.grad is not None:
+                    module.weight.grad.data.add_(self.args.weight_decay, module.weight.data)
+                    module.weight.grad.data[mask.ne(self.current_dataset_idx)] = 0
+                if module.piggymask is not None and module.piggymask.grad is not None:
+                    if self.args.mode == 'finetune':
+                        module.piggymask.grad.data[mask.eq(0) | mask.ge(self.current_dataset_idx)] = 0
+                    elif self.args.mode == 'prune':
+                        module.piggymask.grad.data.fill_(0)
+
+    def make_pruned_zero(self):
+        """Makes pruned weights 0."""
+        assert self.masks
+
+        for name, module in self.model.named_modules():
+            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
+                layer_mask = self.masks[name]
+                module.weight.data[layer_mask.eq(0)] = 0.0
+
+    def apply_mask(self):
+        """Applies the stored masks so that only weights belonging to datasets
+        up to inference_dataset_idx remain active.
+        """
+        for name, module in self.model.named_modules():
+            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
+                weight = module.weight.data
+                mask = self.masks[name].cuda()
+                weight[mask.eq(0)] = 0.0
+                weight[mask.gt(self.inference_dataset_idx)] = 0.0
+
+    def make_finetuning_mask(self):
+        """Turns previously pruned weights into trainable weights for
+        the current dataset.
+        """
+        assert self.masks
+        self.current_dataset_idx += 1
+
+        for name, module in self.model.named_modules():
+            if isinstance(module, nl.SharableConv2d) or isinstance(module, nl.SharableLinear):
+                mask = self.masks[name]
+                mask[mask.eq(0)] = self.current_dataset_idx
diff --git a/datasets/.txt b/datasets/.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/experiment3/FvGeEmAg0_CPG_face.sh b/experiment3/FvGeEmAg0_CPG_face.sh
new file mode 100644
index 0000000..bcd16fe
--- /dev/null
+++ b/experiment3/FvGeEmAg0_CPG_face.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+# Bash cannot do floating-point arithmetic natively, so we use the `bc` utility.
+
+PREFIX=FvGeEmAg0
+TARGET_TASK_ID=4
+
+dataset=(
+    'None' # dummy
+    'face_verification'
+    'gender'
+    'emotion'
+    'age0'
+)
+
+num_classes=(
+    0 # dummy
+    4630
+    3
+    7
+    8
+)
+
+init_lr=(
+    0 # dummy
+    1e-3
+    5e-4
+    5e-4
+    5e-4
+)
+
+batch_size=(
+    0 # dummy
+    256
+    128
+    128
+    128
+)
+
+finetune_start_sparsity=(
+    0 # dummy
+    0
+    0.5
+    0.1
+    0.2
+)
+
+acc_margin=(
+    0 # dummy
+    0.005
+    0.015
+    0.005
+    0.010
+)
+
+GPU_ID=0,1,2,3
+arch='spherenet20'
+finetune_epochs=100
+network_width_multiplier=1.0
+pruning_ratio_interval=0.1
+
+for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do
+    state=2
+    while [ $state -eq 2 ]; do
+        if [ "$task_id" != "1" ]
+        then
+            CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \
+                --arch $arch \
+                --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \
+                --lr ${init_lr[task_id]} \
+                --lr_mask 5e-4 \
+                --weight_decay 4e-5 \
+                --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \
+                --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \
+                --epochs $finetune_epochs \
+                --mode finetune \
+                --batch_size ${batch_size[task_id]} \
+                --val_batch_size 1 \
+                --acc_margin ${acc_margin[task_id]} \
+                --network_width_multiplier $network_width_multiplier \
+                --jsonfile logs/baseline_face_acc.txt \
+                --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log
+        else
+            CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \
+                --arch $arch \
+                --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \
+                --lr ${init_lr[task_id]} \
+                --lr_mask 5e-4 \
+                --weight_decay 4e-5 \
+                --save_folder 
${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0005 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0005 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/FvGeEmAg1_CPG_face.sh b/experiment3/FvGeEmAg1_CPG_face.sh new file mode 100644 index 0000000..700b2aa --- /dev/null +++ b/experiment3/FvGeEmAg1_CPG_face.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Normally, bash shell cannot support floating point arthematic, thus, here we use `bc` package + +PREFIX=FvGeEmAg1 +TARGET_TASK_ID=4 + +dataset=( + 'None' # dummy + 'face_verification' + 'gender' + 'emotion' + 'age1' +) + +num_classes=( + 0 # dummy + 4630 + 3 + 7 + 8 +) + +init_lr=( + 0 # dummy + 1e-3 + 5e-4 + 5e-4 + 1e-3 +) + + +batch_size=( + 0 # dummy + 256 + 128 + 128 + 32 +) + + +finetune_start_sparsity=( + 0 # dummy + 0 + 0.5 + 0.1 + 0.2 +) + + +acc_margin=( + 0 # dummy + 0.005 + 0.015 + 0.005 + 0.005 +) + + +GPU_ID=0,1,2,3 +arch='spherenet20' +finetune_epochs=100 +network_width_multiplier=1.0 +pruning_ratio_interval=0.1 + + +for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do + state=2 + while [ $state -eq 2 ]; do + if [ "$task_id" != "1" ] + then + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + else + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? 
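The exit status captured here drives the whole script family: 2 means the accuracy margin could not be met at the current width, so the loop widens the network by 0.5 and retries; 3 means logs/baseline_face_acc.txt is missing; 4 (checked after the loop) means the goal is already met and the remaining rounds are skipped. A Python sketch of that retry protocol, with the exit-code meanings read off these branches (the flag list is abbreviated):

    import subprocess

    width = 1.0
    while True:
        ret = subprocess.run(["python", "CPG_face_main.py",
                              "--network_width_multiplier", str(width)]).returncode
        if ret == 2:       # accuracy goal missed: widen the network and retry
            width += 0.5
            continue
        if ret == 3:       # baseline accuracy file missing: cannot proceed
            raise SystemExit("provide logs/baseline_face_acc.txt first")
        break              # 0 (or 4): fall through to the pruning phase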
+ if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/FvGeEmAg2_CPG_face.sh b/experiment3/FvGeEmAg2_CPG_face.sh new file mode 100644 index 0000000..889a09d --- /dev/null +++ b/experiment3/FvGeEmAg2_CPG_face.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Normally, bash shell cannot support floating point arthematic, thus, here we use `bc` package + +PREFIX=FvGeEmAg2 +TARGET_TASK_ID=4 + +dataset=( + 'None' # dummy + 'face_verification' + 'gender' + 'emotion' + 'age2' +) + +num_classes=( + 0 # dummy + 4630 + 3 + 7 + 8 +) + +init_lr=( + 0 # dummy + 1e-3 + 5e-4 + 5e-4 + 1e-3 +) + + +batch_size=( + 0 # dummy + 256 + 128 + 128 + 32 +) + + +finetune_start_sparsity=( + 0 # dummy + 0 + 0.5 + 0.1 + 0.2 +) + + +acc_margin=( + 0 # dummy + 0.005 + 0.015 + 0.005 + 0.010 +) + + +GPU_ID=0,1,2,3 +arch='spherenet20' +finetune_epochs=100 +network_width_multiplier=1.0 +pruning_ratio_interval=0.1 + + +for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do + state=2 + while [ $state -eq 2 ]; do + if [ "$task_id" != "1" ] + then + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + else + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? 
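After the 0.1 warm-up round, the RUN_ID loop further down walks the sparsity ladder: each round reloads the previous checkpoint and prunes another 10 points, except the last, which adds only 5 (0.9 to 0.95). The same ladder, computed directly (a sketch mirroring the printf/bc arithmetic):

    start, end = 0.0, 0.1                  # the initial gradual-prune round
    for run_id in range(1, 10):
        start = end
        end = round(end + 0.1, 1) if run_id < 9 else round(end + 0.05, 2)
        print("round %d: %.2f -> %.2f" % (run_id, start, end))
    # the rounds visit 0.1 -> 0.2 -> ... -> 0.9 -> 0.95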
+ if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.00003 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.00003 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/FvGeEmAg3_CPG_face.sh b/experiment3/FvGeEmAg3_CPG_face.sh new file mode 100644 index 0000000..bc82b05 --- /dev/null +++ b/experiment3/FvGeEmAg3_CPG_face.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Normally, bash shell cannot support floating point arthematic, thus, here we use `bc` package + +PREFIX=FvGeEmAg3 +TARGET_TASK_ID=4 + +dataset=( + 'None' # dummy + 'face_verification' + 'gender' + 'emotion' + 'age3' +) + +num_classes=( + 0 # dummy + 4630 + 3 + 7 + 8 +) + +init_lr=( + 0 # dummy + 1e-3 + 5e-4 + 5e-4 + 5e-4 +) + + +batch_size=( + 0 # dummy + 256 + 128 + 128 + 32 +) + + +finetune_start_sparsity=( + 0 # dummy + 0 + 0.5 + 0.1 + 0.2 +) + + +acc_margin=( + 0 # dummy + 0.005 + 0.015 + 0.005 + 0.005 +) + + +GPU_ID=0,1,2,3 +arch='spherenet20' +finetune_epochs=100 +network_width_multiplier=1.0 +pruning_ratio_interval=0.1 + + +for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do + state=2 + while [ $state -eq 2 ]; do + if [ "$task_id" != "1" ] + then + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + else + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? 
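The network_width_multiplier these scripts grow on exit code 2 is consumed by models/spherenet.py further down, where every stage width is scaled as int(base * multiplier). A quick look at what each retry step does to the spherenet20 stage widths:

    base_widths = [64, 128, 256, 512]      # spherenet20 stage widths
    for mult in (1.0, 1.5, 2.0):
        print(mult, [int(w * mult) for w in base_widths])
    # 1.0 [64, 128, 256, 512]
    # 1.5 [96, 192, 384, 768]
    # 2.0 [128, 256, 512, 1024]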
+ if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.00001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.00001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/FvGeEmAg4_CPG_face.sh b/experiment3/FvGeEmAg4_CPG_face.sh new file mode 100644 index 0000000..7e33d9a --- /dev/null +++ b/experiment3/FvGeEmAg4_CPG_face.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Normally, bash shell cannot support floating point arthematic, thus, here we use `bc` package + +PREFIX=FvGeEmAg4 +TARGET_TASK_ID=4 + +dataset=( + 'None' # dummy + 'face_verification' + 'gender' + 'emotion' + 'age4' +) + +num_classes=( + 0 # dummy + 4630 + 3 + 7 + 8 +) + +init_lr=( + 0 # dummy + 1e-3 + 5e-4 + 5e-4 + 5e-4 +) + + +batch_size=( + 0 # dummy + 256 + 128 + 128 + 32 +) + + +finetune_start_sparsity=( + 0 # dummy + 0 + 0.5 + 0.1 + 0.2 +) + + +acc_margin=( + 0 # dummy + 0.005 + 0.015 + 0.005 + 0.005 +) + + +GPU_ID=0,1,2,3 +arch='spherenet20' +finetune_epochs=100 +network_width_multiplier=1.0 +pruning_ratio_interval=0.1 + + +for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do + state=2 + while [ $state -eq 2 ]; do + if [ "$task_id" != "1" ] + then + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + else + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? 
+ if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0001 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/FvGeEm_CPG_face.sh b/experiment3/FvGeEm_CPG_face.sh new file mode 100644 index 0000000..bca5be1 --- /dev/null +++ b/experiment3/FvGeEm_CPG_face.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# Normally, bash shell cannot support floating point arthematic, thus, here we use `bc` package + +PREFIX=FvGeEm +TARGET_TASK_ID=0 + +dataset=( + 'None' # dummy + 'face_verification' + 'gender' + 'emotion' +) + +num_classes=( + 0 # dummy + 4630 + 3 + 7 +) + +init_lr=( + 0 # dummy + 1e-3 + 5e-4 + 5e-4 +) + + +batch_size=( + 0 # dummy + 256 + 128 + 128 +) + + +finetune_start_sparsity=( + 0 # dummy + 0 + 0.5 + 0.1 +) + + +acc_margin=( + 0 # dummy + 0.005 + 0.015 + 0.005 +) + + +GPU_ID=0,1,2,3 +arch='spherenet20' +finetune_epochs=100 +network_width_multiplier=1.0 +pruning_ratio_interval=0.1 + + +for task_id in `seq $TARGET_TASK_ID $TARGET_TASK_ID`; do + state=2 + while [ $state -eq 2 ]; do + if [ "$task_id" != "1" ] + then + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id-1]}/gradual_prune/${finetune_start_sparsity[task_id]} \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + else + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr ${init_lr[task_id]} \ + --lr_mask 5e-4 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $finetune_epochs \ + --mode finetune \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --jsonfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + fi + + state=$? 
+ if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + elif [ $state -eq 3 ] + then + echo "You should provide the baseline_face_acc.txt" + exit 0 + fi + + nrof_epoch=0 + if [ "$task_id" == "1" ] + then + nrof_epoch_for_each_prune=10 + pruning_frequency=1000 + else + nrof_epoch_for_each_prune=20 + pruning_frequency=100 + fi + start_sparsity=0.0 + end_sparsity=0.1 + nrof_epoch=$nrof_epoch_for_each_prune + + # gradually pruning + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0005 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/scratch \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --jsonfile logs/baseline_face_acc.txt \ + --batch_size ${batch_size[task_id]}\ + --val_batch_size 1 \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + state=$? + if [ $state -eq 2 ] + then + network_width_multiplier=$(bc <<< $network_width_multiplier+0.5) + echo "New network_width_multiplier: $network_width_multiplier" + continue + fi + done + + if [ $state -eq 4 ] + then + continue + fi + + for RUN_ID in `seq 1 9`; do + nrof_epoch=$nrof_epoch_for_each_prune + start_sparsity=$end_sparsity + if [ $RUN_ID -lt 9 ] + then + end_sparsity=$(printf "%.1f" $(bc <<< $end_sparsity+$pruning_ratio_interval)) + else + end_sparsity=$(printf "%.2f" $(bc <<< $end_sparsity+0.05)) + fi + + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $arch \ + --dataset ${dataset[task_id]} --num_classes ${num_classes[task_id]} \ + --lr 0.0005 \ + --lr_mask 0.0 \ + --weight_decay 4e-5 \ + --save_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$end_sparsity \ + --load_folder ${PREFIX}_checkpoints/CPG/$arch/${dataset[task_id]}/gradual_prune/$start_sparsity \ + --epochs $nrof_epoch \ + --mode prune \ + --initial_sparsity=$start_sparsity \ + --target_sparsity=$end_sparsity \ + --pruning_frequency=$pruning_frequency \ + --pruning_interval=4 \ + --batch_size ${batch_size[task_id]} \ + --val_batch_size 1 \ + --jsonfile logs/baseline_face_acc.txt \ + --acc_margin ${acc_margin[task_id]} \ + --network_width_multiplier $network_width_multiplier \ + --log_path ${PREFIX}_checkpoints/CPG/$arch/run.log + if [ $? 
-eq 4 ] + then + break + fi + done +done diff --git a/experiment3/baseline_face.sh b/experiment3/baseline_face.sh new file mode 100644 index 0000000..b4dd26c --- /dev/null +++ b/experiment3/baseline_face.sh @@ -0,0 +1,61 @@ +#!/bin/bash + + +DATASETS=( + 'None' # dummy + 'face_verification' + 'emotion' + 'gender' + 'age0' + 'age1' + 'age2' + 'age3' + 'age4' +) + +NUM_CLASSES=( + 0 # dummy + 4630 + 7 + 3 + 8 + 8 + 8 + 8 + 8 +) + +LRS=( + 0.0 # dummy + 0.0 + 1e-3 + 1e-4 + 1e-3 + 1e-3 + 1e-3 + 1e-3 + 1e-3 +) + +GPU_ID=0,1,2,3 +ARCH='spherenet20' +FINETUNE_EPOCHS=100 + +# CNN20 pretrained on the face verification task +echo {\"face_verification\": \"0.9942\"} > logs/baseline_face_acc.txt + +for TASK_ID in `seq 2 8`; do + CUDA_VISIBLE_DEVICES=$GPU_ID python packnet_face_main.py \ + --arch $ARCH \ + --dataset ${DATASETS[TASK_ID]} \ + --num_classes ${NUM_CLASSES[TASK_ID]} \ + --lr ${LRS[TASK_ID]} \ + --weight_decay 4e-5 \ + --batch_size 32 \ + --val_batch_size 1 \ + --save_folder baseline_checkpoints/$ARCH/${DATASETS[TASK_ID]} \ + --epochs $FINETUNE_EPOCHS \ + --mode finetune \ + --logfile logs/baseline_face_acc.txt \ + --use_vgg_pretrained +done diff --git a/experiment3/inference_FvGeEmAg.sh b/experiment3/inference_FvGeEmAg.sh new file mode 100644 index 0000000..a9cc2e4 --- /dev/null +++ b/experiment3/inference_FvGeEmAg.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Usage: +# bash scripts/inference_FvGeEmAg.sh FvGeEmAg0 0 0.2 age0 logs/FvGeEmAg0_s02.log +# bash scripts/inference_FvGeEmAg.sh FvGeEmAg1 0 0.3 age1 logs/FvGeEmAg1_s03.log +# bash scripts/inference_FvGeEmAg.sh FvGeEmAg2 0 0.1 age2 logs/FvGeEmAg2_s01.log +# bash scripts/inference_FvGeEmAg.sh FvGeEmAg3 0 0.1 age3 logs/FvGeEmAg3_s01.log +# bash scripts/inference_FvGeEmAg.sh FvGeEmAg4 0 0.2 age4 logs/FvGeEmAg4_s02.log + +PREFIX=$1 +GPU_ID=$2 +TARGET_SPARSITY=$3 +AGEFOLD=$4 +LOG_PATH=$5 + +DATASET=( + 'face_verification' + 'gender' + 'emotion' +) + +NUM_CLASSES=( + 4630 + 3 + 7 +) + +ARCH='spherenet20' +NETWORK_WIDTH_MULTIPLIER=1.0 +SPARSITY_DIR=${PREFIX}_checkpoints/CPG/$ARCH/$AGEFOLD/gradual_prune/$TARGET_SPARSITY + + +echo "In directory: " $SPARSITY_DIR +for task_id in `seq 0 2`; do + CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $ARCH \ + --dataset ${DATASET[task_id]} --num_classes ${NUM_CLASSES[task_id]} \ + --load_folder $SPARSITY_DIR \ + --mode inference \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path $LOG_PATH \ + --network_width_multiplier $NETWORK_WIDTH_MULTIPLIER +done + + +CUDA_VISIBLE_DEVICES=$GPU_ID python CPG_face_main.py \ + --arch $ARCH \ + --dataset $AGEFOLD --num_classes 8 \ + --load_folder $SPARSITY_DIR \ + --mode inference \ + --jsonfile logs/baseline_face_acc.txt \ + --log_path $LOG_PATH \ + --network_width_multiplier $NETWORK_WIDTH_MULTIPLIER diff --git a/models/__init__.py b/models/__init__.py index 3c05fdd..4a2c04f 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,2 +1,3 @@ from .resnet import * -from .vgg import * \ No newline at end of file +from .vgg import * +from .spherenet import * diff --git a/models/net_sphere.py b/models/net_sphere.py deleted file mode 100644 index 40f1f54..0000000 --- a/models/net_sphere.py +++ /dev/null @@ -1,239 +0,0 @@ -# -*- coding: utf-8 -*- -import torch -import torch.nn as nn -from torch.autograd import Variable -import torch.nn.functional as F -from torch.nn import Parameter -import math -import torchvision.models as models - -class AngleLinear(nn.Module): - def __init__(self, in_features, out_features, m = 4): - super(AngleLinear, self).__init__() - 
self.in_features = in_features
-        self.out_features = out_features
-        self.weight = Parameter(torch.Tensor(in_features,out_features))
-        self.weight.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5)
-        self.m = m
-        self.mlambda = [
-            lambda x: x**0, # cos(0*theta)=1
-            lambda x: x**1, # cos(1*theta)=cos(theta)
-            lambda x: 2*x**2-1, # cos(2*theta)=2*cos(theta)**2-1
-            lambda x: 4*x**3-3*x, # cos(3*theta)=4*cos(theta)**3-3cos(theta)
-            lambda x: 8*x**4-8*x**2+1,
-            lambda x: 16*x**5-20*x**3+5*x
-        ]
-
-    def forward(self, input):
-        # input holds the input features, (B, C): B is the batch size, C is the total number of classes
-        x = input # size=(B,F); F is the feature length, e.g. 512
-        w = self.weight # size=(F,Classnum) F=in_features Classnum=out_features
-
-        ww = w.renorm(2,1,1e-5).mul(1e5)
-        # Normalize w: renorm applies the L2 norm along dimension 1, clipping norms above
-        # 1e-5 and multiplying by 1e5, so each column ends up normalized to 1. If 1e-5 were
-        # set too large, some very small values could end up below 1 after clipping.
-        # Note: dimension 0 normalizes each row (each row's squared sum becomes 1), while
-        # dimension 1 normalizes each column. Since each column of w holds the weights for x,
-        # we normalize along the columns here. To normalize x instead, we would normalize
-        # each row, i.e. the second argument would be 0.
-        xlen = x.pow(2).sum(1).pow(0.5) # size=B
-        # Square x, sum across columns, then take the square root to get each row's norm;
-        # the result has the size of dimension 0, i.e. B.
-        # (x itself is not normalized, but the cosine computation needs normalization, so we
-        # compute the norms first. For w, it is unclear why renorm is used instead of this
-        # same approach.)
-        wlen = ww.pow(2).sum(0).pow(0.5) # size=Classnum
-        # Square the weights w, sum across rows, then take the square root to get each
-        # column's norm. (In theory these were already normalized and should equal 1, but on
-        # the first pass through here they are not, which is puzzling.) The result has the
-        # size of dimension 1, i.e. C.
-
-        cos_theta = x.mm(ww) # size=(B,Classnum)
-        # Matrix multiply (B,F)*(F,C)=(B,C) to get the cosines; this is only multiply-accumulate,
-        # so they are not yet normalized.
-        cos_theta = cos_theta / xlen.view(-1,1) / wlen.view(1,-1)
-        # Divide each cosine by the corresponding row and column norms to normalize it.
-        cos_theta = cos_theta.clamp(-1,1)
-        # Clamp the cosines to [-1,1]. In theory this is unnecessary: once w and x are
-        # normalized, the cosines cannot leave this range.
-        # ------------------------------------------------
-        cos_m_theta = self.mlambda[self.m](cos_theta)
-        # Compute cos_m_theta from cos_theta; mlambda holds the polynomial expansions of cos(m*theta).
-        theta = Variable(cos_theta.data.acos())
-        # Recover the angle theta via arccos, (B,C).
-        k = (self.m*theta/3.14159265).floor()
-        # Compute k, (B,C): to guarantee theta > k*pi/m, take floor(m*theta/pi).
-        n_one = k*0.0 - 1
-        # Build a matrix of -1s with the same shape as k, (B,C).
-        phi_theta = (n_one**k) * cos_m_theta - 2*k
-        # Apply the formula from the paper to get phi_theta, (B,C).
-        # --------------------------------------------
-        cos_theta = cos_theta * xlen.view(-1,1)
-        # Since x is not actually normalized, cos_theta is scaled back by the row norms, (B,C).
-        phi_theta = phi_theta * xlen.view(-1,1)
-        # Likewise, phi_theta is scaled back by the row norms, (B,C).
-        output = (cos_theta,phi_theta)
-        return output # size=(B,Classnum,2)
-
-
-class AngleLoss(nn.Module):
-    def __init__(self, gamma=0):
-        super(AngleLoss, self).__init__()
-        self.gamma = gamma
-        self.it = 0
-        self.LambdaMin = 5.0
-        self.LambdaMax = 1500.0
-        self.lamb = 1500.0
-
-    def forward(self, input, target):
-        self.it += 1
-        cos_theta,phi_theta = input # cos_theta (B,C); phi_theta (B,C)
-        target = target.view(-1,1) #size=(B,1)
-
-        index = cos_theta.data * 0.0 #size=(B,Classnum)
-        # All-zeros matrix with the same shape as cos_theta, (B,C).
-        index.scatter_(1,target.data.view(-1,1),1)
-        # One-hot matrix: in row i only position target[i] is 1, everything else is 0.
-        index = index.byte() # index is float; convert to byte
-        index = Variable(index)
-
-        self.lamb = max(self.LambdaMin,self.LambdaMax/(1+0.1*self.it )) # anneal lamb
-        output = cos_theta * 1.0 #size=(B,Classnum)
-        # Using output = cos_theta directly may fail to converge (untested here, but in other
-        # programs indexing the input with [index] directly failed to converge, while
-        # multiplying by 1.0 first converged).
-        output[index] -= cos_theta[index]*(1.0+0)/(1+self.lamb) # This line and the next apply the formula to the target[i] entries to produce the final output.
-        output[index] += phi_theta[index]*(1.0+0)/(1+self.lamb)
-
-        logpt = F.log_softmax(output,dim=1) # log-probabilities; this line was changed to use dim=1
-        logpt = logpt.gather(1,target) # What follows is the cross-entropy computation (similar to focal loss; with gamma=0 it reduces to cross-entropy).
-        logpt = logpt.view(-1)
-        pt = Variable(logpt.data.exp()) # ln(e) = 1
-
-        loss = -1 * (1-pt)**self.gamma * logpt
-        loss = loss.mean()
-
-        # target = target.view(-1) # To simplify, these two lines could in theory compute the cross-entropy directly (untested here, but they trained fine in other programs).
-        # loss = F.cross_entropy(cos_theta, target)
-
-        return loss
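The two classes above implement the SphereFace A-Softmax margin: AngleLinear replaces the target logit cos(theta) with psi(theta) = (-1)^k * cos(m*theta) - 2k, a monotonically decreasing surrogate, and AngleLoss blends psi back toward plain cos via the annealed lambda. A compact numeric sketch of psi for m = 4:

    import math

    def psi(theta, m=4):
        # psi(theta) = (-1)^k * cos(m*theta) - 2k  for theta in [k*pi/m, (k+1)*pi/m]
        k = math.floor(m * theta / math.pi)
        return (-1) ** k * math.cos(m * theta) - 2 * k

    for deg in (0, 30, 60, 90, 120):
        t = math.radians(deg)
        print(deg, round(math.cos(t), 3), round(psi(t), 3))
    # psi falls from 1 at 0 degrees to -7 at 180 degrees, much faster than cos,
    # so the target class must win by a large angular margin.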
-
-class sphere(nn.Module):
-    def __init__(self,embedding_size,classnum,feature=False):
-        super(sphere, self).__init__()
-        self.embedding_size = embedding_size
-        self.classnum = classnum
-        self.feature = feature
-        #input = B*3*112*112
-        self.conv1_1 = nn.Conv2d(3,64,3,2,1) #=>B*64*56*56
-        self.relu1_1 = nn.PReLU(64)
-        self.conv1_2 = nn.Conv2d(64,64,3,1,1)
-        self.relu1_2 = nn.PReLU(64)
-        self.conv1_3 = nn.Conv2d(64,64,3,1,1)
-        self.relu1_3 = nn.PReLU(64)
-
-        self.conv2_1 = nn.Conv2d(64,128,3,2,1) #=>B*128*28*28
-        self.relu2_1 = nn.PReLU(128)
-        self.conv2_2 = nn.Conv2d(128,128,3,1,1)
-        self.relu2_2 = nn.PReLU(128)
-        self.conv2_3 = nn.Conv2d(128,128,3,1,1)
-        self.relu2_3 = nn.PReLU(128)
-
-        self.conv2_4 = nn.Conv2d(128,128,3,1,1) #=>B*128*28*28
-        self.relu2_4 = nn.PReLU(128)
-        self.conv2_5 = nn.Conv2d(128,128,3,1,1)
-        self.relu2_5 = nn.PReLU(128)
-
-        self.conv3_1 = nn.Conv2d(128,256,3,2,1) #=>B*256*14*14
-        self.relu3_1 = nn.PReLU(256)
-        self.conv3_2 = nn.Conv2d(256,256,3,1,1)
-        self.relu3_2 = nn.PReLU(256)
-        self.conv3_3 = nn.Conv2d(256,256,3,1,1)
-        self.relu3_3 = nn.PReLU(256)
-
-        self.conv3_4 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14
-        self.relu3_4 = nn.PReLU(256)
-        self.conv3_5 = nn.Conv2d(256,256,3,1,1)
-        self.relu3_5 = nn.PReLU(256)
-
-        self.conv3_6 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14
-        self.relu3_6 = nn.PReLU(256)
-        self.conv3_7 = nn.Conv2d(256,256,3,1,1)
-        self.relu3_7 = nn.PReLU(256)
-
-        self.conv3_8 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14
-        self.relu3_8 = nn.PReLU(256)
-        self.conv3_9 = nn.Conv2d(256,256,3,1,1)
-        self.relu3_9 = nn.PReLU(256)
-
-        self.conv4_1 = nn.Conv2d(256,512,3,2,1) #=>B*512*7*7
-        self.relu4_1 = nn.PReLU(512)
-        self.conv4_2 = nn.Conv2d(512,512,3,1,1)
-        self.relu4_2 = nn.PReLU(512)
-        self.conv4_3 = nn.Conv2d(512,512,3,1,1)
-        self.relu4_3 = nn.PReLU(512)
-
-        self.fc5 = nn.Linear(512*7*7,self.embedding_size)
-        self.fc6 = AngleLinear(self.embedding_size,self.classnum)
-
-    def l2_norm(self,input):
-        input_size = input.size()
-        buffer = torch.pow(input, 2)
-        normp = torch.sum(buffer, 1).add_(1e-10)
-        norm = torch.sqrt(normp)
-        _output = torch.div(input, norm.view(-1, 1).expand_as(input))
-        output = _output.view(input_size)
-        return output
-
-    def forward(self, x):
-        x = self.relu1_1(self.conv1_1(x))
-        x = x + self.relu1_3(self.conv1_3(self.relu1_2(self.conv1_2(x))))
-
-        x = self.relu2_1(self.conv2_1(x))
-        x = x + self.relu2_3(self.conv2_3(self.relu2_2(self.conv2_2(x))))
-        x = x + self.relu2_5(self.conv2_5(self.relu2_4(self.conv2_4(x))))
-
-        x = self.relu3_1(self.conv3_1(x))
-        x = x + self.relu3_3(self.conv3_3(self.relu3_2(self.conv3_2(x))))
-        x = x + self.relu3_5(self.conv3_5(self.relu3_4(self.conv3_4(x))))
-        x = x + self.relu3_7(self.conv3_7(self.relu3_6(self.conv3_6(x))))
-        x = x + self.relu3_9(self.conv3_9(self.relu3_8(self.conv3_8(x))))
-
-        x = self.relu4_1(self.conv4_1(x))
-        x = x + self.relu4_3(self.conv4_3(self.relu4_2(self.conv4_2(x))))
-
-        x = x.view(x.size(0),-1)
-        x = self.fc5(x)
-        #x = self.l2_norm(x)
-        if self.feature:
-            return x
-
-        x = self.fc6(x)
-        return x
-
-class sphereVGG(nn.Module):
-    def __init__(self,embedding_size,classnum,feature=False):
-        super(sphereVGG, self).__init__()
-        self.embedding_size = embedding_size
-        self.classnum = classnum
-        self.feature = feature
-        # load feature extractor from vgg16_bn pretrained-model
-        #self.vgg16_bn_feat_extractor = models.vgg16_bn(pretrained=False).features
-        self.vgg16_bn_feat_extractor = 
nn.Sequential(*list(models.vgg16_bn(pretrained=False).features)) - # concatenate the embedding layer - self.fc5 = nn.Linear(512*5*5,self.embedding_size) - #self.fc6 = AngleLinear(self.embedding_size,self.classnum) - self.fc6 = nn.Linear(self.embedding_size,self.classnum) - - def l2_norm(self,input): - input_size = input.size() - buffer = torch.pow(input, 2) - normp = torch.sum(buffer, 1).add_(1e-10) - norm = torch.sqrt(normp) - _output = torch.div(input, norm.view(-1, 1).expand_as(input)) - output = _output.view(input_size) - return output - - def forward(self, x): - x = self.vgg16_bn_feat_extractor(x) - x = x.view(x.size(0),-1) - x = self.fc5(x) - #x = self.l2_norm(x) - if self.feature: - return x - x = self.fc6(x) - return x \ No newline at end of file diff --git a/models/spherenet.py b/models/spherenet.py new file mode 100644 index 0000000..758cb6c --- /dev/null +++ b/models/spherenet.py @@ -0,0 +1,249 @@ +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import models.layers as nl +import pdb +from torch.nn.parameter import Parameter +from torch.autograd import Variable +import torch.nn.functional as F + +__all__ = ['spherenet20', 'AngleLoss'] + + +class View(nn.Module): + """Changes view using a nn.Module.""" + + def __init__(self, *shape): + super(View, self).__init__() + self.shape = shape + + def forward(self, input): + return input.view(*self.shape) + + +class AngleLoss(nn.Module): + def __init__(self, gamma=0): + super(AngleLoss, self).__init__() + self.gamma = gamma + self.it = 0 + self.LambdaMin = 5.0 + self.LambdaMax = 1500.0 + self.lamb = 1500.0 + + def forward(self, input, target): + self.it += 1 + cos_theta,phi_theta = input + target = target.view(-1,1) + + index = cos_theta.data * 0.0 + index.scatter_(1,target.data.view(-1,1),1) + index = index.byte() + index = Variable(index) + + self.lamb = max(self.LambdaMin,self.LambdaMax/(1+0.1*self.it )) + output = cos_theta * 1.0 + output[index] -= cos_theta[index]*(1.0+0)/(1+self.lamb) + output[index] += phi_theta[index]*(1.0+0)/(1+self.lamb) + + logpt = F.log_softmax(output,dim=1) + logpt = logpt.gather(1,target) + logpt = logpt.view(-1) + pt = Variable(logpt.data.exp()) + + loss = -1 * (1-pt)**self.gamma * logpt + loss = loss.mean() + return loss + + +class AngleLinear(nn.Module): + def __init__(self, in_features, out_features, m = 4): + super(AngleLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter(torch.Tensor(in_features,out_features)) + self.weight.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) + self.m = m + self.mlambda = [ + lambda x: x**0, # cos(0*theta)=1 + lambda x: x**1, # cos(1*theta)=cos(theta) + lambda x: 2*x**2-1, # cos(2*theta)=2*cos(theta)**2-1 + lambda x: 4*x**3-3*x, # cos(3*theta)=4*cos(theta)**3-3cos(theta) + lambda x: 8*x**4-8*x**2+1, + lambda x: 16*x**5-20*x**3+5*x + ] + + def forward(self, input): + x = input + w = self.weight + ww = w.renorm(2,1,1e-5).mul(1e5) + xlen = x.pow(2).sum(1).pow(0.5) + wlen = ww.pow(2).sum(0).pow(0.5) + cos_theta = x.mm(ww) + cos_theta = cos_theta / xlen.view(-1,1) / wlen.view(1,-1) + cos_theta = cos_theta.clamp(-1,1) + cos_m_theta = self.mlambda[self.m](cos_theta) + theta = Variable(cos_theta.data.acos()) + k = (self.m*theta/3.14159265).floor() + n_one = k*0.0 - 1 + phi_theta = (n_one**k) * cos_m_theta - 2*k + cos_theta = cos_theta * xlen.view(-1,1) + phi_theta = phi_theta * xlen.view(-1,1) + output = (cos_theta,phi_theta) + return output + + +class SphereNet(nn.Module): + def 
__init__(self, dataset_history, dataset2num_classes, network_width_multiplier=1.0, shared_layer_info={}, init_weights=True): + super(SphereNet, self).__init__() + self.network_width_multiplier = network_width_multiplier + self.make_feature_layers() + + self.shared_layer_info = shared_layer_info + self.datasets = dataset_history + self.classifiers = nn.ModuleList() + self.dataset2num_classes = dataset2num_classes + + if self.datasets: + self._reconstruct_classifiers() + + if init_weights: + self._initialize_weights() + return + + def forward(self, x): + x = self.relu1_1(self.conv1_1(x)) + x = x + self.relu1_3(self.conv1_3(self.relu1_2(self.conv1_2(x)))) + x = self.relu2_1(self.conv2_1(x)) + x = x + self.relu2_3(self.conv2_3(self.relu2_2(self.conv2_2(x)))) + x = x + self.relu2_5(self.conv2_5(self.relu2_4(self.conv2_4(x)))) + x = self.relu3_1(self.conv3_1(x)) + x = x + self.relu3_3(self.conv3_3(self.relu3_2(self.conv3_2(x)))) + x = x + self.relu3_5(self.conv3_5(self.relu3_4(self.conv3_4(x)))) + x = x + self.relu3_7(self.conv3_7(self.relu3_6(self.conv3_6(x)))) + x = x + self.relu3_9(self.conv3_9(self.relu3_8(self.conv3_8(x)))) + x = self.relu4_1(self.conv4_1(x)) + x = x + self.relu4_3(self.conv4_3(self.relu4_2(self.conv4_2(x)))) + x = self.flatten(x) + x = self.classifier(x) + return x + + def forward_to_embeddings(self, x): + x = self.relu1_1(self.conv1_1(x)) + x = x + self.relu1_3(self.conv1_3(self.relu1_2(self.conv1_2(x)))) + x = self.relu2_1(self.conv2_1(x)) + x = x + self.relu2_3(self.conv2_3(self.relu2_2(self.conv2_2(x)))) + x = x + self.relu2_5(self.conv2_5(self.relu2_4(self.conv2_4(x)))) + x = self.relu3_1(self.conv3_1(x)) + x = x + self.relu3_3(self.conv3_3(self.relu3_2(self.conv3_2(x)))) + x = x + self.relu3_5(self.conv3_5(self.relu3_4(self.conv3_4(x)))) + x = x + self.relu3_7(self.conv3_7(self.relu3_6(self.conv3_6(x)))) + x = x + self.relu3_9(self.conv3_9(self.relu3_8(self.conv3_8(x)))) + x = self.relu4_1(self.conv4_1(x)) + x = x + self.relu4_3(self.conv4_3(self.relu4_2(self.conv4_2(x)))) + x = self.flatten(x) + x = self.classifier[0](x) + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nl.SharableConv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.PReLU): + nn.init.constant_(m.weight, 0.25) + return + + def _reconstruct_classifiers(self): + for dataset, num_classes in self.dataset2num_classes.items(): + if 'face_verification' in dataset: + embedding_size = 512 + classifier_module = nn.Sequential(nn.Linear(int(self.shared_layer_info[dataset]['network_width_multiplier']*512)*7*7, + embedding_size), + AngleLinear(embedding_size, num_classes)) + self.classifiers.append(classifier_module) + else: + self.classifiers.append(nn.Linear(int(self.shared_layer_info[dataset]['network_width_multiplier']*512)*7*7, + num_classes)) + return + + def add_dataset(self, dataset, num_classes): + """Adds a new dataset to the classifier.""" + if dataset not in self.datasets: + self.datasets.append(dataset) + self.dataset2num_classes[dataset] = num_classes + if 'face_verification' in dataset: + embedding_size = 512 + classifier_module = nn.Sequential(nn.Linear(int(self.network_width_multiplier*512)*7*7, embedding_size), + AngleLinear(embedding_size, num_classes)) + self.classifiers.append(classifier_module) + nn.init.normal_(classifier_module[0].weight, 0, 0.01) + nn.init.constant_(classifier_module[0].bias, 0) + nn.init.normal_(classifier_module[1].weight, 0, 0.01) + else: + 
self.classifiers.append(nn.Linear(int(self.network_width_multiplier*512)*7*7, num_classes)) + nn.init.normal_(self.classifiers[self.datasets.index(dataset)].weight, 0, 0.01) + nn.init.constant_(self.classifiers[self.datasets.index(dataset)].bias, 0) + return + + def set_dataset(self, dataset): + """Change the active classifier.""" + assert dataset in self.datasets + self.classifier = self.classifiers[self.datasets.index(dataset)] + return + + def make_feature_layers(self): + ext = self.network_width_multiplier + self.conv1_1 = nl.SharableConv2d(3,int(64*ext),3,2,1) #=>B*int(64*ext)*56*56 + self.relu1_1 = nn.PReLU(int(64*ext)) + self.conv1_2 = nl.SharableConv2d(int(64*ext), int(64*ext),3,1,1) + self.relu1_2 = nn.PReLU(int(64*ext)) + self.conv1_3 = nl.SharableConv2d(int(64*ext), int(64*ext),3,1,1) + self.relu1_3 = nn.PReLU(int(64*ext)) + + self.conv2_1 = nl.SharableConv2d(int(64*ext), int(128*ext),3,2,1) #=>B*int(128*ext)*28*28 + self.relu2_1 = nn.PReLU(int(128*ext)) + self.conv2_2 = nl.SharableConv2d(int(128*ext),int(128*ext),3,1,1) + self.relu2_2 = nn.PReLU(int(128*ext)) + self.conv2_3 = nl.SharableConv2d(int(128*ext),int(128*ext),3,1,1) + self.relu2_3 = nn.PReLU(int(128*ext)) + + self.conv2_4 = nl.SharableConv2d(int(128*ext),int(128*ext),3,1,1) #=>B*int(128*ext)*28*28 + self.relu2_4 = nn.PReLU(int(128*ext)) + self.conv2_5 = nl.SharableConv2d(int(128*ext),int(128*ext),3,1,1) + self.relu2_5 = nn.PReLU(int(128*ext)) + + + self.conv3_1 = nl.SharableConv2d(int(128*ext),int(256*ext),3,2,1) #=>B*int(256*ext)*14*14 + self.relu3_1 = nn.PReLU(int(256*ext)) + self.conv3_2 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) + self.relu3_2 = nn.PReLU(int(256*ext)) + self.conv3_3 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) + self.relu3_3 = nn.PReLU(int(256*ext)) + + self.conv3_4 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) #=>B*int(256*ext)*14*14 + self.relu3_4 = nn.PReLU(int(256*ext)) + self.conv3_5 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) + self.relu3_5 = nn.PReLU(int(256*ext)) + + self.conv3_6 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) #=>B*int(256*ext)*14*14 + self.relu3_6 = nn.PReLU(int(256*ext)) + self.conv3_7 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) + self.relu3_7 = nn.PReLU(int(256*ext)) + + self.conv3_8 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) #=>B*int(256*ext)*14*14 + self.relu3_8 = nn.PReLU(int(256*ext)) + self.conv3_9 = nl.SharableConv2d(int(256*ext),int(256*ext),3,1,1) + self.relu3_9 = nn.PReLU(int(256*ext)) + self.conv4_1 = nl.SharableConv2d(int(256*ext),int(512*ext),3,2,1) #=>B*int(512*ext)*7*7 + self.relu4_1 = nn.PReLU(int(512*ext)) + self.conv4_2 = nl.SharableConv2d(int(512*ext),int(512*ext),3,1,1) + self.relu4_2 = nn.PReLU(int(512*ext)) + self.conv4_3 = nl.SharableConv2d(int(512*ext),int(512*ext),3,1,1) + self.relu4_3 = nn.PReLU(int(512*ext)) + self.flatten = View(-1, int(ext*512)*7*7) + return + + +def spherenet20(dataset_history=[], dataset2num_classes={}, network_width_multiplier=1.0, shared_layer_info={}, **kwargs): + return SphereNet(dataset_history, dataset2num_classes, network_width_multiplier, shared_layer_info, **kwargs) diff --git a/models/vgg_with_one_mask.py b/models/vgg_with_one_mask.py deleted file mode 100644 index 45d3700..0000000 --- a/models/vgg_with_one_mask.py +++ /dev/null @@ -1,282 +0,0 @@ -import torch.nn as nn -import torch.utils.model_zoo as model_zoo -import models.layers_with_one_mask as nl -import pdb - -__all__ = [ - 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 
'vgg19_bn', 'vgg19', 'custom_vgg', 'custom_vgg_cifar100' -] - - -model_urls = { - 'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', - 'vgg13': 'https://download.pytorch.org/models/vgg13-c768596a.pth', - 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', - 'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', - 'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', - 'vgg13_bn': 'https://download.pytorch.org/models/vgg13_bn-abd245e5.pth', - 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', - 'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth', -} - -class View(nn.Module): - """Changes view using a nn.Module.""" - - def __init__(self, *shape): - super(View, self).__init__() - self.shape = shape - - def forward(self, input): - return input.view(*self.shape) - -class VGG(nn.Module): - def __init__(self, features, dataset_history, dataset2num_classes, network_width_multiplier=1.0, shared_layer_info={}, init_weights=True, progressive_init=False): - super(VGG, self).__init__() - self.features = features - self.network_width_multiplier = network_width_multiplier - self.shared_layer_info = shared_layer_info - # self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) - - self.datasets, self.classifiers = dataset_history, nn.ModuleList() - self.dataset2num_classes = dataset2num_classes - - if self.datasets: - self._reconstruct_classifiers() - - if init_weights: - self._initialize_weights() - - if progressive_init: - self._initialize_weights_2() - - def forward(self, x): - x = self.features(x) - # x = self.avgpool(x) - x = self.classifier(x) - return x - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nl.SharableConv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nl.SharableLinear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.constant_(m.bias, 0) - - def _initialize_weights_2(self): - for m in self.modules(): - if isinstance(m, nl.SharableConv2d): - nn.init.normal_(m.weight, 0, 0.01) - - def _reconstruct_classifiers(self): - for dataset, num_classes in self.dataset2num_classes.items(): - self.classifiers.append(nn.Linear(int(self.shared_layer_info[dataset]['network_width_multiplier'] * 4096), num_classes)) - - def add_dataset(self, dataset, num_classes): - """Adds a new dataset to the classifier.""" - if dataset not in self.datasets: - self.datasets.append(dataset) - self.dataset2num_classes[dataset] = num_classes - self.classifiers.append(nn.Linear(int(4096*self.network_width_multiplier), num_classes)) - nn.init.normal_(self.classifiers[self.datasets.index(dataset)].weight, 0, 0.01) - nn.init.constant_(self.classifiers[self.datasets.index(dataset)].bias, 0) - - def set_dataset(self, dataset): - """Change the active classifier.""" - assert dataset in self.datasets - self.classifier = self.classifiers[self.datasets.index(dataset)] - -def make_layers_cifar100(cfg, network_width_multiplier, batch_norm=False, groups=1): - layers = [] - in_channels = 3 - - for v in cfg: - if v == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - if in_channels == 3: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False) - else: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), 
kernel_size=3, padding=1, bias=False, groups=groups) - - if batch_norm: - layers += [conv2d, nn.BatchNorm2d(int(v * network_width_multiplier)), nn.ReLU(inplace=True)] - else: - layers += [conv2d, nn.ReLU(inplace=True)] - in_channels = int(v * network_width_multiplier) - - layers += [ - View(-1, int(512*network_width_multiplier)), - nl.SharableLinear(int(512*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - nl.SharableLinear(int(4096*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - ] - - return nn.Sequential(*layers) - -def make_layers(cfg, network_width_multiplier, batch_norm=False, groups=1): - layers = [] - in_channels = 3 - - for v in cfg: - if v == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - if in_channels == 3: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False) - else: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False, groups=groups) - - if batch_norm: - layers += [conv2d, nn.BatchNorm2d(int(v * network_width_multiplier)), nn.ReLU(inplace=True)] - else: - layers += [conv2d, nn.ReLU(inplace=True)] - in_channels = int(v * network_width_multiplier) - - layers += [ - View(-1, int(512*network_width_multiplier)*7*7), - nl.SharableLinear(int(512*network_width_multiplier)*7*7, int(4096*network_width_multiplier)), - nn.ReLU(True), - # We need Dropout() for 224x224 - nn.Dropout(), - nl.SharableLinear(int(4096*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - nn.Dropout() - ] - - return nn.Sequential(*layers) - -cfg = { - 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -def vgg11(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 11-layer model (configuration "A") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['A']), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg11'])) - return model - - -def vgg11_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 11-layer model (configuration "A") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['A'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg11_bn'])) - return model - - -def vgg13(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 13-layer model (configuration "B") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['B']), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg13'])) - return model - - -def vgg13_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 
13-layer model (configuration "B") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['B'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg13_bn'])) - return model - - -def vgg16(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 16-layer model (configuration "D") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['D']), dataset_history, dataset2num_classes, **kwargs) - - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg16'])) - return model - - -def vgg16_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 16-layer model (configuration "D") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['D'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg16_bn'])) - return model - - -def vgg19(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 19-layer model (configuration "E") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['E']), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg19'])) - return model - - -def vgg19_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 19-layer model (configuration 'E') with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['E'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg19_bn'])) - return model - -def custom_vgg_cifar100(custom_cfg, dataset_history=[], dataset2num_classes={}, network_width_multiplier=1.0, groups=1, shared_layer_info={}, **kwargs): - return VGG(make_layers_cifar100(custom_cfg, network_width_multiplier, batch_norm=True, groups=groups), dataset_history, - dataset2num_classes, network_width_multiplier, shared_layer_info, **kwargs) - -def custom_vgg(custom_cfg, dataset_history=[], dataset2num_classes={}, network_width_multiplier=1.0, groups=1, shared_layer_info={}, **kwargs): - return VGG(make_layers(custom_cfg, network_width_multiplier, batch_norm=True, groups=groups), dataset_history, - dataset2num_classes, network_width_multiplier, shared_layer_info, **kwargs) \ No newline at end of file diff --git a/models_with_one_mask/__init__.py b/models_with_one_mask/__init__.py deleted file mode 100644 index 1bee9d0..0000000 --- a/models_with_one_mask/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .vgg import * \ No newline at end of file diff --git a/models_with_one_mask/layers.py b/models_with_one_mask/layers.py deleted file mode 100644 index d686dec..0000000 --- a/models_with_one_mask/layers.py +++ /dev/null @@ -1,237 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.modules.utils 
import _pair -from torch.nn.parameter import Parameter -import pdb -from pprint import pprint - -DEFAULT_THRESHOLD = 5e-3 - -class Binarizer(torch.autograd.Function): - """Binarizes {0, 1} a real valued tensor.""" - - @staticmethod - def forward(ctx, inputs, threshold): - outputs = inputs.clone() - outputs[inputs.le(threshold)] = 0 - outputs[inputs.gt(threshold)] = 1 - return outputs - - @staticmethod - def backward(ctx, grad_out): - return grad_out, None - -class Ternarizer(torch.autograd.Function): - """Ternarizes {-1, 0, 1} a real valued tensor.""" - - def __init__(self, threshold=DEFAULT_THRESHOLD): - super(Ternarizer, self).__init__() - self.threshold = threshold - - def forward(self, inputs): - outputs = inputs.clone() - outputs.fill_(0) - outputs[inputs < 0] = -1 - outputs[inputs > self.threshold] = 1 - return outputs - - def backward(self, gradOutput): - return gradOutput - - -class SharableConv2d(nn.Module): - """Modified conv with masks for weights.""" - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True, - mask_init='1s', mask_scale=1e-2, - threshold_fn='binarizer', threshold=None): - super(SharableConv2d, self).__init__() - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(dilation) - self.mask_scale = mask_scale - self.mask_init = mask_init - - if threshold is None: - threshold = DEFAULT_THRESHOLD - self.info = { - 'threshold_fn': threshold_fn, - 'threshold': threshold, - } - - if in_channels % groups != 0: - raise ValueError('in_channels must be divisible by groups') - if out_channels % groups != 0: - raise ValueError('out_channels must be divisible by groups') - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.dilation = dilation - self.transposed = False - self.output_padding = _pair(0) - self.groups = groups - - self.weight = Parameter(torch.Tensor( - out_channels, in_channels // groups, *kernel_size), requires_grad=True) - if bias: - self.bias = Parameter(torch.Tensor(out_channels), requires_grad=True) - else: - self.register_parameter('bias', None) - - self.packnet_mask = None - # Give real-valued mask weights per task to manage the shared part from previous tasks. - self.piggymask_float = None - self.piggymask_task_tag = None - self.inference_task_id = None - - # Initialize the thresholder. - if threshold_fn == 'binarizer': - # print('Calling binarizer with threshold:', threshold) - self.threshold_fn = Binarizer.apply - elif threshold_fn == 'ternarizer': - print('Calling ternarizer with threshold:', threshold) - self.threshold_fn = Ternarizer(threshold=threshold) - - def forward(self, input, layer_info=None, name=None): - - weight = torch.where(self.packnet_mask == self.inference_task_id, - torch.ones_like(self.packnet_mask, dtype=torch.float32), - torch.zeros_like(self.packnet_mask, dtype=torch.float32)) * self.weight - - if self.piggymask_float is not None: - # Get binarized/ternarized mask from real-valued mask. - mask_thresholded = self.threshold_fn(self.piggymask_float, self.info['threshold']) - # Mask weights with above mask. 
- mask_thresholded = torch.where(self.piggymask_task_tag == self.inference_task_id, - mask_thresholded, - torch.zeros_like(self.piggymask_task_tag, dtype=torch.float32)) - - # assert torch.all(torch.sqrt(weight * (mask_thresholded.data * self.weight.data)) < 1e-6) - weight += mask_thresholded * self.weight - - # Perform conv using modified weight. - return F.conv2d(input, weight, self.bias, self.stride, - self.padding, self.dilation, self.groups) - - def __repr__(self): - s = ('{name} ({in_channels}, {out_channels}, kernel_size={kernel_size}' - ', stride={stride}') - if self.padding != (0,) * len(self.padding): - s += ', padding={padding}' - if self.dilation != (1,) * len(self.dilation): - s += ', dilation={dilation}' - if self.output_padding != (0,) * len(self.output_padding): - s += ', output_padding={output_padding}' - if self.groups != 1: - s += ', groups={groups}' - if self.bias is None: - s += ', bias=False' - s += ')' - return s.format(name=self.__class__.__name__, **self.__dict__) - - def _apply(self, fn): - for module in self.children(): - module._apply(fn) - - for param in self._parameters.values(): - if param is not None: - # Variables stored in modules are graph leaves, and we don't - # want to create copy nodes, so we have to unpack the data. - param.data = fn(param.data) - if param._grad is not None: - param._grad.data = fn(param._grad.data) - - for key, buf in self._buffers.items(): - if buf is not None: - self._buffers[key] = fn(buf) - - self.weight.data = fn(self.weight.data) - if self.bias is not None and self.bias.data is not None: - self.bias.data = fn(self.bias.data) - -class SharableLinear(nn.Module): - """Modified linear layer.""" - - def __init__(self, in_features, out_features, bias=True, - mask_init='1s', mask_scale=1e-2, - threshold_fn='binarizer', threshold=None): - super(SharableLinear, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.threshold_fn = threshold_fn - self.mask_scale = mask_scale - self.mask_init = mask_init - - if threshold is None: - threshold = DEFAULT_THRESHOLD - self.info = { - 'threshold_fn': threshold_fn, - 'threshold': threshold, - } - - # weight and bias are no longer Parameters. - self.weight = Parameter(torch.Tensor( - out_features, in_features), requires_grad=True) - if bias: - self.bias = Parameter(torch.Tensor( - out_features), requires_grad=True) - else: - self.register_parameter('bias', None) - - self.piggymask_float = None - self.piggymask_task_tag = None - self.inference_task_id = None - - # Initialize the thresholder. - if threshold_fn == 'binarizer': - self.threshold_fn = Binarizer.apply - elif threshold_fn == 'ternarizer': - self.threshold_fn = Ternarizer(threshold=threshold) - - def forward(self, input): - weight = torch.where(self.packnet_mask == self.inference_task_id, - torch.ones_like(self.packnet_mask, dtype=torch.float32), - torch.zeros_like(self.packnet_mask, dtype=torch.float32)) * self.weight - - if self.piggymask_float is not None: - # Get binarized/ternarized mask from real-valued mask. - mask_thresholded = self.threshold_fn(self.piggymask_float, self.info['threshold']) - # Mask weights with above mask. - mask_thresholded = torch.where(self.piggymask_task_tag == self.inference_task_id, - mask_thresholded, - torch.zeros_like(self.piggymask_task_tag, dtype=torch.float32)) - - # assert torch.all(torch.sqrt(weight * (mask_thresholded.data * self.weight.data)) < 1e-6) - weight += mask_thresholded * self.weight - - # Get output using modified weight. 
- return F.linear(input, weight, self.bias) - - def __repr__(self): - return self.__class__.__name__ + '(' \ - + 'in_features=' + str(self.in_features) \ - + ', out_features=' + str(self.out_features) + ')' - - def _apply(self, fn): - for module in self.children(): - module._apply(fn) - - for param in self._parameters.values(): - if param is not None: - # Variables stored in modules are graph leaves, and we don't - # want to create copy nodes, so we have to unpack the data. - param.data = fn(param.data) - if param._grad is not None: - param._grad.data = fn(param._grad.data) - - for key, buf in self._buffers.items(): - if buf is not None: - self._buffers[key] = fn(buf) - - self.weight.data = fn(self.weight.data) - self.bias.data = fn(self.bias.data) diff --git a/models_with_one_mask/vgg.py b/models_with_one_mask/vgg.py deleted file mode 100644 index f24546d..0000000 --- a/models_with_one_mask/vgg.py +++ /dev/null @@ -1,282 +0,0 @@ -import torch.nn as nn -import torch.utils.model_zoo as model_zoo -import models_with_one_mask.layers as nl -import pdb - -__all__ = [ - 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19_bn', 'vgg19', 'custom_vgg', 'custom_vgg_cifar100' -] - - -model_urls = { - 'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', - 'vgg13': 'https://download.pytorch.org/models/vgg13-c768596a.pth', - 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', - 'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', - 'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', - 'vgg13_bn': 'https://download.pytorch.org/models/vgg13_bn-abd245e5.pth', - 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', - 'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth', -} - -class View(nn.Module): - """Changes view using a nn.Module.""" - - def __init__(self, *shape): - super(View, self).__init__() - self.shape = shape - - def forward(self, input): - return input.view(*self.shape) - -class VGG(nn.Module): - def __init__(self, features, dataset_history, dataset2num_classes, network_width_multiplier=1.0, shared_layer_info={}, init_weights=True, progressive_init=False): - super(VGG, self).__init__() - self.features = features - self.network_width_multiplier = network_width_multiplier - self.shared_layer_info = shared_layer_info - # self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) - - self.datasets, self.classifiers = dataset_history, nn.ModuleList() - self.dataset2num_classes = dataset2num_classes - - if self.datasets: - self._reconstruct_classifiers() - - if init_weights: - self._initialize_weights() - - if progressive_init: - self._initialize_weights_2() - - def forward(self, x): - x = self.features(x) - # x = self.avgpool(x) - x = self.classifier(x) - return x - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nl.SharableConv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nl.SharableLinear): - nn.init.normal_(m.weight, 0, 0.01) - nn.init.constant_(m.bias, 0) - - def _initialize_weights_2(self): - for m in self.modules(): - if isinstance(m, nl.SharableConv2d): - nn.init.normal_(m.weight, 0, 0.01) - - def _reconstruct_classifiers(self): - for dataset, num_classes in self.dataset2num_classes.items(): - 
self.classifiers.append(nn.Linear(int(self.shared_layer_info[dataset]['network_width_multiplier'] * 4096), num_classes)) - - def add_dataset(self, dataset, num_classes): - """Adds a new dataset to the classifier.""" - if dataset not in self.datasets: - self.datasets.append(dataset) - self.dataset2num_classes[dataset] = num_classes - self.classifiers.append(nn.Linear(int(4096*self.network_width_multiplier), num_classes)) - nn.init.normal_(self.classifiers[self.datasets.index(dataset)].weight, 0, 0.01) - nn.init.constant_(self.classifiers[self.datasets.index(dataset)].bias, 0) - - def set_dataset(self, dataset): - """Change the active classifier.""" - assert dataset in self.datasets - self.classifier = self.classifiers[self.datasets.index(dataset)] - -def make_layers_cifar100(cfg, network_width_multiplier, batch_norm=False, groups=1): - layers = [] - in_channels = 3 - - for v in cfg: - if v == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - if in_channels == 3: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False) - else: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False, groups=groups) - - if batch_norm: - layers += [conv2d, nn.BatchNorm2d(int(v * network_width_multiplier)), nn.ReLU(inplace=True)] - else: - layers += [conv2d, nn.ReLU(inplace=True)] - in_channels = int(v * network_width_multiplier) - - layers += [ - View(-1, int(512*network_width_multiplier)), - nl.SharableLinear(int(512*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - nl.SharableLinear(int(4096*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - ] - - return nn.Sequential(*layers) - -def make_layers(cfg, network_width_multiplier, batch_norm=False, groups=1): - layers = [] - in_channels = 3 - - for v in cfg: - if v == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - if in_channels == 3: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False) - else: - conv2d = nl.SharableConv2d(in_channels, int(v * network_width_multiplier), kernel_size=3, padding=1, bias=False, groups=groups) - - if batch_norm: - layers += [conv2d, nn.BatchNorm2d(int(v * network_width_multiplier)), nn.ReLU(inplace=True)] - else: - layers += [conv2d, nn.ReLU(inplace=True)] - in_channels = int(v * network_width_multiplier) - - layers += [ - View(-1, int(512*network_width_multiplier)*7*7), - nl.SharableLinear(int(512*network_width_multiplier)*7*7, int(4096*network_width_multiplier)), - nn.ReLU(True), - # We need Dropout() for 224x224 - nn.Dropout(), - nl.SharableLinear(int(4096*network_width_multiplier), int(4096*network_width_multiplier)), - nn.ReLU(True), - nn.Dropout() - ] - - return nn.Sequential(*layers) - -cfg = { - 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -def vgg11(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 11-layer model (configuration "A") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['A']), 
dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg11'])) - return model - - -def vgg11_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 11-layer model (configuration "A") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['A'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg11_bn'])) - return model - - -def vgg13(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 13-layer model (configuration "B") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['B']), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg13'])) - return model - - -def vgg13_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 13-layer model (configuration "B") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['B'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg13_bn'])) - return model - - -def vgg16(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 16-layer model (configuration "D") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['D']), dataset_history, dataset2num_classes, **kwargs) - - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg16'])) - return model - - -def vgg16_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 16-layer model (configuration "D") with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['D'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg16_bn'])) - return model - - -def vgg19(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 19-layer model (configuration "E") - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['E']), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg19'])) - return model - - -def vgg19_bn(pretrained=False, dataset_history=[], dataset2num_classes={}, **kwargs): - """VGG 19-layer model (configuration 'E') with batch normalization - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if pretrained: - kwargs['init_weights'] = False - model = VGG(make_layers(cfg['E'], batch_norm=True), dataset_history, dataset2num_classes, **kwargs) - if pretrained: - model.load_state_dict(model_zoo.load_url(model_urls['vgg19_bn'])) - return model - -def custom_vgg_cifar100(custom_cfg, dataset_history=[], 
dataset2num_classes={}, network_width_multiplier=1.0, groups=1, shared_layer_info={}, **kwargs): - return VGG(make_layers_cifar100(custom_cfg, network_width_multiplier, batch_norm=True, groups=groups), dataset_history, - dataset2num_classes, network_width_multiplier, shared_layer_info, **kwargs) - -def custom_vgg(custom_cfg, dataset_history=[], dataset2num_classes={}, network_width_multiplier=1.0, groups=1, shared_layer_info={}, **kwargs): - return VGG(make_layers(custom_cfg, network_width_multiplier, batch_norm=True, groups=groups), dataset_history, - dataset2num_classes, network_width_multiplier, shared_layer_info, **kwargs) \ No newline at end of file diff --git a/packnet_cifar100_main.py b/packnet_face_main.py similarity index 61% rename from packnet_cifar100_main.py rename to packnet_face_main.py index 57d290b..a385f79 100644 --- a/packnet_cifar100_main.py +++ b/packnet_face_main.py @@ -10,8 +10,9 @@ import torch.optim as optim import torch.backends.cudnn as cudnn from torch.nn.parameter import Parameter +import torchvision.transforms as transforms -import UTILS.utils as utils +import FACE_UTILS as utils import pdb import os import math @@ -21,41 +22,36 @@ from pprint import pprint import packnet_models -from UTILS.packnet_manager import Manager -import UTILS.dataset as dataset +from FACE_UTILS.packnet_manager import Manager +import FACE_UTILS.dataset as dataset import torch.utils.model_zoo as model_zoo -import logging - -model_urls = { - 'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth', - 'vgg13': 'https://download.pytorch.org/models/vgg13-c768596a.pth', - 'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth', - 'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth', - 'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth', - 'vgg13_bn': 'https://download.pytorch.org/models/vgg13_bn-abd245e5.pth', - 'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth', - 'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth', -} +from FACE_UTILS.LFWDataset import LFWDataset + + +#{{{ Arguments +INIT_WEIGHT_PATH = 'common_data/face_weight.pth' # To prevent PIL warnings. warnings.filterwarnings("ignore") parser = argparse.ArgumentParser() -parser.add_argument('--arch', type=str, default='vgg16_bn_cifar100', +parser.add_argument('--arch', type=str, default='vgg16_bn', help='Architectures') parser.add_argument('--num_classes', type=int, default=-1, help='Num outputs for dataset') + # Optimization options. parser.add_argument('--lr', type=float, default=0.1, help='Learning rate for parameters, used for baselines') parser.add_argument('--batch_size', type=int, default=32, help='input batch size for training') -parser.add_argument('--val_batch_size', type=int, default=100, +parser.add_argument('--val_batch_size', type=int, default=32, help='input batch size for validation') parser.add_argument('--workers', type=int, default=24, help='') parser.add_argument('--weight_decay', type=float, default=4e-5, help='Weight decay') + # Paths. parser.add_argument('--dataset', type=str, default='', help='Name of dataset') @@ -63,15 +59,16 @@ help='Location of train data') parser.add_argument('--val_path', type=str, default='', help='Location of test data') + # Other. 
parser.add_argument('--cuda', action='store_true', default=True, help='use CUDA') parser.add_argument('--seed', type=int, default=1, help='random seed') -parser.add_argument('--checkpoint_format', type=str, - default='./{save_folder}/checkpoint-{epoch}.pth.tar', - help='checkpoint file format') +parser.add_argument('--checkpoint_format', type=str, + default='./{save_folder}/checkpoint-{epoch}.pth.tar', + help='checkpoint file format') parser.add_argument('--epochs', type=int, default=160, help='number of epochs to train') @@ -79,19 +76,19 @@ parser.add_argument('--save_folder', type=str, help='folder name inside one_check folder') parser.add_argument('--load_folder', default='', help='') - -# parser.add_argument('--datadir', default='/home/ivclab/decathlon-1.0/', -# help='folder containing data folder') -# parser.add_argument('--imdbdir', default='/home/ivclab/decathlon-1.0/annotations', -# help='annotation folder') - parser.add_argument('--one_shot_prune_perc', type=float, default=0.5, help='% of neurons to prune per layer') parser.add_argument('--mode', choices=['finetune', 'prune', 'inference'], help='Run mode') parser.add_argument('--logfile', type=str, help='file to save baseline accuracy') -parser.add_argument('--initial_from_task', type=str, help='') +parser.add_argument('--jsonfile', type=str, help='file to restore baseline validation accuracy') +parser.add_argument('--use_vgg_pretrained', action='store_true', default=False, + help='') +#}}} + + +#{{{ Multiple optimizers class Optimizers(object): def __init__(self): self.optimizers = [] @@ -115,11 +112,13 @@ def __getitem__(self, index): def __setitem__(self, index, value): self.optimizers[index] = value +#}}} + def main(): """Do stuff.""" + #{{{ Setting arguments, resume epochs and datasets args = parser.parse_args() - # args.batch_size = args.batch_size * torch.cuda.device_count() if args.save_folder and not os.path.isdir(args.save_folder): os.makedirs(args.save_folder) @@ -147,7 +146,7 @@ def main(): # Set default train and test path if not provided as input. utils.set_dataset_paths(args) - + if resume_from_epoch: filepath = args.checkpoint_format.format(save_folder=resume_folder, epoch=resume_from_epoch) checkpoint = torch.load(filepath) @@ -159,21 +158,19 @@ def main(): shared_layer_info = checkpoint['shared_layer_info'] else: shared_layer_info = {} - - if 'num_for_construct' in checkpoint_keys: - num_for_construct = checkpoint['num_for_construct'] else: dataset_history = [] dataset2num_classes = {} masks = {} shared_layer_info = {} - if args.arch == 'vgg16_bn_cifar100': - model = packnet_models.__dict__[args.arch](pretrained=False, dataset_history=dataset_history, dataset2num_classes=dataset2num_classes) + if args.arch == 'spherenet20': + model = packnet_models.__dict__[args.arch](dataset_history=dataset_history, dataset2num_classes=dataset2num_classes, + shared_layer_info=shared_layer_info) else: print('Error!') sys.exit(0) - + # Add and set the model dataset. 
model.add_dataset(args.dataset, args.num_classes) model.set_dataset(args.dataset) @@ -185,34 +182,33 @@ 'bn_layer_running_var': {}, 'bn_layer_weight': {}, 'bn_layer_bias': {}, - 'fc_bias': {} + 'fc_bias': {}, + 'prelu_layer_weight': {} } - model = nn.DataParallel(model) - model = model.cuda() - if args.initial_from_task and 'None' not in args.initial_from_task: - filepath = '' - for try_epoch in range(200, 0, -1): - if os.path.exists(args.checkpoint_format.format( - save_folder=args.initial_from_task, epoch=try_epoch)): - filepath = args.checkpoint_format.format(save_folder=args.initial_from_task, epoch=try_epoch) - break - if filepath == '': - pdb.set_trace() - print('Something is wrong') - checkpoint = torch.load(filepath) - state_dict = checkpoint['model_state_dict'] - curr_model_state_dict = model.module.state_dict() - - for name, param in state_dict.items(): - if 'num_batches_tracked' in name: - continue - try: - curr_model_state_dict[name][:].copy_(param) - except: - pdb.set_trace() - print('here') - + if args.cuda: + # Move model to GPU + model = nn.DataParallel(model) + model = model.cuda() + #}}} + + if args.use_vgg_pretrained and model.module.datasets.index(args.dataset) == 0: + print('Initializing from the pretrained vgg face weights') + curr_model_state_dict = model.state_dict() + state_dict = torch.load(INIT_WEIGHT_PATH) + if args.arch == 'spherenet20': + for name, param in state_dict.items(): + if 'fc' not in name: + curr_model_state_dict['module.' + name].copy_(param) + if args.dataset == 'face_verification': + # The face_verification head is a Sequential(Linear, AngleLinear), + # so its parameters live at classifiers.0.0 and classifiers.0.1. + curr_model_state_dict['module.classifiers.0.0.weight'].copy_(state_dict['fc5.weight']) + curr_model_state_dict['module.classifiers.0.0.bias'].copy_(state_dict['fc5.bias']) + curr_model_state_dict['module.classifiers.0.1.weight'].copy_(state_dict['fc6.weight']) + else: + print("No mapping between the vgg pretrained weights and our model is defined for {}".format(args.arch)) + sys.exit(5) + + #{{{ Initializing mask if not masks: for name, module in model.named_modules(): if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): @@ -222,56 +218,66 @@ if 'cuda' in module.weight.data.type(): mask = mask.cuda() masks[name] = mask - - if args.num_classes == 2: - train_loader = dataset.cifar100_train_loader_two_class(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader_two_class(args.dataset, args.val_batch_size) - elif args.num_classes == 5: - train_loader = dataset.cifar100_train_loader(args.dataset, args.batch_size) - val_loader = dataset.cifar100_val_loader(args.dataset, args.val_batch_size) + #}}} + + #{{{ Data loader + train_loader = dataset.train_loader(args.train_path, args.batch_size) + if args.dataset == 'face_verification': + kwargs = {'num_workers': 2, 'pin_memory': True} if torch.cuda.is_available() else {} + val_loader = torch.utils.data.DataLoader( + LFWDataset(dir=args.val_path, pairs_path='lfw_pairs.txt', + transform=transforms.Compose([ + transforms.Resize(112), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5])])), + batch_size=args.val_batch_size, shuffle=False, **kwargs) else: - print("num_classes should be either 2 or 5") - sys.exit(1) - + val_loader = dataset.val_loader(args.val_path, args.val_batch_size) + #}}} + # if we are going to save checkpoint in other folder, then we recalculate the starting epoch if args.save_folder != args.load_folder: start_epoch = 0 else: start_epoch = resume_from_epoch - + manager = Manager(args, model, shared_layer_info, masks, train_loader,
val_loader) - + if args.mode == 'inference': manager.load_checkpoint_for_inference(resume_from_epoch, resume_folder) manager.validate(resume_from_epoch-1) return + #{{{ Setting optimizers lr = args.lr # update all layers named_params = dict(model.named_parameters()) params_to_optimize_via_SGD = [] - named_params_to_optimize_via_SGD = [] - masks_to_optimize_via_SGD = [] - named_masks_to_optimize_via_SGD = [] - - for tuple_ in named_params.items(): - if 'classifiers' in tuple_[0]: - if '.{}.'.format(model.module.datasets.index(args.dataset)) in tuple_[0]: - params_to_optimize_via_SGD.append(tuple_[1]) - named_params_to_optimize_via_SGD.append(tuple_) + named_of_params_to_optimize_via_SGD = [] + + for name, param in named_params.items(): + if 'classifiers' in name: + if '.{}.'.format(model.module.datasets.index(args.dataset)) in name: + params_to_optimize_via_SGD.append(param) + named_of_params_to_optimize_via_SGD.append(name) continue else: - params_to_optimize_via_SGD.append(tuple_[1]) - named_params_to_optimize_via_SGD.append(tuple_) + params_to_optimize_via_SGD.append(param) + named_of_params_to_optimize_via_SGD.append(name) - # here we must set weight decay to 0.0, - # because the weight decay strategy in build-in step() function will change every weight elem in the tensor, - # which will hurt previous tasks' accuracy. (Instead, we do weight decay ourself in the `prune.py`) + # Here we must set weight decay to 0.0, + # because the weight decay strategy in the built-in step() function would change every weight elem in the tensor, + # which would hurt previous tasks' accuracy. (Instead, we do weight decay ourselves in `prune.py`.) + # TODO: try different optimizers optimizer_network = optim.SGD(params_to_optimize_via_SGD, lr=lr, - weight_decay=0.0, momentum=0.9, nesterov=True) + weight_decay=0.0, momentum=0.9, nesterov=True) + # optimizer_network = optim.Adam(params_to_optimize_via_SGD, lr=lr, + # weight_decay=0.0) optimizers = Optimizers() optimizers.add(optimizer_network, lr) + #}}} manager.load_checkpoint(optimizers, resume_from_epoch, resume_folder) @@ -281,14 +287,17 @@ for param_group in optimizer.param_groups: curr_lrs.append(param_group['lr']) break - + if start_epoch != 0: curr_best_accuracy = manager.validate(start_epoch-1) elif args.mode == 'prune': print() print('Sparsity ratio: {}'.format(args.one_shot_prune_perc)) print('Before pruning: ') - baseline_acc = manager.validate(start_epoch-1) + with open(args.jsonfile, 'r') as jsonfile: + json_data = json.load(jsonfile) + baseline_acc = float(json_data[args.dataset]) + # baseline_acc = manager.validate(start_epoch-1) print('Execute one shot pruning ...') manager.one_shot_prune(args.one_shot_prune_perc) else: @@ -296,24 +305,44 @@ if args.mode == 'finetune': manager.pruner.make_finetuning_mask() + # Use the model pretrained on the face_verification task (no further finetuning required) + if args.dataset == 'face_verification': + print('Finetuning face_verification: using the pretrained weights directly') + avg_val_acc = manager.evalLFW(0) + manager.save_checkpoint(optimizers, 0, args.save_folder) + if args.logfile: + json_data = {} + if os.path.isfile(args.logfile): + with open(args.logfile) as json_file: + json_data = json.load(json_file) + + json_data[args.dataset] = '{:.4f}'.format(avg_val_acc) + + with open(args.logfile, 'w') as json_file: + json.dump(json_data, json_file) + return + history_best_val_acc = 0.0 + num_epochs_that_criterion_does_not_get_better = 0 + times_of_decaying_learning_rate = 0 + #{{{ Training Loop for epoch_idx in range(start_epoch, args.epochs): avg_train_acc = manager.train(optimizers, epoch_idx, curr_lrs) - avg_val_acc = manager.validate(epoch_idx) + if args.dataset == 'face_verification': + avg_val_acc
= manager.evalLFW(epoch_idx) + else: + avg_val_acc = manager.validate(epoch_idx) if args.mode == 'finetune': - if epoch_idx + 1 == 50 or epoch_idx + 1 == 80: - for param_group in optimizers[0].param_groups: - param_group['lr'] *= 0.1 - curr_lrs[0] = param_group['lr'] if avg_val_acc > history_best_val_acc: + num_epochs_that_criterion_does_not_get_better = 0 history_best_val_acc = avg_val_acc if args.save_folder is not None: paths = os.listdir(args.save_folder) if paths and '.pth.tar' in paths[0]: for checkpoint_file in paths: - os.remove(os.path.join(args.save_folder, checkpoint_file)) + os.remove(os.path.join(args.save_folder, checkpoint_file)) else: print('Something is wrong! Block the program with pdb') pdb.set_trace() @@ -330,6 +359,26 @@ with open(args.logfile, 'w') as json_file: json.dump(json_data, json_file) + else: + num_epochs_that_criterion_does_not_get_better += 1 + + if times_of_decaying_learning_rate >= 3: + print() + print("times_of_decaying_learning_rate reached {}, stop training".format( + times_of_decaying_learning_rate)) + + break + + if num_epochs_that_criterion_does_not_get_better >= 10: + times_of_decaying_learning_rate += 1 + for param_group in optimizers[0].param_groups: + param_group['lr'] *= 0.1 + curr_lrs[0] = param_group['lr'] + print() + print("continuously {} epochs don't get higher acc, " + "decay learning rate by multiplying 0.1".format( + num_epochs_that_criterion_does_not_get_better)) + num_epochs_that_criterion_does_not_get_better = 0 if args.mode == 'prune': if epoch_idx + 1 == 40: @@ -347,6 +396,7 @@ print('Cannot prune any more!') print('-' * 16) + #}}} if __name__ == '__main__': main() diff --git a/packnet_models/__init__.py b/packnet_models/__init__.py index 44236ed..b7814b3 100644 --- a/packnet_models/__init__.py +++ b/packnet_models/__init__.py @@ -1,2 +1,3 @@ from .vgg import * -from .resnet import * \ No newline at end of file +from .resnet import * +from .spherenet import * diff --git a/packnet_models/spherenet.py b/packnet_models/spherenet.py new file mode 100644 index 0000000..668553e --- /dev/null +++ b/packnet_models/spherenet.py @@ -0,0 +1,241 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +import pdb +from torch.nn.parameter import Parameter +from torch.autograd import Variable + +__all__ = ['spherenet20', 'AngleLoss'] + + +class View(nn.Module): + """Changes view using a nn.Module.""" + + def __init__(self, *shape): + super(View, self).__init__() + self.shape = shape + + def forward(self, input): + return input.view(*self.shape) + + +class AngleLoss(nn.Module): + def __init__(self, gamma=0): + super(AngleLoss, self).__init__() + self.gamma = gamma + self.it = 0 + self.LambdaMin = 5.0 + self.LambdaMax = 1500.0 + self.lamb = 1500.0 + + def forward(self, input, target): + self.it += 1 + cos_theta,phi_theta = input + target = target.view(-1,1) + + index = cos_theta.data * 0.0 + index.scatter_(1,target.data.view(-1,1),1) + index = index.byte() + index = Variable(index) + + # Anneal from the plain softmax logit toward the large-margin logit: + # the target logit becomes (lamb * cos_theta + phi_theta) / (1 + lamb). + self.lamb = max(self.LambdaMin,self.LambdaMax/(1+0.1*self.it)) + output = cos_theta * 1.0 + output[index] -= cos_theta[index]*(1.0+0)/(1+self.lamb) + output[index] += phi_theta[index]*(1.0+0)/(1+self.lamb) + + logpt = F.log_softmax(output,dim=1) + logpt = logpt.gather(1,target) + logpt = logpt.view(-1) + pt = Variable(logpt.data.exp()) + + loss = -1 * (1-pt)**self.gamma * logpt + loss = loss.mean() + return loss + + +class AngleLinear(nn.Module): + def __init__(self, in_features, out_features, m=4):
super(AngleLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter(torch.Tensor(in_features,out_features)) + self.weight.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) + self.m = m + self.mlambda = [ + lambda x: x**0, # cos(0*theta)=1 + lambda x: x**1, # cos(1*theta)=cos(theta) + lambda x: 2*x**2-1, # cos(2*theta)=2*cos(theta)**2-1 + lambda x: 4*x**3-3*x, # cos(3*theta)=4*cos(theta)**3-3*cos(theta) + lambda x: 8*x**4-8*x**2+1, # cos(4*theta) + lambda x: 16*x**5-20*x**3+5*x # cos(5*theta) + ] + + def forward(self, input): + x = input + w = self.weight + ww = w.renorm(2,1,1e-5).mul(1e5) + xlen = x.pow(2).sum(1).pow(0.5) + wlen = ww.pow(2).sum(0).pow(0.5) + cos_theta = x.mm(ww) + cos_theta = cos_theta / xlen.view(-1,1) / wlen.view(1,-1) + cos_theta = cos_theta.clamp(-1,1) + cos_m_theta = self.mlambda[self.m](cos_theta) + theta = Variable(cos_theta.data.acos()) + k = (self.m*theta/3.14159265).floor() + n_one = k*0.0 - 1 + # phi(theta) = (-1)**k * cos(m*theta) - 2*k unfolds cos(m*theta) into a + # function that decreases monotonically over theta in [0, pi]. + phi_theta = (n_one**k) * cos_m_theta - 2*k + cos_theta = cos_theta * xlen.view(-1,1) + phi_theta = phi_theta * xlen.view(-1,1) + output = (cos_theta,phi_theta) + return output + + +class SphereNet20(nn.Module): + def __init__(self, dataset_history, dataset2num_classes, shared_layer_info={}, init_weights=True): + super(SphereNet20, self).__init__() + self.make_feature_layers() + + self.shared_layer_info = shared_layer_info + self.datasets = dataset_history + self.classifiers = nn.ModuleList() + self.dataset2num_classes = dataset2num_classes + + if self.datasets: + self._reconstruct_classifiers() + + if init_weights: + self._initialize_weights() + return + + def _features(self, x): + # Shared trunk: four stride-2 stages, each followed by residual pairs. + x = self.relu1_1(self.conv1_1(x)) + x = x + self.relu1_3(self.conv1_3(self.relu1_2(self.conv1_2(x)))) + x = self.relu2_1(self.conv2_1(x)) + x = x + self.relu2_3(self.conv2_3(self.relu2_2(self.conv2_2(x)))) + x = x + self.relu2_5(self.conv2_5(self.relu2_4(self.conv2_4(x)))) + x = self.relu3_1(self.conv3_1(x)) + x = x + self.relu3_3(self.conv3_3(self.relu3_2(self.conv3_2(x)))) + x = x + self.relu3_5(self.conv3_5(self.relu3_4(self.conv3_4(x)))) + x = x + self.relu3_7(self.conv3_7(self.relu3_6(self.conv3_6(x)))) + x = x + self.relu3_9(self.conv3_9(self.relu3_8(self.conv3_8(x)))) + x = self.relu4_1(self.conv4_1(x)) + x = x + self.relu4_3(self.conv4_3(self.relu4_2(self.conv4_2(x)))) + return self.flatten(x) + + def forward(self, x): + return self.classifier(self._features(x)) + + def forward_to_embeddings(self, x): + # Only meaningful for the face_verification head, which is a + # Sequential(Linear, AngleLinear): index 0 is the embedding layer. + return self.classifier[0](self._features(x)) + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.PReLU): + nn.init.constant_(m.weight, 0.25) + return + + def _reconstruct_classifiers(self): + for dataset, num_classes in
self.dataset2num_classes.items(): + if 'face_verification' in dataset: + embedding_size = 512 + classifier_module = nn.Sequential(nn.Linear(512*7*7, embedding_size), + AngleLinear(embedding_size, num_classes)) + self.classifiers.append(classifier_module) + else: + self.classifiers.append(nn.Linear(512*7*7, num_classes)) + return + + def add_dataset(self, dataset, num_classes): + """Adds a new dataset to the classifier.""" + if dataset not in self.datasets: + self.datasets.append(dataset) + self.dataset2num_classes[dataset] = num_classes + if 'face_verification' in dataset: + embedding_size = 512 + classifier_module = nn.Sequential(nn.Linear(512*7*7, embedding_size), + AngleLinear(embedding_size, num_classes)) + self.classifiers.append(classifier_module) + nn.init.normal_(classifier_module[0].weight, 0, 0.01) + nn.init.constant_(classifier_module[0].bias, 0) + nn.init.normal_(classifier_module[1].weight, 0, 0.01) + else: + self.classifiers.append(nn.Linear(512*7*7, num_classes)) + nn.init.normal_(self.classifiers[self.datasets.index(dataset)].weight, 0, 0.01) + nn.init.constant_(self.classifiers[self.datasets.index(dataset)].bias, 0) + + def set_dataset(self, dataset): + """Change the active classifier.""" + assert dataset in self.datasets + self.classifier = self.classifiers[self.datasets.index(dataset)] + + def make_feature_layers(self): + self.conv1_1 = nn.Conv2d(3,64,3,2,1) #=>B*64*56*56 + self.relu1_1 = nn.PReLU(64) + self.conv1_2 = nn.Conv2d(64,64,3,1,1) + self.relu1_2 = nn.PReLU(64) + self.conv1_3 = nn.Conv2d(64,64,3,1,1) + self.relu1_3 = nn.PReLU(64) + + self.conv2_1 = nn.Conv2d(64,128,3,2,1) #=>B*128*28*28 + self.relu2_1 = nn.PReLU(128) + self.conv2_2 = nn.Conv2d(128,128,3,1,1) + self.relu2_2 = nn.PReLU(128) + self.conv2_3 = nn.Conv2d(128,128,3,1,1) + self.relu2_3 = nn.PReLU(128) + + self.conv2_4 = nn.Conv2d(128,128,3,1,1) #=>B*128*28*28 + self.relu2_4 = nn.PReLU(128) + self.conv2_5 = nn.Conv2d(128,128,3,1,1) + self.relu2_5 = nn.PReLU(128) + + + self.conv3_1 = nn.Conv2d(128,256,3,2,1) #=>B*256*14*14 + self.relu3_1 = nn.PReLU(256) + self.conv3_2 = nn.Conv2d(256,256,3,1,1) + self.relu3_2 = nn.PReLU(256) + self.conv3_3 = nn.Conv2d(256,256,3,1,1) + self.relu3_3 = nn.PReLU(256) + + self.conv3_4 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14 + self.relu3_4 = nn.PReLU(256) + self.conv3_5 = nn.Conv2d(256,256,3,1,1) + self.relu3_5 = nn.PReLU(256) + + self.conv3_6 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14 + self.relu3_6 = nn.PReLU(256) + self.conv3_7 = nn.Conv2d(256,256,3,1,1) + self.relu3_7 = nn.PReLU(256) + + self.conv3_8 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*14 + self.relu3_8 = nn.PReLU(256) + self.conv3_9 = nn.Conv2d(256,256,3,1,1) + self.relu3_9 = nn.PReLU(256) + self.conv4_1 = nn.Conv2d(256,512,3,2,1) #=>B*512*7*7 + self.relu4_1 = nn.PReLU(512) + self.conv4_2 = nn.Conv2d(512,512,3,1,1) + self.relu4_2 = nn.PReLU(512) + self.conv4_3 = nn.Conv2d(512,512,3,1,1) + self.relu4_3 = nn.PReLU(512) + self.flatten = View(-1, 512*7*7) + return + + +def spherenet20(dataset_history=[], dataset2num_classes={}, shared_layer_info={}, **kwargs): + return SphereNet20(dataset_history, dataset2num_classes, shared_layer_info, **kwargs)
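A note on the Binarizer behind the Sharable layers in this patch: the forward pass hard-thresholds the real-valued piggymask to {0, 1}, while the backward pass hands the incoming gradient through unchanged (a straight-through estimator), which is what lets SGD keep training the underlying real-valued mask. A minimal self-contained sketch of the same pattern:

    import torch

    class Binarizer(torch.autograd.Function):
        @staticmethod
        def forward(ctx, inputs, threshold):
            # Hard {0, 1} threshold of the real-valued mask.
            outputs = inputs.clone()
            outputs[inputs.le(threshold)] = 0
            outputs[inputs.gt(threshold)] = 1
            return outputs

        @staticmethod
        def backward(ctx, grad_out):
            # Straight-through estimator: treat the threshold as identity.
            # The threshold argument itself gets no gradient.
            return grad_out, None

    mask_real = torch.full((3,), 6e-3, requires_grad=True)
    mask_bin = Binarizer.apply(mask_real, 5e-3)  # tensor([1., 1., 1.])
    mask_bin.sum().backward()
    print(mask_real.grad)                        # tensor([1., 1., 1.])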