From d902b523104ec9a829762fd516566ef3d43d1c5b Mon Sep 17 00:00:00 2001 From: Pablo Monteagudo Lago Date: Fri, 14 Feb 2025 17:20:50 +0000 Subject: [PATCH 1/5] Prevent tensors in different devices/dtype in CaileySGD optim --- src/brevitas/optim/cailey_sgd.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py index 2e2426fee..d68329aae 100644 --- a/src/brevitas/optim/cailey_sgd.py +++ b/src/brevitas/optim/cailey_sgd.py @@ -150,9 +150,7 @@ def step(self, closure=None): param_state = self.state[p] if "momentum_buffer" not in param_state: - param_state["momentum_buffer"] = torch.zeros(g.t().size()) - if p.is_cuda: - param_state["momentum_buffer"] = param_state["momentum_buffer"].cuda() + param_state["momentum_buffer"] = torch.zeros_like(g.t()) V = param_state["momentum_buffer"] V = momentum * V - g.t() From 1dce3a99976167d9569b46b477d9135bf8eeb4fa Mon Sep 17 00:00:00 2001 From: Pablo Monteagudo Lago Date: Fri, 14 Feb 2025 18:06:33 +0000 Subject: [PATCH 2/5] Add test for float16 --- src/brevitas/optim/cailey_sgd.py | 5 ++++- tests/brevitas/optim/test_cailey_sgd.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py index d68329aae..d676373c1 100644 --- a/src/brevitas/optim/cailey_sgd.py +++ b/src/brevitas/optim/cailey_sgd.py @@ -43,7 +43,10 @@ def Cayley_loop(X, W, tan_vec, t): # def qr_retraction(tan_vec): # tan_vec, p-by-n, p <= n [p, n] = tan_vec.size() tan_vec.t_() - q, r = torch.linalg.qr(tan_vec) + dtype = tan_vec.dtype + # torch.linalg.qr is not implemented for 'Half' + q, r = torch.linalg.qr(tan_vec.to(torch.float32)) + q, r = q.to(dtype=dtype), r.to(dtype=dtype) d = torch.diag(r, 0) ph = d.sign() q *= ph.expand_as(q) diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py index 92de8ae5a..87bda873a 100644 --- a/tests/brevitas/optim/test_cailey_sgd.py +++ b/tests/brevitas/optim/test_cailey_sgd.py @@ -72,7 +72,7 @@ (LinearLR, { "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),] DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"] -DTYPES = [torch.float32] +DTYPES = [torch.float32, torch.float16] device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES))) @@ -83,6 +83,7 @@ class TestCaileySGD: @pytest_cases.parametrize("optimizer_kwargs", OPTIMIZER_KWARGS) @pytest_cases.parametrize("lr_scheduler_args", LR_SCHEDULER_ARGS) def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args): + torch.manual_seed(SEED) optim_cls = CaileySGD # Generate a random orthogonal matrix of size NxN. Columns represent orthonormal vector in R^{N} N = 5 @@ -108,6 +109,8 @@ def closure(): return loss initial_value = closure().item() + ATOL = 1e-5 if dtype == torch.float32 else 1e-2 + RTOL = 1e-6 if dtype == torch.float16 else 1e-3 for _ in range(20): closure() optimizer.step() @@ -115,11 +118,15 @@ def closure(): scheduler.step() # Verify that iterates stay within the Stiefel manifold + print( + weight.to(dtype=torch.float32).detach().cpu() + @ weight.to(dtype=torch.float32).detach().cpu().t()) assert torch.allclose( - weight.detach().cpu() @ weight.detach().cpu().t(), - torch.eye(P, P, device=device, dtype=dtype).detach().cpu(), - atol=1e-5, - rtol=1e-6) + weight.to(dtype=torch.float32).detach().cpu() + @ weight.to(dtype=torch.float32).detach().cpu().t(), + torch.eye(P, P, device=device, dtype=torch.float32).detach().cpu(), + atol=ATOL, + rtol=RTOL) if optimizer_kwargs.get("maximize", False): assert closure().item() > initial_value From 2322ba87632cb7e3c5547c537e06df48f1043317 Mon Sep 17 00:00:00 2001 From: Pablo Monteagudo Lago Date: Tue, 18 Feb 2025 06:45:10 +0000 Subject: [PATCH 3/5] Fix test --- tests/brevitas/optim/test_cailey_sgd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py index 87bda873a..0d29d933c 100644 --- a/tests/brevitas/optim/test_cailey_sgd.py +++ b/tests/brevitas/optim/test_cailey_sgd.py @@ -72,7 +72,7 @@ (LinearLR, { "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),] DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"] -DTYPES = [torch.float32, torch.float16] +DTYPES = [torch.float32, torch.float16, torch.bfloat16] device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES))) @@ -110,7 +110,7 @@ def closure(): initial_value = closure().item() ATOL = 1e-5 if dtype == torch.float32 else 1e-2 - RTOL = 1e-6 if dtype == torch.float16 else 1e-3 + RTOL = 1e-6 if dtype == torch.float32 else 1e-3 for _ in range(20): closure() optimizer.step() From feee23a506cf0aa282c26ace8614e14bf5c991b9 Mon Sep 17 00:00:00 2001 From: Pablo Monteagudo Lago Date: Tue, 18 Feb 2025 13:41:04 +0000 Subject: [PATCH 4/5] Fix tests --- src/brevitas/optim/cailey_sgd.py | 6 +++--- tests/brevitas/optim/test_cailey_sgd.py | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py index d676373c1..2a17cfa85 100644 --- a/src/brevitas/optim/cailey_sgd.py +++ b/src/brevitas/optim/cailey_sgd.py @@ -46,8 +46,8 @@ def qr_retraction(tan_vec): # tan_vec, p-by-n, p <= n dtype = tan_vec.dtype # torch.linalg.qr is not implemented for 'Half' q, r = torch.linalg.qr(tan_vec.to(torch.float32)) - q, r = q.to(dtype=dtype), r.to(dtype=dtype) - d = torch.diag(r, 0) + q = q.to(dtype=dtype) + d = torch.diag(r, 0).to(dtype=dtype) ph = d.sign() q *= ph.expand_as(q) q.t_() @@ -90,7 +90,7 @@ class CaileySGD(Optimizer): def __init__( self, params, - lr: float = 1e-3, + lr: float = 1e-1, momentum: int = 0, dampening: int = 0, weight_decay: int = 0, diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py index 0d29d933c..dc5dad3dd 100644 --- a/tests/brevitas/optim/test_cailey_sgd.py +++ b/tests/brevitas/optim/test_cailey_sgd.py @@ -65,14 +65,14 @@ OPTIMIZER_KWARGS = [{ "stiefel": True}, { - "stiefel": True, "lr": 1e-2}, { - "stiefel": True, "lr": torch.tensor(0.001)}] + "stiefel": True, "lr": 0.5}, { + "stiefel": True, "lr": torch.tensor(0.5)}] LR_SCHEDULER_ARGS = [ None, (LinearLR, { "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),] DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"] -DTYPES = [torch.float32, torch.float16, torch.bfloat16] +DTYPES = ["float32", "float16", "bfloat16"] device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES))) @@ -85,6 +85,7 @@ class TestCaileySGD: def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args): torch.manual_seed(SEED) optim_cls = CaileySGD + dtype = getattr(torch, dtype) # Generate a random orthogonal matrix of size NxN. Columns represent orthonormal vector in R^{N} N = 5 P = 3 @@ -109,8 +110,8 @@ def closure(): return loss initial_value = closure().item() - ATOL = 1e-5 if dtype == torch.float32 else 1e-2 - RTOL = 1e-6 if dtype == torch.float32 else 1e-3 + ATOL = 1e-2 if dtype == torch.float32 else 1e-1 + RTOL = 1e-3 if dtype == torch.float32 else 1e-2 for _ in range(20): closure() optimizer.step() From 5ccc035f7ee61fe50a03c0e56367a3c6d10c10f0 Mon Sep 17 00:00:00 2001 From: Pablo Monteagudo Lago Date: Thu, 20 Feb 2025 08:20:06 +0000 Subject: [PATCH 5/5] Skip test for bfloat16 and float16 --- src/brevitas/optim/cailey_sgd.py | 4 ++-- tests/brevitas/optim/test_cailey_sgd.py | 14 ++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py index 2a17cfa85..265fb2c75 100644 --- a/src/brevitas/optim/cailey_sgd.py +++ b/src/brevitas/optim/cailey_sgd.py @@ -46,8 +46,8 @@ def qr_retraction(tan_vec): # tan_vec, p-by-n, p <= n dtype = tan_vec.dtype # torch.linalg.qr is not implemented for 'Half' q, r = torch.linalg.qr(tan_vec.to(torch.float32)) - q = q.to(dtype=dtype) - d = torch.diag(r, 0).to(dtype=dtype) + q, r = q.to(dtype=dtype), r.to(dtype=dtype) + d = torch.diag(r, 0) ph = d.sign() q *= ph.expand_as(q) q.t_() diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py index dc5dad3dd..40b6a4675 100644 --- a/tests/brevitas/optim/test_cailey_sgd.py +++ b/tests/brevitas/optim/test_cailey_sgd.py @@ -42,22 +42,19 @@ from copy import deepcopy from itertools import product -import math -import sys -from typing import List, Union -import unittest from hypothesis import given import numpy as np +from packaging import version import pytest import pytest_cases from pytest_cases import fixture from scipy.stats import ortho_group import torch from torch.nn import Parameter -import torch.nn as nn from torch.optim.lr_scheduler import LinearLR +from brevitas import torch_version from brevitas.optim.cailey_sgd import CaileySGD from tests.conftest import SEED @@ -83,6 +80,10 @@ class TestCaileySGD: @pytest_cases.parametrize("optimizer_kwargs", OPTIMIZER_KWARGS) @pytest_cases.parametrize("lr_scheduler_args", LR_SCHEDULER_ARGS) def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args): + if torch_version < version.parse('2.3.1') and dtype in ["float16", "bfloat16"]: + pytest.skip( + "Some operations in the CaileySGD optimizer (e.g. diag, eye) are not implemented for 'Half' or 'BFloat16' in PyTorch versions under 2.3.1." + ) torch.manual_seed(SEED) optim_cls = CaileySGD dtype = getattr(torch, dtype) @@ -119,9 +120,6 @@ def closure(): scheduler.step() # Verify that iterates stay within the Stiefel manifold - print( - weight.to(dtype=torch.float32).detach().cpu() - @ weight.to(dtype=torch.float32).detach().cpu().t()) assert torch.allclose( weight.to(dtype=torch.float32).detach().cpu() @ weight.to(dtype=torch.float32).detach().cpu().t(),