From d902b523104ec9a829762fd516566ef3d43d1c5b Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <pablo.monteagudolago@amd.com>
Date: Fri, 14 Feb 2025 17:20:50 +0000
Subject: [PATCH 1/5] Prevent device/dtype mismatch of momentum buffer in
 CaileySGD optim

---
 src/brevitas/optim/cailey_sgd.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py
index 2e2426fee..d68329aae 100644
--- a/src/brevitas/optim/cailey_sgd.py
+++ b/src/brevitas/optim/cailey_sgd.py
@@ -150,9 +150,7 @@ def step(self, closure=None):
 
                     param_state = self.state[p]
                     if "momentum_buffer" not in param_state:
-                        param_state["momentum_buffer"] = torch.zeros(g.t().size())
-                        if p.is_cuda:
-                            param_state["momentum_buffer"] = param_state["momentum_buffer"].cuda()
+                        param_state["momentum_buffer"] = torch.zeros_like(g.t())
 
                     V = param_state["momentum_buffer"]
                     V = momentum * V - g.t()

From 1dce3a99976167d9569b46b477d9135bf8eeb4fa Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <pablo.monteagudolago@amd.com>
Date: Fri, 14 Feb 2025 18:06:33 +0000
Subject: [PATCH 2/5] Add test for float16 and run QR retraction in float32

---
 src/brevitas/optim/cailey_sgd.py        |  5 ++++-
 tests/brevitas/optim/test_cailey_sgd.py | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py
index d68329aae..d676373c1 100644
--- a/src/brevitas/optim/cailey_sgd.py
+++ b/src/brevitas/optim/cailey_sgd.py
@@ -43,7 +43,10 @@ def Cayley_loop(X, W, tan_vec, t):  #
 def qr_retraction(tan_vec):  # tan_vec, p-by-n, p <= n
     [p, n] = tan_vec.size()
     tan_vec.t_()
-    q, r = torch.linalg.qr(tan_vec)
+    dtype = tan_vec.dtype
+    # torch.linalg.qr is not implemented for 'Half'
+    q, r = torch.linalg.qr(tan_vec.to(torch.float32))
+    q, r = q.to(dtype=dtype), r.to(dtype=dtype)
     d = torch.diag(r, 0)
     ph = d.sign()
     q *= ph.expand_as(q)
diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py
index 92de8ae5a..87bda873a 100644
--- a/tests/brevitas/optim/test_cailey_sgd.py
+++ b/tests/brevitas/optim/test_cailey_sgd.py
@@ -72,7 +72,7 @@
     (LinearLR, {
         "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),]
 DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-DTYPES = [torch.float32]
+DTYPES = [torch.float32, torch.float16]
 
 device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES)))
 
@@ -83,6 +83,7 @@ class TestCaileySGD:
     @pytest_cases.parametrize("optimizer_kwargs", OPTIMIZER_KWARGS)
     @pytest_cases.parametrize("lr_scheduler_args", LR_SCHEDULER_ARGS)
     def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args):
+        torch.manual_seed(SEED)
         optim_cls = CaileySGD
         # Generate a random orthogonal matrix of size NxN. Columns represent orthonormal vector in R^{N}
         N = 5
@@ -108,6 +109,8 @@ def closure():
             return loss
 
         initial_value = closure().item()
+        ATOL = 1e-5 if dtype == torch.float32 else 1e-2
+        RTOL = 1e-6 if dtype == torch.float16 else 1e-3
         for _ in range(20):
             closure()
             optimizer.step()
@@ -115,11 +118,15 @@ def closure():
                 scheduler.step()
 
             # Verify that iterates stay within the Stiefel manifold
+            print(
+                weight.to(dtype=torch.float32).detach().cpu()
+                @ weight.to(dtype=torch.float32).detach().cpu().t())
             assert torch.allclose(
-                weight.detach().cpu() @ weight.detach().cpu().t(),
-                torch.eye(P, P, device=device, dtype=dtype).detach().cpu(),
-                atol=1e-5,
-                rtol=1e-6)
+                weight.to(dtype=torch.float32).detach().cpu()
+                @ weight.to(dtype=torch.float32).detach().cpu().t(),
+                torch.eye(P, P, device=device, dtype=torch.float32).detach().cpu(),
+                atol=ATOL,
+                rtol=RTOL)
 
             if optimizer_kwargs.get("maximize", False):
                 assert closure().item() > initial_value

From 2322ba87632cb7e3c5547c537e06df48f1043317 Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <pablo.monteagudolago@amd.com>
Date: Tue, 18 Feb 2025 06:45:10 +0000
Subject: [PATCH 3/5] Fix RTOL dtype condition in test and add bfloat16 case

---
 tests/brevitas/optim/test_cailey_sgd.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py
index 87bda873a..0d29d933c 100644
--- a/tests/brevitas/optim/test_cailey_sgd.py
+++ b/tests/brevitas/optim/test_cailey_sgd.py
@@ -72,7 +72,7 @@
     (LinearLR, {
         "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),]
 DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-DTYPES = [torch.float32, torch.float16]
+DTYPES = [torch.float32, torch.float16, torch.bfloat16]
 
 device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES)))
 
@@ -110,7 +110,7 @@ def closure():
 
         initial_value = closure().item()
         ATOL = 1e-5 if dtype == torch.float32 else 1e-2
-        RTOL = 1e-6 if dtype == torch.float16 else 1e-3
+        RTOL = 1e-6 if dtype == torch.float32 else 1e-3
         for _ in range(20):
             closure()
             optimizer.step()

From feee23a506cf0aa282c26ace8614e14bf5c991b9 Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <pablo.monteagudolago@amd.com>
Date: Tue, 18 Feb 2025 13:41:04 +0000
Subject: [PATCH 4/5] Raise default lr to 1e-1 and relax test tolerances

---
 src/brevitas/optim/cailey_sgd.py        |  6 +++---
 tests/brevitas/optim/test_cailey_sgd.py | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py
index d676373c1..2a17cfa85 100644
--- a/src/brevitas/optim/cailey_sgd.py
+++ b/src/brevitas/optim/cailey_sgd.py
@@ -46,8 +46,8 @@ def qr_retraction(tan_vec):  # tan_vec, p-by-n, p <= n
     dtype = tan_vec.dtype
     # torch.linalg.qr is not implemented for 'Half'
     q, r = torch.linalg.qr(tan_vec.to(torch.float32))
-    q, r = q.to(dtype=dtype), r.to(dtype=dtype)
-    d = torch.diag(r, 0)
+    q = q.to(dtype=dtype)
+    d = torch.diag(r, 0).to(dtype=dtype)
     ph = d.sign()
     q *= ph.expand_as(q)
     q.t_()
@@ -90,7 +90,7 @@ class CaileySGD(Optimizer):
     def __init__(
         self,
         params,
-        lr: float = 1e-3,
+        lr: float = 1e-1,
         momentum: int = 0,
         dampening: int = 0,
         weight_decay: int = 0,
diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py
index 0d29d933c..dc5dad3dd 100644
--- a/tests/brevitas/optim/test_cailey_sgd.py
+++ b/tests/brevitas/optim/test_cailey_sgd.py
@@ -65,14 +65,14 @@
 
 OPTIMIZER_KWARGS = [{
     "stiefel": True}, {
-        "stiefel": True, "lr": 1e-2}, {
-            "stiefel": True, "lr": torch.tensor(0.001)}]
+        "stiefel": True, "lr": 0.5}, {
+            "stiefel": True, "lr": torch.tensor(0.5)}]
 LR_SCHEDULER_ARGS = [
     None,
     (LinearLR, {
         "start_factor": 1.0, "end_factor": 0.0, "total_iters": 20}),]
 DEVICES = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
-DTYPES = [torch.float32, torch.float16, torch.bfloat16]
+DTYPES = ["float32", "float16", "bfloat16"]
 
 device_dtype_parametrize = pytest_cases.parametrize("device, dtype", list(product(DEVICES, DTYPES)))
 
@@ -85,6 +85,7 @@ class TestCaileySGD:
     def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args):
         torch.manual_seed(SEED)
         optim_cls = CaileySGD
+        dtype = getattr(torch, dtype)
         # Generate a random orthogonal matrix of size NxN. Columns represent orthonormal vector in R^{N}
         N = 5
         P = 3
@@ -109,8 +110,8 @@ def closure():
             return loss
 
         initial_value = closure().item()
-        ATOL = 1e-5 if dtype == torch.float32 else 1e-2
-        RTOL = 1e-6 if dtype == torch.float32 else 1e-3
+        ATOL = 1e-2 if dtype == torch.float32 else 1e-1
+        RTOL = 1e-3 if dtype == torch.float32 else 1e-2
         for _ in range(20):
             closure()
             optimizer.step()

From 5ccc035f7ee61fe50a03c0e56367a3c6d10c10f0 Mon Sep 17 00:00:00 2001
From: Pablo Monteagudo Lago <pablo.monteagudolago@amd.com>
Date: Thu, 20 Feb 2025 08:20:06 +0000
Subject: [PATCH 5/5] Skip float16/bfloat16 test on PyTorch < 2.3.1

---
 src/brevitas/optim/cailey_sgd.py        |  4 ++--
 tests/brevitas/optim/test_cailey_sgd.py | 14 ++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/brevitas/optim/cailey_sgd.py b/src/brevitas/optim/cailey_sgd.py
index 2a17cfa85..265fb2c75 100644
--- a/src/brevitas/optim/cailey_sgd.py
+++ b/src/brevitas/optim/cailey_sgd.py
@@ -46,8 +46,8 @@ def qr_retraction(tan_vec):  # tan_vec, p-by-n, p <= n
     dtype = tan_vec.dtype
     # torch.linalg.qr is not implemented for 'Half'
     q, r = torch.linalg.qr(tan_vec.to(torch.float32))
-    q = q.to(dtype=dtype)
-    d = torch.diag(r, 0).to(dtype=dtype)
+    q, r = q.to(dtype=dtype), r.to(dtype=dtype)
+    d = torch.diag(r, 0)
     ph = d.sign()
     q *= ph.expand_as(q)
     q.t_()
diff --git a/tests/brevitas/optim/test_cailey_sgd.py b/tests/brevitas/optim/test_cailey_sgd.py
index dc5dad3dd..40b6a4675 100644
--- a/tests/brevitas/optim/test_cailey_sgd.py
+++ b/tests/brevitas/optim/test_cailey_sgd.py
@@ -42,22 +42,19 @@
 
 from copy import deepcopy
 from itertools import product
-import math
-import sys
-from typing import List, Union
-import unittest
 
 from hypothesis import given
 import numpy as np
+from packaging import version
 import pytest
 import pytest_cases
 from pytest_cases import fixture
 from scipy.stats import ortho_group
 import torch
 from torch.nn import Parameter
-import torch.nn as nn
 from torch.optim.lr_scheduler import LinearLR
 
+from brevitas import torch_version
 from brevitas.optim.cailey_sgd import CaileySGD
 from tests.conftest import SEED
 
@@ -83,6 +80,10 @@ class TestCaileySGD:
     @pytest_cases.parametrize("optimizer_kwargs", OPTIMIZER_KWARGS)
     @pytest_cases.parametrize("lr_scheduler_args", LR_SCHEDULER_ARGS)
     def test_forloop_goes_right_direction(self, device, dtype, optimizer_kwargs, lr_scheduler_args):
+        if torch_version < version.parse('2.3.1') and dtype in ["float16", "bfloat16"]:
+            pytest.skip(
+                "Some operations in the CaileySGD optimizer (e.g. diag, eye) are not implemented for 'Half' or 'BFloat16' in PyTorch versions under 2.3.1."
+            )
         torch.manual_seed(SEED)
         optim_cls = CaileySGD
         dtype = getattr(torch, dtype)
@@ -119,9 +120,6 @@ def closure():
                 scheduler.step()
 
             # Verify that iterates stay within the Stiefel manifold
-            print(
-                weight.to(dtype=torch.float32).detach().cpu()
-                @ weight.to(dtype=torch.float32).detach().cpu().t())
             assert torch.allclose(
                 weight.to(dtype=torch.float32).detach().cpu()
                 @ weight.to(dtype=torch.float32).detach().cpu().t(),