diff --git a/references/classification/train.py b/references/classification/train.py
index d52124fcf33..943bd651986 100644
--- a/references/classification/train.py
+++ b/references/classification/train.py
@@ -26,7 +26,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, arg
     for i, (image, target) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
         start_time = time.time()
         image, target = image.to(device), target.to(device)
-        with torch.cuda.amp.autocast(enabled=scaler is not None):
+        with torch.amp.autocast("cuda", enabled=scaler is not None):
             output = model(image)
             loss = criterion(output, target)

diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py
index 7cb6413f1a5..28af2044c1a 100644
--- a/references/depth/stereo/cascade_evaluation.py
+++ b/references/depth/stereo/cascade_evaluation.py
@@ -139,7 +139,7 @@ def _evaluate(
         logger.add_meter("fl-all", fmt="{global_avg:.4f}")

     num_processed_samples = 0
-    with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
+    with torch.amp.autocast("cuda", enabled=args.mixed_precision, dtype=torch.float16):
         batch_idx = 0
         for blob in metric_logger.log_every(val_loader, print_freq, header):
             image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob)
diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py
index e3d572153b2..5cf57150a69 100644
--- a/references/depth/stereo/train.py
+++ b/references/depth/stereo/train.py
@@ -169,7 +169,7 @@ def _evaluate(
         logger.add_meter("fl-all", fmt="{global_avg:.4f}")

     num_processed_samples = 0
-    with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
+    with torch.amp.autocast("cuda", enabled=args.mixed_precision, dtype=torch.float16):
         for blob in metric_logger.log_every(val_loader, print_freq, header):
             image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob)
             padder = utils.InputPadder(image_left.shape, mode=padder_mode)
@@ -314,7 +314,7 @@ def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer,
         # unpack the data blob
         image_left, image_right, disp_mask, valid_disp_mask = (x.to(device) for x in data_blob)

-        with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
+        with torch.amp.autocast("cuda", enabled=args.mixed_precision, dtype=torch.float16):
             disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates)
             # different models have different outputs, make sure we get the right ones for this task
             disp_predictions = make_stereo_flow(disp_predictions, model_out_channels)
diff --git a/references/detection/engine.py b/references/detection/engine.py
index 0e9bfffdf8a..4e3a19b3aab 100644
--- a/references/detection/engine.py
+++ b/references/detection/engine.py
@@ -27,7 +27,7 @@ def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, sc
     for images, targets in metric_logger.log_every(data_loader, print_freq, header):
         images = list(image.to(device) for image in images)
         targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets]
-        with torch.cuda.amp.autocast(enabled=scaler is not None):
+        with torch.amp.autocast("cuda", enabled=scaler is not None):
             loss_dict = model(images, targets)
             losses = sum(loss for loss in loss_dict.values())

diff --git a/references/segmentation/train.py b/references/segmentation/train.py
index abdc3c6aacb..72591091e62 100644
--- a/references/segmentation/train.py
+++ b/references/segmentation/train.py
@@ -107,7 +107,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi
     header = f"Epoch: [{epoch}]"
     for image, target in metric_logger.log_every(data_loader, print_freq, header):
         image, target = image.to(device), target.to(device)
-        with torch.cuda.amp.autocast(enabled=scaler is not None):
+        with torch.amp.autocast("cuda", enabled=scaler is not None):
             output = model(image)
             loss = criterion(output, target)

diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index a03a9722003..36e5d0ad3c1 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -25,7 +25,7 @@ def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, devi
     for video, target, _ in metric_logger.log_every(data_loader, print_freq, header):
         start_time = time.time()
         video, target = video.to(device), target.to(device)
-        with torch.cuda.amp.autocast(enabled=scaler is not None):
+        with torch.amp.autocast("cuda", enabled=scaler is not None):
             output = model(video)
             loss = criterion(output, target)

diff --git a/test/test_models.py b/test/test_models.py
index 202bbdbd0cd..f774d34585b 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -606,7 +606,7 @@ def checkOut(out):

     checkOut(out)

-    with torch.cuda.amp.autocast():
+    with torch.amp.autocast("cuda"):
         out = model(model_input)

         checkOut(out)
@@ -705,7 +705,7 @@ def test_classification_model(model_fn, dev):
     _check_fx_compatible(model, x, eager_out=out)

     if dev == "cuda":
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             out = model(x)
             # See autocast_flaky_numerics comment at top of file.
             if model_name not in autocast_flaky_numerics:
@@ -761,7 +761,7 @@ def check_out(out):
     _check_fx_compatible(model, x, eager_out=out)

     if dev == "cuda":
-        with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state():
+        with torch.amp.autocast("cuda"), torch.no_grad(), freeze_rng_state():
             out = model(x)
             # See autocast_flaky_numerics comment at top of file.
             if model_name not in autocast_flaky_numerics:
@@ -864,7 +864,7 @@ def compute_mean_std(tensor):
     _check_jit_scriptable(model, ([x],), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out)

     if dev == "cuda":
-        with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state():
+        with torch.amp.autocast("cuda"), torch.no_grad(), freeze_rng_state():
             out = model(model_input)
             # See autocast_flaky_numerics comment at top of file.
             if model_name not in autocast_flaky_numerics:
@@ -941,7 +941,7 @@ def test_video_model(model_fn, dev):
     assert out.shape[-1] == num_classes

     if dev == "cuda":
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             out = model(x)
             # See autocast_flaky_numerics comment at top of file.
             if model_name not in autocast_flaky_numerics:
diff --git a/test/test_ops.py b/test/test_ops.py
index 1ba7a2c9efa..97f5237a00c 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -232,7 +232,7 @@ def func(z):
     @pytest.mark.parametrize("x_dtype", (torch.float, torch.half))
     @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half))
     def test_autocast(self, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             self.test_forward(torch.device("cuda"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)

     def _helper_boxes_shape(self, func):
@@ -497,7 +497,7 @@ def test_forward(self, device, contiguous, deterministic, aligned, x_dtype, rois
     @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half))
     @pytest.mark.opcheck_only_one()
     def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             self.test_forward(
                 torch.device("cuda"),
                 contiguous=False,
@@ -513,7 +513,7 @@ def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype):
     @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16))
     @pytest.mark.parametrize("rois_dtype", (torch.float, torch.bfloat16))
     def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype):
-        with torch.cpu.amp.autocast():
+        with torch.amp.autocast("cpu"):
             self.test_forward(
                 torch.device("cpu"),
                 contiguous=False,
@@ -856,14 +856,14 @@ def test_nms_gpu(self, iou, device, dtype=torch.float64):
     @pytest.mark.parametrize("dtype", (torch.float, torch.half))
     @pytest.mark.opcheck_only_one()
     def test_autocast(self, iou, dtype):
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda")

     @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
     @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16))
     def test_autocast_cpu(self, iou, dtype):
         boxes, scores = self._create_tensors_with_iou(1000, iou)
-        with torch.cpu.amp.autocast():
+        with torch.amp.autocast("cpu"):
             keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
             keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
             torch.testing.assert_close(keep_ref_float, keep_dtype)
@@ -1193,7 +1193,7 @@ def test_compare_cpu_cuda_grads(self, contiguous):
     @pytest.mark.parametrize("dtype", (torch.float, torch.half))
     @pytest.mark.opcheck_only_one()
     def test_autocast(self, batch_sz, dtype):
-        with torch.cuda.amp.autocast():
+        with torch.amp.autocast("cuda"):
             self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype)

     def test_forward_scriptability(self):
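
Note: every hunk above replaces the deprecated device-specific context managers (torch.cuda.amp.autocast / torch.cpu.amp.autocast) with the device-agnostic torch.amp.autocast(device_type, ...). Below is a minimal sketch, not part of the diff, of the mixed-precision training-step pattern the reference scripts follow after this change. The names train_step, model, criterion, optimizer, image, and target are placeholders; when mixed precision is enabled, a gradient scaler such as torch.amp.GradScaler("cuda") (torch.cuda.amp.GradScaler on older releases) is assumed to be passed in as scaler.

import torch

# Hypothetical helper illustrating the device-agnostic autocast idiom
# used in the reference scripts above; not taken from the diff itself.
def train_step(model, criterion, optimizer, image, target, scaler=None, device_type="cuda"):
    # enabled=scaler is not None mirrors the idiom in the reference scripts
    with torch.amp.autocast(device_type, enabled=scaler is not None):
        output = model(image)
        loss = criterion(output, target)

    optimizer.zero_grad()
    if scaler is not None:
        scaler.scale(loss).backward()  # backward pass on the scaled loss
        scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
        scaler.update()                # adjust the loss scale for the next iteration
    else:
        loss.backward()
        optimizer.step()
    return loss.detach()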