Skip to content

Commit 6d4c02a

Browse files
committed
move _DEVICE into the test function which is guarded by @unittest.skipIf(not torch.accelerator.is_available())
1 parent 1d0766a commit 6d4c02a

File tree

13 files changed

+120
-79
lines changed

13 files changed

+120
-79
lines changed

test/dtypes/test_affine_quantized.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,10 @@
4545
is_cusparselt_available = (
4646
hasattr(torch.backends, "cusparselt") and torch.backends.cusparselt.is_available()
4747
)
48-
_DEVICE = get_current_accelerator_device()
4948

5049

5150
def get_quantization_functions(
52-
do_sparse: bool, do_int4: bool, device: str = _DEVICE, int4_zp_int: bool = False
51+
do_sparse: bool, do_int4: bool, device: str = "cuda", int4_zp_int: bool = False
5352
):
5453
base_functions = [
5554
Int8WeightOnlyConfig(),
@@ -85,10 +84,12 @@ def get_quantization_functions(
8584
return base_functions
8685

8786

87+
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
8888
class TestAffineQuantized(TestCase):
8989
GPU_DEVICES = (["cuda"] if torch.cuda.is_available() else []) + (
9090
["xpu"] if torch.xpu.is_available() else []
9191
)
92+
_DEVICE = get_current_accelerator_device()
9293

9394
@unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
9495
def test_weights_only(self):
@@ -110,7 +111,9 @@ def test_weights_only(self):
110111
_ = torch.load(f, weights_only=True)
111112

112113
@unittest.skipIf(len(GPU_DEVICES) == 0, "Need GPU available")
113-
@common_utils.parametrize("apply_quant", get_quantization_functions(False, False))
114+
@common_utils.parametrize(
115+
"apply_quant", get_quantization_functions(False, False, _DEVICE)
116+
)
114117
def test_to_device(self, apply_quant):
115118
for device in self.GPU_DEVICES:
116119

@@ -171,6 +174,7 @@ def apply_uint6_weight_only_quant(linear):
171174
)
172175
return linear
173176

177+
_DEVICE = get_current_accelerator_device()
174178
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
175179
apply_uint6_weight_only_quant(linear)
176180

@@ -202,6 +206,7 @@ def test_print_quantized_module(self):
202206
"apply_quant", get_quantization_functions(False, True, _DEVICE, False)
203207
)
204208
def test_test_copy__apply(self, apply_quant):
209+
_DEVICE = get_current_accelerator_device()
205210
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
206211
linear2 = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
207212

@@ -226,6 +231,7 @@ def test_test_copy__apply(self, apply_quant):
226231
"apply_quant", get_quantization_functions(False, True, _DEVICE, False)
227232
)
228233
def test_copy__mismatch_metadata(self, apply_quant):
234+
_DEVICE = get_current_accelerator_device()
229235
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device=_DEVICE)
230236
linear2 = torch.nn.Linear(128, 512, dtype=torch.bfloat16, device=_DEVICE)
231237

@@ -301,9 +307,8 @@ def test_alias(self, device, dtype):
301307
quantize_(dummy, Int8DynamicActivationInt8WeightConfig())
302308
_ = dummy.weight[...]
303309

304-
@common_utils.parametrize("device", [_DEVICE])
310+
@common_utils.parametrize("device", ["cuda"])
305311
@common_utils.parametrize("dtype", [torch.float16, torch.bfloat16])
306-
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
307312
@skip_if_no_gemlite()
308313
def test_slice_gemlite(self, device, dtype):
309314
# in_feature not divisible by 1024
@@ -384,7 +389,7 @@ def dequant(input_layer, in_features, orig_shape):
384389
)
385390
self.assertEqual((W_slice_ref - W_slice).abs().mean().item(), 0)
386391

387-
@common_utils.parametrize("device", [_DEVICE])
392+
@common_utils.parametrize("device", ["cuda"])
388393
@common_utils.parametrize("dtype", [torch.bfloat16])
389394
def test_matmul(self, device, dtype):
390395
x = torch.randn(53, 2048)

test/dtypes/test_affine_quantized_float.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636

3737
random.seed(0)
3838
torch.manual_seed(0)
39-
_DEVICE = get_current_accelerator_device()
4039

4140

4241
class ToyLinearModel(torch.nn.Module):
@@ -53,15 +52,15 @@ def forward(self, x):
5352

5453
class TestAffineQuantizedFloat8Compile(InductorTestCase):
5554
@unittest.skipIf(
56-
_DEVICE == "cuda" and not is_sm_at_least_89(),
55+
torch.cuda.is_available() and not is_sm_at_least_89(),
5756
"Requires GPU with compute capability >= 8.9",
5857
)
5958
def test_invalid_granularity(self):
6059
with pytest.raises(ValueError, match="Invalid granularity specification"):
6160
Float8DynamicActivationFloat8WeightConfig(granularity="invalid")
6261

6362
@unittest.skipIf(
64-
_DEVICE == "cuda" and not is_sm_at_least_89(),
63+
torch.cuda.is_available() and not is_sm_at_least_89(),
6564
"Requires GPU with compute capability >= 8.9",
6665
)
6766
def test_mismatched_granularity(self):
@@ -74,7 +73,7 @@ def test_mismatched_granularity(self):
7473
)
7574

7675
@unittest.skipIf(
77-
_DEVICE == "cuda" and not is_sm_at_least_89(),
76+
torch.cuda.is_available() and not is_sm_at_least_89(),
7877
"Requires GPU with compute capability >= 8.9",
7978
)
8079
def test_unsupported_granularity(self):
@@ -95,20 +94,21 @@ def test_per_row_with_float32(self):
9594
AssertionError,
9695
match="PerRow quantization only works for bfloat16 precision",
9796
):
98-
model = ToyLinearModel(64, 64).eval().to(torch.float32).to(_DEVICE)
97+
model = ToyLinearModel(64, 64).eval().to(torch.float32).to("cuda")
9998
quantize_(
10099
model,
101100
Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
102101
)
103102

104103
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
105104
@unittest.skipIf(
106-
_DEVICE == "cuda" and not is_sm_at_least_89(),
105+
torch.cuda.is_available() and not is_sm_at_least_89(),
107106
"Requires GPU with compute capability >= 8.9",
108107
)
109108
@common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
110109
@common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16])
111110
def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype):
111+
_DEVICE = get_current_accelerator_device()
112112
block_size = ()
113113
device = _DEVICE
114114
input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32)
@@ -147,15 +147,15 @@ def test_choose_scale_float8_bounds(self, float8_dtype, output_dtype):
147147

148148
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
149149
@unittest.skipIf(
150-
_DEVICE == "cuda" and not is_sm_at_least_89(),
150+
torch.cuda.is_available() and not is_sm_at_least_89(),
151151
"Requires GPU with compute capability >= 8.9",
152152
)
153153
@common_utils.parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
154154
@common_utils.parametrize("output_dtype", [torch.float32, torch.bfloat16])
155155
@common_utils.parametrize("block_size", [(), (1, 32), (2, 16), (4, 8)])
156156
def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
157157
"""Test _dequantize_affine_float8 with various configurations"""
158-
158+
_DEVICE = get_current_accelerator_device()
159159
device = _DEVICE
160160
input_tensor = torch.randn(8, 64, device=device, dtype=torch.float32)
161161

@@ -181,12 +181,12 @@ def test_dequantize_affine_float8(self, float8_dtype, output_dtype, block_size):
181181

182182
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
183183
@unittest.skipIf(
184-
_DEVICE == "cuda" and not is_sm_at_least_89(),
184+
torch.cuda.is_available() and not is_sm_at_least_89(),
185185
"Requires GPU with compute capability >= 8.9",
186186
)
187187
def test_dequantize_affine_float8_scale_broadcasting(self):
188188
"""Test that scale broadcasting works correctly for block-wise quantization"""
189-
device = _DEVICE
189+
device = get_current_accelerator_device()
190190
# Create input tensor with known block structure
191191
input_tensor = torch.randn(4, 32, device=device, dtype=torch.float32)
192192
block_size = (2, 16) # 2x2 blocks in first dim, 2x16 blocks in second dim
@@ -314,7 +314,7 @@ def test_expected_kernels_on_gpu(self, granularity):
314314

315315
M, K, N = 128, 256, 512
316316
m = torch.nn.Sequential(
317-
torch.nn.Linear(K, N, device=_DEVICE, dtype=torch.bfloat16)
317+
torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
318318
)
319319
config = Float8DynamicActivationFloat8WeightConfig(
320320
granularity=granularity,
@@ -327,7 +327,7 @@ def test_expected_kernels_on_gpu(self, granularity):
327327
)
328328

329329
m = torch.compile(m)
330-
x = torch.randn(M, K, device=_DEVICE, dtype=torch.bfloat16)
330+
x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
331331
out, code = run_and_get_code(m, x)
332332

333333
# triton kernel call looks like:

test/dtypes/test_bitpacking.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
bit_widths = (1, 2, 3, 4, 5, 6, 7)
1414
dimensions = (0, -1, 1)
15-
_DEVICE = get_current_accelerator_device()
1615

1716

1817
@pytest.fixture(autouse=True)
@@ -36,6 +35,7 @@ def test_CPU(bit_width, dim):
3635
@pytest.mark.parametrize("bit_width", bit_widths)
3736
@pytest.mark.parametrize("dim", dimensions)
3837
def test_GPU(bit_width, dim):
38+
_DEVICE = get_current_accelerator_device()
3939
test_tensor = torch.randint(0, 2**bit_width, (32, 32, 32), dtype=torch.uint8).to(
4040
_DEVICE
4141
)
@@ -49,6 +49,7 @@ def test_GPU(bit_width, dim):
4949
@pytest.mark.parametrize("bit_width", bit_widths)
5050
@pytest.mark.parametrize("dim", dimensions)
5151
def test_compile(bit_width, dim):
52+
_DEVICE = get_current_accelerator_device()
5253
torch._dynamo.config.specialize_int = True
5354
torch.compile(pack, fullgraph=True)
5455
torch.compile(unpack, fullgraph=True)
@@ -63,6 +64,7 @@ def test_compile(bit_width, dim):
6364
# these test cases are for the example pack walk through in the bitpacking.py file
6465
@pytest.mark.skipif(not torch.accelerator.is_available(), reason="GPU not available")
6566
def test_pack_example():
67+
_DEVICE = get_current_accelerator_device()
6668
test_tensor = torch.tensor(
6769
[0x30, 0x29, 0x17, 0x5, 0x20, 0x16, 0x9, 0x22], dtype=torch.uint8
6870
).to(_DEVICE)

test/dtypes/test_nf4.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,6 @@
5757
logging.basicConfig(
5858
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
5959
)
60-
_DEVICE = get_current_accelerator_device()
6160

6261

6362
def _build_input_weight(embed_dim: int, device: torch.device, dtype: torch.dtype):
@@ -131,7 +130,7 @@ def test_backward_dtype_match(self, dtype: torch.dtype):
131130
def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):
132131
# From https://github.com/drisspg/transformer_nuggets/blob/f05afad68ad9086d342268f46a7f344617a02314/test/test_qlora.py#L65C1-L81C47
133132
torch.manual_seed(0)
134-
device = _DEVICE
133+
device = get_current_accelerator_device()
135134
embed_dim = 512
136135
input_weight = _build_input_weight(embed_dim, device, dtype)
137136
nf4_weight = to_nf4(input_weight)
@@ -161,12 +160,12 @@ def test_nf4_bnb_linear(self, dtype: torch.dtype):
161160
"""
162161
torch.manual_seed(0)
163162
dim = 512
164-
device = _DEVICE
163+
device = get_current_accelerator_device()
165164
input_weight = _build_input_weight(dim, device, dtype)
166165
nf4_weight = to_nf4(input_weight)
167166
bnb_linear = _build_bnb_linear(input_weight, device)
168167

169-
inp = torch.randn(2, 512, dtype=dtype, device=_DEVICE)
168+
inp = torch.randn(2, 512, dtype=dtype, device=device)
170169

171170
out_nf4 = linear_nf4(inp, nf4_weight).sum()
172171
out_bnb = bnb_linear(inp).sum()
@@ -181,6 +180,7 @@ def test_nf4_bnb_linear(self, dtype: torch.dtype):
181180
@parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
182181
def test_load_from_state_dicts(self, dtype: torch.dtype):
183182
"""Tests loading to and from different module state dicts"""
183+
_DEVICE = get_current_accelerator_device()
184184
input_tensor = torch.rand(64, device=_DEVICE, dtype=dtype)
185185
base_mod = self.TestMod(input_tensor, 32, 2)
186186

@@ -224,6 +224,7 @@ def test_to_copy(self, dtype: torch.dtype):
224224
torch.testing.assert_allclose(input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13)
225225

226226
if torch.accelerator.is_available():
227+
_DEVICE = get_current_accelerator_device()
227228
input_tensor = torch.rand(128, device=_DEVICE)
228229
input_tensor_nf4 = to_nf4(input_tensor, 32, 2)
229230
nf4_to_dtype = input_tensor_nf4.to(dtype)
@@ -233,6 +234,7 @@ def test_to_copy(self, dtype: torch.dtype):
233234

234235
@unittest.skipIf(not torch.accelerator.is_available(), "Need gpu for test")
235236
def test_to_copy_device(self):
237+
_DEVICE = get_current_accelerator_device()
236238
input_tensor = torch.rand(128, device="cpu")
237239
t = to_nf4(input_tensor, 32, 2)
238240
assert t.device == torch.device("cpu")
@@ -256,6 +258,7 @@ def test_to_dtype(self, dtype: torch.dtype):
256258
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
257259
@parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
258260
def test_smoketest_linear(self, dtype: torch.dtype):
261+
_DEVICE = get_current_accelerator_device()
259262
a = torch.randn(32, 32, dtype=dtype, device=_DEVICE)
260263
a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)
261264
inp = torch.randn(2, 32, 32, dtype=a.dtype, device=a.device)
@@ -273,6 +276,7 @@ def test_smoketest_linear_compile(self, dtype: torch.dtype):
273276
self.skipTest("test requires SM capability of at least (8, 0).")
274277
if version.parse(torch.__version__) < version.parse("2.3.0"):
275278
self.skipTest("test requires 2.3.0 and above for tracing NF4Tensor")
279+
_DEVICE = get_current_accelerator_device()
276280
a = torch.randn(32, 32, dtype=dtype, device=_DEVICE)
277281
a_nf4 = torchao.dtypes.to_nf4(a, 16, 2)
278282
inp = torch.randn(2, 32, 32, dtype=a.dtype, device=a.device)
@@ -283,6 +287,7 @@ def test_smoketest_linear_compile(self, dtype: torch.dtype):
283287
@parametrize("shape", [(16, 16), (32, 16)])
284288
@parametrize("chunk_size", [8, 16, 32])
285289
def test_chunk_size_equivalence(self, dtype: torch.dtype, shape, chunk_size):
290+
_DEVICE = get_current_accelerator_device()
286291
a = torch.randn(shape, device=_DEVICE, dtype=dtype)
287292
with unittest.mock.patch("torchao.dtypes.nf4tensor.CHUNK_SIZE", chunk_size):
288293
nf4_patched = to_nf4(a, 16, 2)
@@ -294,6 +299,7 @@ def test_chunk_size_equivalence(self, dtype: torch.dtype, shape, chunk_size):
294299
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
295300
@parametrize("input_size", [(512 * 512,), (512, 512)])
296301
def test_empty_like(self, input_size: Union[Tuple[int], int]):
302+
_DEVICE = get_current_accelerator_device()
297303
nf4_tensor = to_nf4(torch.rand(input_size, device=_DEVICE))
298304
new_tensor = torch.empty_like(nf4_tensor, device="cpu")
299305
self.assertTrue(isinstance(new_tensor, NF4Tensor))
@@ -303,6 +309,7 @@ def test_empty_like(self, input_size: Union[Tuple[int], int]):
303309
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
304310
@parametrize("compile", [False, True])
305311
def test_quantize_api(self, compile):
312+
_DEVICE = get_current_accelerator_device()
306313
nf4_linear = nn.Linear(512, 512, device=_DEVICE)
307314
torchao.quantize_(nf4_linear, nf4_weight_only())
308315
assert isinstance(nf4_linear.weight, NF4Tensor)
@@ -520,13 +527,15 @@ def test_pin_memory(self):
520527
nf4_tensor = nf4_tensor.pin_memory()
521528
self.assertTrue(nf4_tensor.is_pinned())
522529

530+
_DEVICE = get_current_accelerator_device()
523531
nf4_tensor = to_nf4(torch.randn(512 * 512, device=_DEVICE))
524532
self.assertFalse(nf4_tensor.is_pinned())
525533

526534
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
527535
def test_to_cuda(self):
528536
nf4_tensor = to_nf4(torch.randn(512 * 512))
529537
self.assertEqual(nf4_tensor.device.type, "cpu")
538+
_DEVICE = get_current_accelerator_device()
530539
nf4_tensor = nf4_tensor.to(_DEVICE, non_blocking=True)
531540
self.assertEqual(nf4_tensor.device.type, _DEVICE.type)
532541
self.assertEqual(type(nf4_tensor), NF4Tensor)
@@ -548,6 +557,7 @@ def test_to_cuda(self):
548557

549558
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
550559
def test_to_cpu(self):
560+
_DEVICE = get_current_accelerator_device()
551561
nf4_tensor = to_nf4(torch.randn(512 * 512, device=_DEVICE))
552562
nf4_tensor = nf4_tensor.cpu()
553563
self.assertEqual(nf4_tensor.device.type, "cpu")
@@ -562,6 +572,7 @@ def test_to_module(self):
562572
linear.weight = nn.Parameter(
563573
to_nf4(linear.weight.detach()), requires_grad=False
564574
)
575+
_DEVICE = get_current_accelerator_device()
565576
linear.to(_DEVICE)
566577
self.assertEqual(linear.weight.device.type, _DEVICE.type)
567578
weight = linear.weight.get_original_weight()
@@ -589,6 +600,7 @@ def test_to_module(self):
589600
@unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
590601
@parametrize("input_size", [512 * 512, (512 * 512,), (512, 512)])
591602
def test_tensor_deepcopy(self, input_size: Union[Tuple[int], int]):
603+
_DEVICE = get_current_accelerator_device()
592604
nf4_orig = to_nf4(torch.randn(input_size, device=_DEVICE))
593605
nf4_clone = copy.deepcopy(nf4_orig)
594606
self.assertEqual(
@@ -679,6 +691,7 @@ def _test_qlora_fsdp2(
679691
dropout_p=0,
680692
)
681693
torch.manual_seed(42)
694+
_DEVICE = get_current_accelerator_device()
682695
with torch.device(_DEVICE):
683696
base_model = Transformer(model_args)
684697
for layer in base_model.layers:
@@ -768,6 +781,7 @@ def _test_comm(self, input_size: int):
768781
from torch.distributed._composable.fsdp import fully_shard
769782
from torch.distributed._tensor import distribute_tensor
770783

784+
_DEVICE = get_current_accelerator_device()
771785
model = nn.Linear(input_size, input_size, device=_DEVICE)
772786
origin_tensor = model.weight
773787
origin_nf4_tensor = to_nf4(origin_tensor)

0 commit comments

Comments
 (0)