diff --git a/.circleci/common.sh b/.circleci/common.sh
index 383f4cff5cc..7dfb771b798 100755
--- a/.circleci/common.sh
+++ b/.circleci/common.sh
@@ -133,23 +133,23 @@ function run_torch_xla_python_tests() {
     # CUDA tests
     if [ -x "$(command -v nvidia-smi)" ]; then
       # single-host-single-process
-      PJRT_DEVICE=CUDA python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18
+      python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_cores=1 --num_steps=25 --model=resnet18
       # single-host-multi-process
       num_devices=$(nvidia-smi --list-gpus | wc -l)
-      PJRT_DEVICE=CUDA GPU_NUM_DEVICES=$GPU_NUM_DEVICES python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
-      PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+      python3 test/test_train_mp_imagenet.py --fake_data --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
+      torchrun --nnodes=1 --node_rank=0 --nproc_per_node=$num_devices test/test_train_mp_imagenet.py --fake_data --pjrt_distributed --batch_size=16 --num_epochs=1 --num_steps=25 --model=resnet18
 
       # single-host-SPMD
-      XLA_USE_SPMD=1 PJRT_DEVICE=CUDA torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18
+      XLA_USE_SPMD=1 torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 test/spmd/test_train_spmd_imagenet.py --fake_data --batch_size 16 --model=resnet50 --sharding=batch --num_epochs=1 --num_steps=25 --model=resnet18
 
-      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
-      PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
-      XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=CUDA python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+      python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
+      python test/test_train_mp_imagenet_fsdp.py --fake_data --auto_wrap_policy type_based --use_small_fake_sample --num_epochs=1
+      XLA_DISABLE_FUNCTIONALIZATION=1 python test/test_train_mp_imagenet_fsdp.py --fake_data --use_nested_fsdp --use_small_fake_sample --num_epochs=1
 
       # Syncfree SGD optimizer tests
       if [ -d ./torch_xla/amp/syncfree ]; then
         echo "Running Syncfree Optimizer Test"
-        PJRT_DEVICE=CUDA python test/test_syncfree_optimizers.py
+        python test/test_syncfree_optimizers.py
 
         # Following test scripts are mainly useful for
         # performance evaluation & comparison among different
@@ -221,11 +221,6 @@ function run_torch_xla_tests() {
   RUN_CPP="${RUN_CPP_TESTS:0}"
   RUN_PYTHON="${RUN_PYTHON_TESTS:0}"
 
-  if [ -x "$(command -v nvidia-smi)" ]; then
-    num_devices=$(nvidia-smi --list-gpus | wc -l)
-    echo "Found $num_devices GPU devices..."
-    export GPU_NUM_DEVICES=$num_devices
-  fi
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
 
   export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 5b694a40158..f076a8844af 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -86,7 +86,7 @@ works only for inference now.
 
 ```
 cd pytorch
-PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \
+python3 new_xla/benchmarks/experiment_runner.py \
   --xla=PJRT \
   --dynamo=openxla_eval \
   --test=eval \
diff --git a/benchmarks/llama.py b/benchmarks/llama.py
index d3ca92274de..95a9aab89dc 100644
--- a/benchmarks/llama.py
+++ b/benchmarks/llama.py
@@ -164,9 +164,6 @@ def run_benchmarks(args, llama_dir: str, results_dir: str,
     if dynamo == 'inductor':
       run_env['CUDA_VISIBLE_DEVICES'] = '0'
       run_env['USE_CUDA'] = '1'
-    else:
-      run_env['PJRT_DEVICE'] = 'CUDA'
-      run_env['GPU_NUM_DEVICES'] = '1'
 
     run_ok = True
     with open(log_file, 'w') as f:
diff --git a/docs/gpu.md b/docs/gpu.md
index d0ce667d46f..ae516ded81c 100644
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -69,7 +69,6 @@ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/cuda/12.
 
 In order to run below examples, you need to clone the pytorch/xla repo to access the imagenet example(We already clone it in our docker).
 ```
-(pytorch) root@20ab2c7a2d06:/# export GPU_NUM_DEVICES=1 PJRT_DEVICE=CUDA
 (pytorch) root@20ab2c7a2d06:/# git clone --recursive https://github.com/pytorch/xla.git
 (pytorch) root@20ab2c7a2d06:/# python xla/test/test_train_mp_imagenet.py --fake_data
 ==> Preparing data..
diff --git a/test/dynamo/test_dynamo.py b/test/dynamo/test_dynamo.py
index 519eef247a7..867bdfafad8 100644
--- a/test/dynamo/test_dynamo.py
+++ b/test/dynamo/test_dynamo.py
@@ -14,6 +14,7 @@
 import unittest
 import warnings
 
+xr._maybe_select_default_device()
 torch_xla._XLAC._init_computation_client()
 
 # Setup import folders.
diff --git a/test/run_tests.sh b/test/run_tests.sh
index 6e03b179511..5cad7025691 100755
--- a/test/run_tests.sh
+++ b/test/run_tests.sh
@@ -56,7 +56,7 @@ function run_coverage {
 function run_test {
   echo "Running in PjRt runtime: $@"
   if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then
-    PJRT_DEVICE=CUDA run_coverage "$@"
+    run_coverage "$@"
   else
     # TODO(darisoy): run these tests with multiple CPU devices, this fails due to TF issue.
     PJRT_DEVICE=CPU CPU_NUM_DEVICES=1 run_coverage "$@"
@@ -122,7 +122,7 @@ function run_torchrun {
   if [ -x "$(command -v nvidia-smi)" ] && [ "$XLA_CUDA" != "0" ]; then
     echo "Running torchrun test for GPU $@"
     num_devices=$(nvidia-smi --list-gpus | wc -l)
-    PJRT_DEVICE=CUDA torchrun --nnodes 1 --nproc-per-node $num_devices $@
+    torchrun --nnodes 1 --nproc-per-node $num_devices $@
   fi
 }
 
diff --git a/test/test_operations.py b/test/test_operations.py
index 9e274218c4c..53465057b6f 100644
--- a/test/test_operations.py
+++ b/test/test_operations.py
@@ -139,8 +139,7 @@ def onlyIfTorchSupportsCUDA(fn):
 
 def onlyIfPJRTDeviceIsCUDA(fn):
   return unittest.skipIf(
-      os.environ.get("PJRT_DEVICE") not in ("GPU", "CUDA"),
-      reason="requires CUDA as PJRT_DEVICE")(
+      xr.device_type() != 'CUDA', reason="requires CUDA as PJRT_DEVICE")(
           fn)
 
 
diff --git a/torch_xla/core/xla_model.py b/torch_xla/core/xla_model.py
index 28622fdafc2..b1c7d4c3023 100755
--- a/torch_xla/core/xla_model.py
+++ b/torch_xla/core/xla_model.py
@@ -18,7 +18,13 @@
 import torch_xla.utils.utils as xu
 import torch_xla.utils.closures as xc
 
-_DEVICES = xu.LazyProperty(lambda: torch_xla._XLAC._xla_get_devices())
+
+def _lazy_get_device():
+  runtime._maybe_select_default_device()
+  return torch_xla._XLAC._xla_get_devices()
+
+
+_DEVICES = xu.LazyProperty(_lazy_get_device)
 
 REDUCE_SUM = 'sum'
 REDUCE_MUL = 'mul'
@@ -59,6 +65,7 @@ def __init__(self, device):
 
 def _get_device_context(device=None):
   if device is None:
+    runtime._maybe_select_default_device()
     device = torch_xla._XLAC._xla_get_default_device()
   else:
     device = str(device)
diff --git a/torch_xla/runtime.py b/torch_xla/runtime.py
index b1d01490672..94ef0783d1a 100644
--- a/torch_xla/runtime.py
+++ b/torch_xla/runtime.py
@@ -120,6 +120,8 @@ def xla_device(n: Optional[int] = None,
   Returns:
     A `torch.device` representing an XLA device.
   """
+  _maybe_select_default_device()
+
   if n is None:
     return torch.device(torch_xla._XLAC._xla_get_default_device())