diff --git a/.github/scripts/fbgemm_gpu_integration.bash b/.github/scripts/fbgemm_gpu_integration.bash
index 679c062d7a..23b9432a31 100644
--- a/.github/scripts/fbgemm_gpu_integration.bash
+++ b/.github/scripts/fbgemm_gpu_integration.bash
@@ -284,12 +284,14 @@ integration_fbgemm_gpu_install_matrix_run () {
       12.8.1
       12.9.1
       13.0.2
+      13.2.0
     )
   elif [ "$variant_type" == "genai" ]; then
     local variant_versions=(
       12.6.3
       12.8.1
       13.0.2
+      13.2.0
     )
   elif [ "$variant_type" == "rocm" ]; then
     local variant_versions=(
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index d91fab145a..3b615ac2b5 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.8.1"]
         elif self.target == TARGET_GENAI:
-            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
index f4fcd2783a..73c499c381 100644
--- a/.github/scripts/nova_dir.bash
+++ b/.github/scripts/nova_dir.bash
@@ -22,7 +22,8 @@ fi
 ## Overwrite existing ENV VAR in Nova
 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
 
-if [[ "$CU_VERSION" == "cu130" ]] ||
+if [[ "$CU_VERSION" == "cu132" ]] ||
+     [[ "$CU_VERSION" == "cu130" ]] ||
      [[ "$CU_VERSION" == "cu129" ]] ||
      [[ "$CU_VERSION" == "cu128" ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0a;10.0a;12.0a"
diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
index 9cc6f5dc0b..9ec2cdcd9c 100644
--- a/.github/scripts/utils_cuda.bash
+++ b/.github/scripts/utils_cuda.bash
@@ -35,9 +35,21 @@ __set_cuda_symlinks_envvars () {
 
     echo "[INSTALL] Copying nvtx3 headers ..."
     # shellcheck disable=SC2086
-    print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
-    # shellcheck disable=SC2086
-    print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
+    if compgen -G "${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/*" > /dev/null 2>&1; then
+      # nsight-compute ships the nvtx3 headers: copy their contents into both include dirs
+      # shellcheck disable=SC2086
+      print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${conda_prefix}/include/
+      # shellcheck disable=SC2086
+      print_exec cp -r ${conda_prefix}/nsight-compute*/host/*/nvtx/include/nvtx3/* ${new_cuda_home}/include/
+    elif [ -d "${conda_prefix}/include/nvtx3" ]; then
+      # nvtx3 already in the conda include dir (presumably via cuda-nvtx); mirror it to the CUDA home if absent
+      echo "[INSTALL] nvtx3 headers already present in ${conda_prefix}/include/nvtx3 (from cuda-nvtx)"
+      if [ ! -d "${new_cuda_home}/include/nvtx3" ]; then
+        print_exec cp -r "${conda_prefix}/include/nvtx3" "${new_cuda_home}/include/"
+      fi
+    else
+      echo "[INSTALL] WARNING: nvtx3 headers not found in nsight-compute or cuda-nvtx"
+    fi
   fi
 
   echo "[INSTALL] Appending libcuda.so path to LD_LIBRARY_PATH ..."
@@ -220,8 +232,17 @@ install_cuda () {
       cuda-nvrtc-dev \
       cuda-cupti-dev \
       cuda-profiler-api \
-      cuda-opencl-dev \
-      nsight-compute) || return 1
+      cuda-opencl-dev) || return 1
+
+    # NOTE: nsight-compute is installed in its own best-effort step.  For newer
+    # CUDA versions (e.g. 13.2+) it may hit unresolvable dependency conflicts on
+    # conda-forge (libxkbcommon -> libxml2-16 vs clangxx -> libllvm16 ->
+    # libxml2 <2.14), so a failure below only logs a warning.  The nvtx3 headers
+    # it provides fall back to cuda-nvtx in __set_cuda_symlinks_envvars.
+    # shellcheck disable=SC2086
+    (exec_with_retries 3 conda install ${env_prefix} -c conda-forge --override-channels -y \
+      "cuda-version=${cuda_version%.*}" \
+      nsight-compute) || echo "[INSTALL] WARNING: nsight-compute could not be installed, skipping (nvtx3 headers will be sourced from cuda-nvtx)"
   fi
 
   # Set the symlinks and environment variables not covered by conda install
diff --git a/.github/workflows/fbgemm_gpu_release_cuda.yml b/.github/workflows/fbgemm_gpu_release_cuda.yml
index 2f7e807454..a064318b42 100644
--- a/.github/workflows/fbgemm_gpu_release_cuda.yml
+++ b/.github/workflows/fbgemm_gpu_release_cuda.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
+        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
         default: "13.0.2"
       publish-to-pypi:
         description: Publish Artifact to PyPI
diff --git a/.github/workflows/fbgemm_gpu_release_genai.yml b/.github/workflows/fbgemm_gpu_release_genai.yml
index a9f665ffe9..0f3c128bcb 100644
--- a/.github/workflows/fbgemm_gpu_release_genai.yml
+++ b/.github/workflows/fbgemm_gpu_release_genai.yml
@@ -34,7 +34,7 @@ on:
         description: CUDA Version to Use for Building Artifact
         type: choice
         required: false
-        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2" ]
+        options: [ "12.6.3", "12.8.1", "12.9.1", "13.0.2", "13.2.0" ]
         default: "13.0.2"
       publish-to-pypi:
         description: Publish Artifact to PyPI
@@ -72,7 +72,7 @@ jobs:
           { arch: x86, instance: "linux.12xlarge.memory" },
         ]
         python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
-        cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
+        cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]
 
     steps:
     - name: Setup Build Container
@@ -146,7 +146,7 @@ jobs:
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
         ]
         python-version: [ "3.10", "3.11", "3.12", "3.13", "3.14" ]
-        cuda-version: [ "12.6.3", "12.8.1", "13.0.2" ]
+        cuda-version: [ "12.6.3", "12.8.1", "13.0.2", "13.2.0" ]
     needs: build_artifact
 
     steps: