Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
# Possible values:
# - ON: enable CUDA with cmake's auto search
# - OFF: disable CUDA
# - /path/to/cuda: use specific path to cuda toolkit
# - /path/to/cuda: use specific path to CUDA toolkit
set(USE_CUDA OFF)

# Whether to enable NCCL support:
Expand Down
2 changes: 1 addition & 1 deletion cmake/modules/CUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
find_cuda(${USE_CUDA} ${USE_CUDNN})

if(CUDA_FOUND)
# always set the includedir when cuda is available
# always set the includedir when CUDA is available
# avoid global retrigger of cmake
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
endif(CUDA_FOUND)
Expand Down
2 changes: 1 addition & 1 deletion cmake/utils/FindCUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# find_cuda(${USE_CUDA} ${USE_CUDNN})
#
# - When USE_CUDA=ON, use auto search
# - When USE_CUDA=/path/to/cuda-path, use the cuda path
# - When USE_CUDA=/path/to/cuda-path, use the CUDA path
# - When USE_CUDNN=ON, use auto search
# - When USE_CUDNN=/path/to/cudnn-path, use the cudnn path
#
Expand Down
2 changes: 1 addition & 1 deletion docs/install/docker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Docker Images
We provide docker utility scripts to help developers to setup development environment.
They are also helpful for running through the TVM demo and tutorials.
We need `docker <https://docs.docker.com/engine/installation/>`_ and
`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use cuda.
`nvidia-docker <https://github.com/NVIDIA/nvidia-docker/>`_ if we want to use CUDA.

Get a tvm source distribution or clone the GitHub repo to get the auxiliary scripts

Expand Down
2 changes: 1 addition & 1 deletion include/tvm/s_tir/meta_schedule/postproc.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class Postproc : public runtime::ObjectRef {
TVM_DLL static Postproc RewriteReductionBlock();
/*!
* \brief Create a postprocessor that adds thread binding to unbound blocks
* \param max_threadblocks The max number of threadblocks in the cuda device.
* \param max_threadblocks The max number of threadblocks in the CUDA device.
* \return The postprocessor created.
*/
TVM_DLL static Postproc RewriteUnboundBlock(int max_threadblocks);
Expand Down
2 changes: 1 addition & 1 deletion include/tvm/tir/function.h
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ namespace attr {
* The size of the shared memory that may be allocated internally by
* the kernel. For example, exposed as the
* CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES attribute in
* cuda.
* CUDA.
*
* Defined as "tir.use_dyn_shared_memory".
*
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/contrib/cutlass/gemm_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(self):
{ \\
cudaError_t error = status; \\
if (error != cudaSuccess) { \\
std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \\
std::cerr << "Got bad CUDA status: " << cudaGetErrorString(error) \\
<< " at line: " << __LINE__ << std::endl; \\
exit(EXIT_FAILURE); \\
} \\
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def get_trt_common_h_code() -> str:
do { \\
auto ret = (status); \\
if (ret != 0) { \\
std::cout << "Cuda failure: " << ret << std::endl; \\
std::cout << "CUDA failure: " << ret << std::endl; \\
abort(); \\
} \\
} while (0)
Expand Down
22 changes: 11 additions & 11 deletions python/tvm/contrib/nvcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,18 @@
def compile_cuda(
code, target_format=None, arch=None, options=None, path_target=None, compiler="nvcc"
):
"""Compile cuda code with NVCC or NVRTC.
"""Compile CUDA code with NVCC or NVRTC.

Parameters
----------
code : str
The cuda code.
The CUDA code.

target_format : str
The target format of the compiler ("ptx", "cubin", or "fatbin").

arch : str
The cuda architecture.
The CUDA architecture.

options : str or list of str
The additional options.
Expand Down Expand Up @@ -78,7 +78,7 @@ def compile_cuda(
elif compiler == "nvrtc":
result = _compile_cuda_nvrtc(code, target_format, arch, options, path_target, use_nvshmem)
else:
raise ValueError(f"cuda compiler must be 'nvcc' or 'nvrtc', got: {compiler}")
raise ValueError(f"CUDA compiler must be 'nvcc' or 'nvrtc', got: {compiler}")

return result

Expand Down Expand Up @@ -623,12 +623,12 @@ def _link_nvshmem_nvrtc(binary_buf, nvshmem_lib_path):


def find_cuda_path():
"""Utility function to find cuda path
"""Utility function to find CUDA path

Returns
-------
path : str
Path to cuda root.
Path to CUDA root.
"""
if "CUDA_PATH" in os.environ:
return os.environ["CUDA_PATH"]
Expand All @@ -641,23 +641,23 @@ def find_cuda_path():
cuda_path = "/usr/local/cuda"
if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
return cuda_path
raise RuntimeError("Cannot find cuda path")
raise RuntimeError("Cannot find CUDA path")


def get_cuda_version(cuda_path=None):
"""Utility function to get cuda version
"""Utility function to get CUDA version

Parameters
----------
cuda_path : Optional[str]

Path to cuda root. If None is passed, will use
Path to CUDA root. If None is passed, will use
`find_cuda_path()` as default.

Returns
-------
version : float
The cuda version
The CUDA version

"""
if cuda_path is None:
Expand All @@ -683,7 +683,7 @@ def get_cuda_version(cuda_path=None):
release_fields = [s.strip() for s in release_line.split(",")]
version_str = [f[1:] for f in release_fields if f.startswith("V")][0]
return tuple(int(field) for field in version_str.split("."))
raise RuntimeError("Cannot read cuda version file")
raise RuntimeError("Cannot read CUDA version file")


def find_nvshmem_paths() -> Tuple[str, str]:
Expand Down
4 changes: 2 additions & 2 deletions python/tvm/contrib/xcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,12 @@ def create_dylib(output, objects, arch, sdk="macosx", min_os_version=None):


def compile_metal(code, path_target=None, sdk="macosx", min_os_version=None):
"""Compile metal with CLI tool from env.
"""Compile Metal with CLI tool from env.

Parameters
----------
code : str
The cuda code.
The Metal code.

path_target : str, optional
Output file.
Expand Down
26 changes: 13 additions & 13 deletions python/tvm/runtime/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def exist(self):

Returns True if TVM has support for the device, if the
physical device is present, and the device is accessible
through appropriate drivers (e.g. cuda/vulkan).
through appropriate drivers (e.g. CUDA/Vulkan).

Returns
-------
Expand All @@ -54,7 +54,7 @@ def exist(self):
def max_threads_per_block(self):
"""Maximum number of threads on each block.

Returns device value for cuda, metal, rocm, opencl, and vulkan
Returns device value for CUDA, Metal, ROCm, OpenCL, and Vulkan
devices. Returns remote device value for RPC devices.
Returns None for all other devices.

Expand All @@ -70,8 +70,8 @@ def max_threads_per_block(self):
def warp_size(self):
"""Number of threads that execute concurrently.

Returns device value for cuda, rocm, and vulkan. Returns
1 for metal and opencl devices, regardless of the physical
Returns device value for CUDA, ROCm, and Vulkan. Returns
1 for Metal and OpenCL devices, regardless of the physical
device. Returns remote device value for RPC devices. Returns
None for all other devices.

Expand All @@ -87,7 +87,7 @@ def warp_size(self):
def max_shared_memory_per_block(self):
"""Total amount of shared memory per block in bytes.

Returns device value for cuda, rocm, opencl, and vulkan.
Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.

Expand All @@ -106,8 +106,8 @@ def compute_version(self):
Returns maximum API version (e.g. CUDA/OpenCL/Vulkan)
supported by the device.

Returns device value for cuda, rocm, opencl, and
vulkan. Returns remote device value for RPC devices. Returns
Returns device value for CUDA, ROCm, OpenCL, and
Vulkan. Returns remote device value for RPC devices. Returns
None for all other devices.

Returns
Expand All @@ -122,7 +122,7 @@ def compute_version(self):
def device_name(self):
"""Return the vendor-specific name of device.

Returns device value for cuda, rocm, opencl, and vulkan.
Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.

Expand All @@ -138,7 +138,7 @@ def device_name(self):
def max_clock_rate(self):
"""Return the max clock frequency of device (kHz).

Returns device value for cuda, rocm, and opencl. Returns
Returns device value for CUDA, ROCm, and OpenCL. Returns
remote device value for RPC devices. Returns None for all
other devices.

Expand All @@ -154,7 +154,7 @@ def max_clock_rate(self):
def multi_processor_count(self):
"""Return the number of compute units in the device.

Returns device value for cuda, rocm, and opencl. Returns
Returns device value for CUDA, ROCm, and OpenCL. Returns
remote device value for RPC devices. Returns None for all
other devices.

Expand All @@ -170,7 +170,7 @@ def multi_processor_count(self):
def max_thread_dimensions(self):
"""Return the maximum size of each thread axis

Returns device value for cuda, rocm, opencl, and vulkan.
Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.

Expand All @@ -186,10 +186,10 @@ def max_thread_dimensions(self):
def api_version(self):
"""Returns version number of the SDK used to compile TVM.

For example, CUDA_VERSION for cuda or VK_HEADER_VERSION for
For example, CUDA_VERSION for CUDA or VK_HEADER_VERSION for
Vulkan.

Returns device value for cuda, rocm, opencl, and vulkan.
Returns device value for CUDA, ROCm, OpenCL, and Vulkan.
Returns remote device value for RPC devices. Returns None for
all other devices.

Expand Down
2 changes: 1 addition & 1 deletion python/tvm/runtime/executable.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def jit(
"""Just-in-time compile and link the modules.
The Executable returned by tvm.compile may not be directly
runnable as they may contain cuda source files and objects that
runnable as they may contain CUDA source files and objects that
are yet to be compiled and linked.
This function helps to create a runtime.Module for these cases.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""layout_transform scheduling rule for cuda."""
"""layout_transform scheduling rule for CUDA."""

import math
from collections import deque
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/testing/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
MARKERS = {
"gpu": "mark a test as requiring a gpu",
"tensorcore": "mark a test as requiring a tensorcore",
"cuda": "mark a test as requiring cuda",
"cuda": "mark a test as requiring CUDA",
"opencl": "mark a test as requiring opencl",
"rocm": "mark a test as requiring rocm",
"vulkan": "mark a test as requiring vulkan",
Expand Down
4 changes: 2 additions & 2 deletions python/tvm/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1197,7 +1197,7 @@ def requires_nvcc_version(major_version, minor_version=0, release_version=0):
installed version of NVCC is at least `(major_version,
minor_version, release_version)`.
This also marks the test as requiring a cuda support.
This also marks the test as requiring CUDA support.
Parameters
----------
Expand Down Expand Up @@ -1240,7 +1240,7 @@ def requires_cuda_compute_version(major_version, minor_version=0):
compute architecture of the GPU is at least `(major_version,
minor_version)`.
This also marks the test as requiring a cuda support.
This also marks the test as requiring CUDA support.
Parameters
----------
Expand Down
2 changes: 1 addition & 1 deletion python/tvm/topi/gpu/sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ def dual_mergepath(

with T.serial(0, cast(upper_lim - lower_lim, target_dtype)) as l2_width:
width = 2 << (l2_width + lower_lim)
# Define and launch the cuda kernel
# Define and launch the CUDA kernel
target = tvm.target.Target.current()
if "vulkan" in str(target):
ntx = max_threads
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/contrib/cublas/cublas_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ inline cudaDataType_t GetCudaDataType(DLDataType type) {
return CUDA_R_16BF;
}
}
LOG(FATAL) << "Unsupported cuda type";
LOG(FATAL) << "Unsupported CUDA type";
}

/*! \brief Execute matrix multiply followed by the specified epilogue, using cuBLASLt. */
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/contrib/nvshmem/memory_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class NVSHMEMAllocator final : public PooledAllocator {
void* DeviceAllocDataSpace(Device dev, size_t size, size_t alignment,
DLDataType type_hint) final {
ICHECK_EQ(dev.device_type, DLDeviceType::kDLCUDA)
<< "nvshmem can only allocate cuda device memory space.";
<< "nvshmem can only allocate CUDA device memory space.";
ICHECK(type_hint.code == DLDataTypeCode::kDLInt || type_hint.code == DLDataTypeCode::kDLUInt ||
type_hint.code == DLDataTypeCode::kDLFloat)
<< "nvshmem can only allocate tensor with int, unsigned int or float data types.";
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/contrib/papi/papi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ int component_for_device(Device dev) {
/*! \brief MetricCollectorNode for PAPI metrics.
*
* PAPI (Performance Application Programming Interface) collects metrics on a
* variety of platforms including cpu, cuda and rocm.
* variety of platforms including CPU, CUDA and ROCm.
*
* PAPI is available at https://github.com/icl-utk-edu/papi.
*/
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/cuda/cuda_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ namespace runtime {
static constexpr const int kMaxNumGPUs = 32;

/*!
* \brief create a cuda module from data.
* \brief create a CUDA module from data.
*
* \param data The module data, can be ptx, cubin
* \param fmt The format of the data, can be "ptx", "cubin"
* \param fmap The map function information map of each function.
* \param cuda_source Optional, cuda source file
* \param cuda_source Optional, CUDA source file
*/
ffi::Module CUDAModuleCreate(std::string data, std::string fmt,
ffi::Map<ffi::String, FunctionInfo> fmap, std::string cuda_source);
Expand Down
2 changes: 1 addition & 1 deletion src/s_tir/meta_schedule/postproc/rewrite_unbound_block.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class RewriteUnboundBlockNode : public PostprocNode {
public:
/*! \brief The max number of threads per block from Target */
int max_threads_per_block_ = -1;
/*! \brief The max number of threadblocks in the cuda device */
/*! \brief The max number of threadblocks in the CUDA device */
int max_threadblocks_ = -1;

static void RegisterReflection() {
Expand Down
2 changes: 1 addition & 1 deletion src/s_tir/meta_schedule/schedule_rule/auto_bind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class AutoBindNode : public ScheduleRuleNode {
public:
/*! \brief The max number of threads per block from Target */
int64_t max_threads_per_block_ = -1;
/*! \brief The max number of threadblocks in the cuda device */
/*! \brief The max number of threadblocks in the CUDA device */
int64_t max_threadblocks_ = -1;
/*! \brief thread_extents Candidates of thread axis extent. */
ffi::Array<Integer> thread_extents_;
Expand Down
2 changes: 1 addition & 1 deletion src/target/opt/build_cuda_off.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
*/

/*!
* Optional module when build cuda is switched to off
* Optional module when build CUDA is switched to off
*/
#include "../../runtime/cuda/cuda_module.h"
namespace tvm {
Expand Down
Loading