From 9d73c3564c7aa2f5c8883d463dfc3175326e80d1 Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 19 Jun 2026 20:11:06 +0000
Subject: [PATCH] [ROCm] Add AMD GPU support via ROCm/HIP

This adds an optional AMD GPU build to cuPDLPx through ROCm/HIP, alongside
the existing CUDA path. The CUDA build is unchanged when USE_HIP is off.

To review: start with internal/cuda_to_hip.h, which routes the CUDA runtime,
cuBLAS, cuSPARSE, and CUB symbols used by the solver to their hipRT, hipBLAS,
hipSPARSE, and hipCUB equivalents on a HIP build, and includes the standard
CUDA headers otherwise. The device sources keep their CUDA spelling and are
compiled as HIP. internal/cusparse_compat.h selects the standard hipsparseSpMV
path on ROCm, since hipSPARSE does not provide the cusparseSpMVOp variant.

CMakeLists.txt gains a USE_HIP option (off by default). When enabled the
project is configured with the HIP language, the .cu sources are compiled as
HIP, and the targets link hipBLAS, hipSPARSE, and hipCUB instead of the CUDA
libraries. GPU architectures are chosen with CMAKE_HIP_ARCHITECTURES,
defaulting to gfx90a. On Windows the CLI-only mps_parser.c is excluded from
the core library because it relies on strtok_r. The interface test gains a
case that runs the GPU solver path with presolve disabled, exercising the
hipBLAS and hipSPARSE execution path end to end.

Test Plan:

Built and ran on an AMD Instinct MI200 (gfx90a) with ROCm 7.2.1:

```
cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm \
  -DCUPDLPX_BUILD_CLI=ON -DCUPDLPX_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build -j$(nproc)
./build/tests/test_interface
```

The interface suite passes, including the GPU solver case (Status: OPTIMAL).
The same configuration builds cleanly for gfx1100 (RDNA3) and gfx1201
(RDNA4); the device code objects are identical across the documentation and
formatting commits that followed validation. The CUDA build path is
unaffected by these changes.

This work was authored with the assistance of Claude, an AI assistant by
Anthropic.
---
 CMakeLists.txt             | 115 +++++++++++++++++++-----
 README.md                  |  21 ++++-
 internal/cuda_to_hip.h     | 180 +++++++++++++++++++++++++++++++++++++
 internal/cusparse_compat.h |  14 +++
 internal/internal_types.h  |   8 ++
 internal/utils.h           |   7 ++
 src/feasibility_polish.cu  |   4 +
 src/preconditioner.cu      |   6 ++
 src/solver.cu              |   4 +
 test/test_interface.c      |  29 +++++-
 10 files changed, 361 insertions(+), 27 deletions(-)
 create mode 100644 internal/cuda_to_hip.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e7679f..142c027 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,8 +3,15 @@
 # -----------------------------------------------------------------------------
 cmake_minimum_required(VERSION 3.20)
 
+# HIP/ROCm support option (must be set before project() to influence language detection)
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
+
 # Project config
-project(cupdlpx LANGUAGES C CXX CUDA)
+if(USE_HIP)
+  project(cupdlpx LANGUAGES C CXX HIP)
+else()
+  project(cupdlpx LANGUAGES C CXX CUDA)
+endif()
 
 set(CUPDLPX_VERSION_MAJOR 0)
 set(CUPDLPX_VERSION_MINOR 2)
@@ -32,8 +39,16 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90)
+if(USE_HIP)
+  # HIP architecture configuration
+  # Default to gfx90a if not specified; can override with -DCMAKE_HIP_ARCHITECTURES=gfx1100, etc.
+  if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+    set(CMAKE_HIP_ARCHITECTURES "gfx90a")
+  endif()
+else()
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90)
+  endif()
 endif()
 
 # -----------------------------------------------------------------------------
@@ -61,9 +76,14 @@ else()
     endif()
 endif()
 
-# CUDA standards and RDC
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+# CUDA/HIP standards and RDC
+if(USE_HIP)
+  set(CMAKE_HIP_STANDARD 17)
+  set(CMAKE_HIP_STANDARD_REQUIRED ON)
+else()
+  set(CMAKE_CUDA_STANDARD 17)
+  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+endif()
 
 # -----------------------------------------------------------------------------
 # CONTROL OPTIONS
@@ -85,7 +105,16 @@ endif()
 # -----------------------------------------------------------------------------
 # FIND DEPENDENCIES
 # -----------------------------------------------------------------------------
-find_package(CUDAToolkit REQUIRED)
+if(USE_HIP)
+  # Find ROCm/HIP libraries
+  find_package(hip REQUIRED)
+  find_package(hipblas REQUIRED)
+  find_package(hipsparse REQUIRED)
+  find_package(hipcub REQUIRED)
+  find_package(rocprim REQUIRED)
+else()
+  find_package(CUDAToolkit REQUIRED)
+endif()
 include(FetchContent)
 
 # 1. ZLIB Configuration
@@ -152,20 +181,40 @@ target_compile_definitions(cupdlpx_compile_flags INTERFACE PSLP_VERSION="${PSLP_
 file(GLOB C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
 file(GLOB CU_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu")
 list(REMOVE_ITEM C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/cli.c")
+if(WIN32)
+  # mps_parser.c is CLI-only; exclude it on Windows where strtok_r is unavailable
+  list(REMOVE_ITEM C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/mps_parser.c")
+endif()
 
 set(CORE_INCLUDE_DIRS
   PUBLIC  ${CMAKE_CURRENT_SOURCE_DIR}/include
   PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/internal
 )
 
-set(CORE_LINK_LIBS 
-  PUBLIC cupdlpx_compile_flags
-  PUBLIC CUDA::cudart
-  PUBLIC CUDA::cublas
-  PUBLIC CUDA::cusparse
-  PUBLIC ZLIB::ZLIB
-  PUBLIC PSLP
-)
+if(USE_HIP)
+  set(CORE_LINK_LIBS
+    PUBLIC cupdlpx_compile_flags
+    PUBLIC hip::device
+    PUBLIC roc::hipblas
+    PUBLIC roc::hipsparse
+    PUBLIC hip::hipcub
+    PUBLIC ZLIB::ZLIB
+    PUBLIC PSLP
+  )
+  # Mark .cu files as HIP language
+  set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE HIP)
+  # Define USE_HIP for the compat header
+  add_compile_definitions(USE_HIP)
+else()
+  set(CORE_LINK_LIBS
+    PUBLIC cupdlpx_compile_flags
+    PUBLIC CUDA::cudart
+    PUBLIC CUDA::cublas
+    PUBLIC CUDA::cusparse
+    PUBLIC ZLIB::ZLIB
+    PUBLIC PSLP
+  )
+endif()
 
 # 1. Core STATIC Library
 if(CUPDLPX_BUILD_STATIC_LIB)
@@ -174,9 +223,17 @@ if(CUPDLPX_BUILD_STATIC_LIB)
     target_link_libraries(cupdlpx_core ${CORE_LINK_LIBS})
     set_target_properties(cupdlpx_core PROPERTIES
       POSITION_INDEPENDENT_CODE ON
-      CUDA_SEPARABLE_COMPILATION ON
-      CUDA_RESOLVE_DEVICE_SYMBOLS ON
     )
+    if(USE_HIP)
+      set_target_properties(cupdlpx_core PROPERTIES
+        HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+      )
+    else()
+      set_target_properties(cupdlpx_core PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+      )
+    endif()
 endif()
 
 # 2. Shared Library
@@ -187,9 +244,17 @@ if(CUPDLPX_BUILD_SHARED_LIB)
     set_target_properties(cupdlpx_shared PROPERTIES
         OUTPUT_NAME "cupdlpx"
         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+    if(USE_HIP)
+      set_target_properties(cupdlpx_shared PROPERTIES
+        HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+      )
+    else()
+      set_target_properties(cupdlpx_shared PROPERTIES
         CUDA_SEPARABLE_COMPILATION ON
         CUDA_RESOLVE_DEVICE_SYMBOLS ON
-    )
+      )
+    endif()
 endif()
 
 # 3. CLI Executable
@@ -204,8 +269,12 @@ if(CUPDLPX_BUILD_CLI)
     set_target_properties(cupdlpx_cli PROPERTIES
         OUTPUT_NAME "cupdlpx"
         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-        CUDA_RESOLVE_DEVICE_SYMBOLS ON
     )
+    if(NOT USE_HIP)
+      set_target_properties(cupdlpx_cli PROPERTIES
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+      )
+    endif()
 endif()
 
 # 4. Tests
@@ -217,14 +286,18 @@ if(CUPDLPX_BUILD_TESTS)
     enable_testing()
     file(GLOB TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test/*.c" "${CMAKE_CURRENT_SOURCE_DIR}/test/*.cu")
     foreach(TEST_SRC ${TEST_SOURCES})
-        get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) 
+        get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
         add_executable(${TEST_NAME} ${TEST_SRC})
         target_link_libraries(${TEST_NAME} PRIVATE cupdlpx_core)
         target_include_directories(${TEST_NAME} PRIVATE include internal)
         set_target_properties(${TEST_NAME} PROPERTIES
             RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tests"
-            CUDA_RESOLVE_DEVICE_SYMBOLS ON
         )
+        if(NOT USE_HIP)
+          set_target_properties(${TEST_NAME} PROPERTIES
+            CUDA_RESOLVE_DEVICE_SYMBOLS ON
+          )
+        endif()
         add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
     endforeach()
 endif()
diff --git a/README.md b/README.md
index 919e8e0..2c7dbf2 100644
--- a/README.md
+++ b/README.md
@@ -26,12 +26,14 @@ Our work is presented in two papers:
 ## Installation
 
 ### Requirements
-* **GPU:** NVIDIA GPU with CUDA 12.4+.
-* **Build Tools:** CMake (≥ 3.20), GCC, NVCC.
+* **GPU:** NVIDIA GPU with CUDA 12.4+, or AMD GPU with ROCm 7.2+.
+* **Build Tools:** CMake (≥ 3.20), GCC, and NVCC (CUDA) or hipcc (ROCm).
 
 > **SpMV backend** is selected automatically at compile time based on cuSPARSE version:
 > - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3)
 > - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3)
+>
+> On AMD GPUs the solver uses the `hipsparseSpMV` backend via hipSPARSE.
 
 ### Build from Source
 Clone the repository and compile the project using CMake.
@@ -43,6 +45,21 @@ cmake --build build --clean-first
 ```
 This will create the solver binary at `./build/cupdlpx`.
 
+#### Building for AMD GPUs (ROCm/HIP)
+To target AMD GPUs, configure with `-DUSE_HIP=ON` and select the GPU
+architecture with `-DCMAKE_HIP_ARCHITECTURES`. The CUDA sources are compiled
+as HIP and the cuBLAS/cuSPARSE/CUB calls are mapped to hipBLAS/hipSPARSE/hipCUB.
+```bash
+cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm
+cmake --build build --clean-first
+```
+Set `CMAKE_HIP_ARCHITECTURES` to match your GPU (for example `gfx90a` for
+MI200, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). If the ROCm
+install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at
+it (e.g. `/opt/rocm`) so `find_package` can locate hip, hipBLAS, hipSPARSE,
+and hipCUB. The resulting `./build/cupdlpx` binary is used exactly as in the
+CUDA build.
+
 #### Verifying the Installation
 Run a small test problem to confirm that the solver was built correctly.
 ```bash
diff --git a/internal/cuda_to_hip.h b/internal/cuda_to_hip.h
new file mode 100644
index 0000000..b038feb
--- /dev/null
+++ b/internal/cuda_to_hip.h
@@ -0,0 +1,180 @@
+/*
+Copyright 2025 Haihao Lu
+Copyright (c) 2026 Advanced Micro Devices, Inc.
+
+Author: Jeff Daily <jeff.daily@amd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+/*
+ * CUDA to HIP compatibility header for cuPDLPx.
+ *
+ * This header maps CUDA API symbols to their HIP equivalents when building
+ * with USE_HIP. On CUDA builds, it simply includes the standard CUDA headers.
+ * Source files keep their CUDA spelling; this header handles the translation.
+ */
+
+#pragma once
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+// HIP runtime
+#include <hip/hip_runtime.h>
+
+// hipBLAS
+#include <hipblas/hipblas.h>
+
+// hipSPARSE
+#include <hipsparse/hipsparse.h>
+
+// hipCUB (for cub::DeviceReduce) - C++ only
+#ifdef __cplusplus
+#include <hipcub/hipcub.hpp>
+#endif
+
+// ----------------------------------------------------------------------------
+// CUDA Runtime API -> HIP Runtime API
+// ----------------------------------------------------------------------------
+
+// Memory management
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemset hipMemset
+
+// Memory copy kinds
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+
+// Error handling
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorName hipGetErrorName
+
+// Streams
+#define cudaStream_t hipStream_t
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+
+// Device synchronization
+#define cudaDeviceSynchronize hipDeviceSynchronize
+
+// ----------------------------------------------------------------------------
+// CUDA Graph API -> HIP Graph API
+// ----------------------------------------------------------------------------
+
+#define cudaGraph_t hipGraph_t
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaStreamCaptureModeGlobal hipStreamCaptureModeGlobal
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaGraphExecDestroy hipGraphExecDestroy
+
+// ----------------------------------------------------------------------------
+// cuBLAS -> hipBLAS
+// ----------------------------------------------------------------------------
+
+#define cublasHandle_t hipblasHandle_t
+#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
+#define cublasSetStream hipblasSetStream
+#define cublasSetPointerMode hipblasSetPointerMode
+#define cublasStatus_t hipblasStatus_t
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_POINTER_MODE_HOST HIPBLAS_POINTER_MODE_HOST
+
+// cuBLAS functions
+#define cublasDnrm2 hipblasDnrm2
+#define cublasDdot hipblasDdot
+#define cublasDscal hipblasDscal
+#define cublasDaxpy hipblasDaxpy
+#define cublasIdamax hipblasIdamax
+
+// cublasDnrm2_v2_64 maps to hipblasDnrm2 (hipBLAS uses 32-bit by default,
+// but LP problem sizes should fit; use hipblasDnrm2_64 if needed)
+#define cublasDnrm2_v2_64 hipblasDnrm2
+
+// Error name helper
+static inline const char *cublasGetStatusName(hipblasStatus_t status)
+{
+    return hipblasStatusToString(status);
+}
+
+// ----------------------------------------------------------------------------
+// cuSPARSE -> hipSPARSE
+// ----------------------------------------------------------------------------
+
+#define cusparseHandle_t hipsparseHandle_t
+#define cusparseCreate hipsparseCreate
+#define cusparseDestroy hipsparseDestroy
+#define cusparseSetStream hipsparseSetStream
+#define cusparseStatus_t hipsparseStatus_t
+#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS
+#define cusparseGetErrorName hipsparseGetErrorName
+
+// Sparse matrix and dense vector descriptors
+#define cusparseSpMatDescr_t hipsparseSpMatDescr_t
+#define cusparseDnVecDescr_t hipsparseDnVecDescr_t
+
+// Index types and base
+#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I
+#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO
+
+// Operations
+#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC
+#define CUSPARSE_CSR2CSC_ALG_DEFAULT HIPSPARSE_CSR2CSC_ALG_DEFAULT
+#define CUSPARSE_SPMV_CSR_ALG2 HIPSPARSE_SPMV_CSR_ALG2
+
+// Functions
+#define cusparseCreateCsr hipsparseCreateCsr
+#define cusparseDestroySpMat hipsparseDestroySpMat
+#define cusparseCreateDnVec hipsparseCreateDnVec
+#define cusparseDestroyDnVec hipsparseDestroyDnVec
+#define cusparseDnVecSetValues hipsparseDnVecSetValues
+#define cusparseSpMV hipsparseSpMV
+#define cusparseSpMV_bufferSize hipsparseSpMV_bufferSize
+#define cusparseSpMV_preprocess hipsparseSpMV_preprocess
+#define cusparseCsr2cscEx2 hipsparseCsr2cscEx2
+#define cusparseCsr2cscEx2_bufferSize hipsparseCsr2cscEx2_bufferSize
+
+// ----------------------------------------------------------------------------
+// Data types
+// ----------------------------------------------------------------------------
+
+#define CUDA_R_64F HIP_R_64F
+
+// ----------------------------------------------------------------------------
+// CUB -> hipCUB (C++ only)
+// ----------------------------------------------------------------------------
+
+#ifdef __cplusplus
+namespace cub = hipcub;
+#endif
+
+#else // CUDA build
+
+// Standard CUDA headers
+#include <cub/device/device_reduce.cuh>
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cusparse.h>
+
+#endif // USE_HIP
diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h
index 9eaecb2..c84aeca 100644
--- a/internal/cusparse_compat.h
+++ b/internal/cusparse_compat.h
@@ -1,5 +1,17 @@
 #pragma once
 
+// On HIP builds, cuda_to_hip.h handles the cusparse -> hipsparse mapping.
+// Include it first so the defines are active when we check for SpMVOp.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+// hipSPARSE does not have SpMVOp; force the standard SpMV path.
+#define CUPDLPX_HAS_SPMVOP 0
+
+// Provide fallback typedefs for compilation (never used at runtime on HIP).
+typedef void *cusparseSpMVOpDescr_t;
+typedef void *cusparseSpMVOpPlan_t;
+
+#else // CUDA build
+
 #include <cusparse.h>
 
 // cusparseSpMVOp_bufferSize was introduced in cuSPARSE 12.7.3 (CUDA 13.1 Update 1).
@@ -19,3 +31,5 @@ typedef void *cusparseSpMVOpDescr_t;
 typedef void *cusparseSpMVOpPlan_t;
 #endif
 #endif
+
+#endif // USE_HIP
diff --git a/internal/internal_types.h b/internal/internal_types.h
index 1302430..6786ea2 100644
--- a/internal/internal_types.h
+++ b/internal/internal_types.h
@@ -17,9 +17,17 @@ limitations under the License.
 #pragma once
 
 #include "cupdlpx_types.h"
+
+// Include cuda_to_hip.h first to map CUDA -> HIP symbols when building for ROCm.
+#include "cuda_to_hip.h"
 #include "cusparse_compat.h"
+
+// On CUDA, include the standard headers; on HIP, cuda_to_hip.h already included them.
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>
 #include <cusparse.h>
+#endif
+
 #include <stdbool.h>
 #include <time.h>
 
diff --git a/internal/utils.h b/internal/utils.h
index c31fd62..0293e78 100644
--- a/internal/utils.h
+++ b/internal/utils.h
@@ -16,11 +16,18 @@ limitations under the License.
 
 #pragma once
 
+// Include cuda_to_hip.h first to map CUDA -> HIP symbols when building for ROCm.
+// It must come before any CUDA headers.
+#include "cuda_to_hip.h"
 #include "cusparse_compat.h"
 #include "internal_types.h"
+
+// On CUDA, include the standard headers; on HIP, cuda_to_hip.h already included them.
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
+#endif
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu
index 879d241..d1304d3 100644
--- a/src/feasibility_polish.cu
+++ b/src/feasibility_polish.cu
@@ -16,9 +16,13 @@ limitations under the License.
 
 #include "feasibility_polish.h"
 #include "utils.h"
+
+// CUDA/HIP headers - cuda_to_hip.h in utils.h handles the mapping
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
+#endif
 #include <math.h>
 #include <stdbool.h>
 #include <stdio.h>
diff --git a/src/preconditioner.cu b/src/preconditioner.cu
index 8883c8c..3e41d3b 100644
--- a/src/preconditioner.cu
+++ b/src/preconditioner.cu
@@ -16,9 +16,15 @@ limitations under the License.
 
 #include "preconditioner.h"
 #include "utils.h"
+
+// CUB / hipCUB - cuda_to_hip.h aliases cub namespace to hipcub on HIP builds
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/device/device_reduce.cuh>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
+#endif
 #include <math.h>
 #include <stdio.h>
 #include <time.h>
diff --git a/src/solver.cu b/src/solver.cu
index 7ad9301..8ab80ff 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -21,9 +21,13 @@ limitations under the License.
 #include "presolve.h"
 #include "solver.h"
 #include "utils.h"
+
+// CUDA/HIP headers - cuda_to_hip.h in utils.h handles the mapping
+#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__)
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 #include <cusparse.h>
+#endif
 #include <math.h>
 #include <stdbool.h>
 #include <stdio.h>
diff --git a/test/test_interface.c b/test/test_interface.c
index 5a23077..52a9512 100644
--- a/test/test_interface.c
+++ b/test/test_interface.c
@@ -120,7 +120,6 @@ int main()
     A_dense.m = m;
     A_dense.n = n;
     A_dense.fmt = matrix_dense;
-    A_dense.zero_tolerance = 0.0;
     A_dense.data.dense.A = &A[0][0];
 
     // A as a CSR matrix
@@ -133,7 +132,6 @@ int main()
     A_csr.m = m;
     A_csr.n = n;
     A_csr.fmt = matrix_csr;
-    A_csr.zero_tolerance = 0.0;
     A_csr.data.csr.nnz = 5;
     A_csr.data.csr.row_ptr = csr_row_ptr;
     A_csr.data.csr.col_ind = csr_col_ind;
@@ -149,7 +147,6 @@ int main()
     A_csc.m = m;
     A_csc.n = n;
     A_csc.fmt = matrix_csc;
-    A_csc.zero_tolerance = 0.0;
     A_csc.data.csc.nnz = 5;
     A_csc.data.csc.col_ptr = csc_col_ptr;
     A_csc.data.csc.row_ind = csc_row_ind;
@@ -165,7 +162,6 @@ int main()
     A_coo.m = m;
     A_coo.n = n;
     A_coo.fmt = matrix_coo;
-    A_coo.zero_tolerance = 0.0;
     A_coo.data.coo.nnz = 5;
     A_coo.data.coo.row_ind = coo_row_ind;
     A_coo.data.coo.col_ind = coo_col_ind;
@@ -226,5 +222,30 @@ int main()
     test_warm_start("Test 7: CSC Matrix", &A_csc, c, l, u);
     test_warm_start("Test 8: COO Matrix", &A_coo, c, l, u);
 
+    // Test 9: GPU solver path (presolve disabled) -- forces hipBLAS/hipSPARSE execution
+    printf("\n=== Test 9: CSR Matrix (presolve disabled, GPU solver) ===\n");
+    {
+        lp_problem_t *prob9 = create_lp_problem(c, &A_csr, l, u, NULL, NULL, NULL);
+        if (!prob9)
+        {
+            fprintf(stderr, "[test] create_lp_problem failed for Test 9.\n");
+            return 1;
+        }
+        pdhg_parameters_t params9;
+        set_default_parameters(&params9);
+        params9.presolve = false;
+        params9.verbose = true;
+        cupdlpx_result_t *res9 = solve_lp_problem(prob9, &params9);
+        lp_problem_free(prob9);
+        if (!res9)
+        {
+            fprintf(stderr, "[test] solve_lp_problem failed for Test 9.\n");
+            return 1;
+        }
+        print_vec("x", res9->primal_solution, res9->num_variables);
+        print_vec("y", res9->dual_solution, res9->num_constraints);
+        cupdlpx_result_free(res9);
+    }
+
     return 0;
 }