diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e7679f..142c027 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,15 @@ # ----------------------------------------------------------------------------- cmake_minimum_required(VERSION 3.20) +# HIP/ROCm support option (must be set before project() to influence language detection) +option(USE_HIP "Build with HIP for AMD GPUs" OFF) + # Project config -project(cupdlpx LANGUAGES C CXX CUDA) +if(USE_HIP) + project(cupdlpx LANGUAGES C CXX HIP) +else() + project(cupdlpx LANGUAGES C CXX CUDA) +endif() set(CUPDLPX_VERSION_MAJOR 0) set(CUPDLPX_VERSION_MINOR 2) @@ -32,8 +39,16 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) +if(USE_HIP) + # HIP architecture configuration + # Default to gfx90a if not specified; can override with -DCMAKE_HIP_ARCHITECTURES=gfx1100, etc. + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "gfx90a") + endif() +else() + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86 89 90) + endif() endif() # ----------------------------------------------------------------------------- @@ -61,9 +76,14 @@ else() endif() endif() -# CUDA standards and RDC -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) +# CUDA/HIP standards and RDC +if(USE_HIP) + set(CMAKE_HIP_STANDARD 17) + set(CMAKE_HIP_STANDARD_REQUIRED ON) +else() + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) +endif() # ----------------------------------------------------------------------------- # CONTROL OPTIONS @@ -85,7 +105,16 @@ endif() # ----------------------------------------------------------------------------- # FIND DEPENDENCIES # ----------------------------------------------------------------------------- -find_package(CUDAToolkit REQUIRED) +if(USE_HIP) + # Find ROCm/HIP libraries + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipsparse REQUIRED) + find_package(hipcub REQUIRED) + find_package(rocprim REQUIRED) +else() + find_package(CUDAToolkit REQUIRED) +endif() include(FetchContent) # 1. ZLIB Configuration @@ -152,20 +181,40 @@ target_compile_definitions(cupdlpx_compile_flags INTERFACE PSLP_VERSION="${PSLP_ file(GLOB C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c") file(GLOB CU_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu") list(REMOVE_ITEM C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/cli.c") +if(WIN32) + # mps_parser.c is CLI-only; exclude it on Windows where strtok_r is unavailable + list(REMOVE_ITEM C_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/src/mps_parser.c") +endif() set(CORE_INCLUDE_DIRS PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/internal ) -set(CORE_LINK_LIBS - PUBLIC cupdlpx_compile_flags - PUBLIC CUDA::cudart - PUBLIC CUDA::cublas - PUBLIC CUDA::cusparse - PUBLIC ZLIB::ZLIB - PUBLIC PSLP -) +if(USE_HIP) + set(CORE_LINK_LIBS + PUBLIC cupdlpx_compile_flags + PUBLIC hip::device + PUBLIC roc::hipblas + PUBLIC roc::hipsparse + PUBLIC hip::hipcub + PUBLIC ZLIB::ZLIB + PUBLIC PSLP + ) + # Mark .cu files as HIP language + set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE HIP) + # Define USE_HIP for the compat header + add_compile_definitions(USE_HIP) +else() + set(CORE_LINK_LIBS + PUBLIC cupdlpx_compile_flags + PUBLIC CUDA::cudart + PUBLIC CUDA::cublas + PUBLIC CUDA::cusparse + PUBLIC ZLIB::ZLIB + PUBLIC PSLP + ) +endif() # 1. Core STATIC Library if(CUPDLPX_BUILD_STATIC_LIB) @@ -174,9 +223,17 @@ if(CUPDLPX_BUILD_STATIC_LIB) target_link_libraries(cupdlpx_core ${CORE_LINK_LIBS}) set_target_properties(cupdlpx_core PROPERTIES POSITION_INDEPENDENT_CODE ON - CUDA_SEPARABLE_COMPILATION ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON ) + if(USE_HIP) + set_target_properties(cupdlpx_core PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + ) + else() + set_target_properties(cupdlpx_core PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + ) + endif() endif() # 2. Shared Library @@ -187,9 +244,17 @@ if(CUPDLPX_BUILD_SHARED_LIB) set_target_properties(cupdlpx_shared PROPERTIES OUTPUT_NAME "cupdlpx" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) + if(USE_HIP) + set_target_properties(cupdlpx_shared PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + ) + else() + set_target_properties(cupdlpx_shared PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON - ) + ) + endif() endif() # 3. CLI Executable @@ -204,8 +269,12 @@ if(CUPDLPX_BUILD_CLI) set_target_properties(cupdlpx_cli PROPERTIES OUTPUT_NAME "cupdlpx" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" - CUDA_RESOLVE_DEVICE_SYMBOLS ON ) + if(NOT USE_HIP) + set_target_properties(cupdlpx_cli PROPERTIES + CUDA_RESOLVE_DEVICE_SYMBOLS ON + ) + endif() endif() # 4. Tests @@ -217,14 +286,18 @@ if(CUPDLPX_BUILD_TESTS) enable_testing() file(GLOB TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test/*.c" "${CMAKE_CURRENT_SOURCE_DIR}/test/*.cu") foreach(TEST_SRC ${TEST_SOURCES}) - get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) + get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) add_executable(${TEST_NAME} ${TEST_SRC}) target_link_libraries(${TEST_NAME} PRIVATE cupdlpx_core) target_include_directories(${TEST_NAME} PRIVATE include internal) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tests" - CUDA_RESOLVE_DEVICE_SYMBOLS ON ) + if(NOT USE_HIP) + set_target_properties(${TEST_NAME} PROPERTIES + CUDA_RESOLVE_DEVICE_SYMBOLS ON + ) + endif() add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) endforeach() endif() diff --git a/README.md b/README.md index 919e8e0..2c7dbf2 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,14 @@ Our work is presented in two papers: ## Installation ### Requirements -* **GPU:** NVIDIA GPU with CUDA 12.4+. -* **Build Tools:** CMake (≥ 3.20), GCC, NVCC. +* **GPU:** NVIDIA GPU with CUDA 12.4+, or AMD GPU with ROCm 7.2+. +* **Build Tools:** CMake (≥ 3.20), GCC, and NVCC (CUDA) or hipcc (ROCm). > **SpMV backend** is selected automatically at compile time based on cuSPARSE version: > - `cusparseSpMV` — CUDA 12.4 – 13.1 (cuSPARSE < 12.7.3) > - `cusparseSpMVOp` — CUDA 13.1 Update 1+ (cuSPARSE ≥ 12.7.3) +> +> On AMD GPUs the solver uses the `hipsparseSpMV` backend via hipSPARSE. ### Build from Source Clone the repository and compile the project using CMake. @@ -43,6 +45,21 @@ cmake --build build --clean-first ``` This will create the solver binary at `./build/cupdlpx`. +#### Building for AMD GPUs (ROCm/HIP) +To target AMD GPUs, configure with `-DUSE_HIP=ON` and select the GPU +architecture with `-DCMAKE_HIP_ARCHITECTURES`. The CUDA sources are compiled +as HIP and the cuBLAS/cuSPARSE/CUB calls are mapped to hipBLAS/hipSPARSE/hipCUB. +```bash +cmake -B build -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm +cmake --build build --clean-first +``` +Set `CMAKE_HIP_ARCHITECTURES` to match your GPU (for example `gfx90a` for +MI200, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). If the ROCm +install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at +it (e.g. `/opt/rocm`) so `find_package` can locate hip, hipBLAS, hipSPARSE, +and hipCUB. The resulting `./build/cupdlpx` binary is used exactly as in the +CUDA build. + #### Verifying the Installation Run a small test problem to confirm that the solver was built correctly. ```bash diff --git a/internal/cuda_to_hip.h b/internal/cuda_to_hip.h new file mode 100644 index 0000000..b038feb --- /dev/null +++ b/internal/cuda_to_hip.h @@ -0,0 +1,180 @@ +/* +Copyright 2025 Haihao Lu +Copyright (c) 2026 Advanced Micro Devices, Inc. + +Author: Jeff Daily + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* + * CUDA to HIP compatibility header for cuPDLPx. + * + * This header maps CUDA API symbols to their HIP equivalents when building + * with USE_HIP. On CUDA builds, it simply includes the standard CUDA headers. + * Source files keep their CUDA spelling; this header handles the translation. + */ + +#pragma once + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +// HIP runtime +#include + +// hipBLAS +#include + +// hipSPARSE +#include + +// hipCUB (for cub::DeviceReduce) - C++ only +#ifdef __cplusplus +#include +#endif + +// ---------------------------------------------------------------------------- +// CUDA Runtime API -> HIP Runtime API +// ---------------------------------------------------------------------------- + +// Memory management +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemset hipMemset + +// Memory copy kinds +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice + +// Error handling +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaGetLastError hipGetLastError +#define cudaGetErrorName hipGetErrorName + +// Streams +#define cudaStream_t hipStream_t +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy + +// Device synchronization +#define cudaDeviceSynchronize hipDeviceSynchronize + +// ---------------------------------------------------------------------------- +// CUDA Graph API -> HIP Graph API +// ---------------------------------------------------------------------------- + +#define cudaGraph_t hipGraph_t +#define cudaGraphExec_t hipGraphExec_t +#define cudaStreamBeginCapture hipStreamBeginCapture +#define cudaStreamEndCapture hipStreamEndCapture +#define cudaStreamCaptureModeGlobal hipStreamCaptureModeGlobal +#define cudaGraphInstantiate hipGraphInstantiate +#define cudaGraphDestroy hipGraphDestroy +#define cudaGraphLaunch hipGraphLaunch +#define cudaGraphExecDestroy hipGraphExecDestroy + +// ---------------------------------------------------------------------------- +// cuBLAS -> hipBLAS +// ---------------------------------------------------------------------------- + +#define cublasHandle_t hipblasHandle_t +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasSetStream hipblasSetStream +#define cublasSetPointerMode hipblasSetPointerMode +#define cublasStatus_t hipblasStatus_t +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_POINTER_MODE_HOST HIPBLAS_POINTER_MODE_HOST + +// cuBLAS functions +#define cublasDnrm2 hipblasDnrm2 +#define cublasDdot hipblasDdot +#define cublasDscal hipblasDscal +#define cublasDaxpy hipblasDaxpy +#define cublasIdamax hipblasIdamax + +// cublasDnrm2_v2_64 maps to hipblasDnrm2 (hipBLAS uses 32-bit by default, +// but LP problem sizes should fit; use hipblasDnrm2_64 if needed) +#define cublasDnrm2_v2_64 hipblasDnrm2 + +// Error name helper +static inline const char *cublasGetStatusName(hipblasStatus_t status) +{ + return hipblasStatusToString(status); +} + +// ---------------------------------------------------------------------------- +// cuSPARSE -> hipSPARSE +// ---------------------------------------------------------------------------- + +#define cusparseHandle_t hipsparseHandle_t +#define cusparseCreate hipsparseCreate +#define cusparseDestroy hipsparseDestroy +#define cusparseSetStream hipsparseSetStream +#define cusparseStatus_t hipsparseStatus_t +#define CUSPARSE_STATUS_SUCCESS HIPSPARSE_STATUS_SUCCESS +#define cusparseGetErrorName hipsparseGetErrorName + +// Sparse matrix and dense vector descriptors +#define cusparseSpMatDescr_t hipsparseSpMatDescr_t +#define cusparseDnVecDescr_t hipsparseDnVecDescr_t + +// Index types and base +#define CUSPARSE_INDEX_32I HIPSPARSE_INDEX_32I +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO + +// Operations +#define CUSPARSE_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE +#define CUSPARSE_ACTION_NUMERIC HIPSPARSE_ACTION_NUMERIC +#define CUSPARSE_CSR2CSC_ALG_DEFAULT HIPSPARSE_CSR2CSC_ALG_DEFAULT +#define CUSPARSE_SPMV_CSR_ALG2 HIPSPARSE_SPMV_CSR_ALG2 + +// Functions +#define cusparseCreateCsr hipsparseCreateCsr +#define cusparseDestroySpMat hipsparseDestroySpMat +#define cusparseCreateDnVec hipsparseCreateDnVec +#define cusparseDestroyDnVec hipsparseDestroyDnVec +#define cusparseDnVecSetValues hipsparseDnVecSetValues +#define cusparseSpMV hipsparseSpMV +#define cusparseSpMV_bufferSize hipsparseSpMV_bufferSize +#define cusparseSpMV_preprocess hipsparseSpMV_preprocess +#define cusparseCsr2cscEx2 hipsparseCsr2cscEx2 +#define cusparseCsr2cscEx2_bufferSize hipsparseCsr2cscEx2_bufferSize + +// ---------------------------------------------------------------------------- +// Data types +// ---------------------------------------------------------------------------- + +#define CUDA_R_64F HIP_R_64F + +// ---------------------------------------------------------------------------- +// CUB -> hipCUB (C++ only) +// ---------------------------------------------------------------------------- + +#ifdef __cplusplus +namespace cub = hipcub; +#endif + +#else // CUDA build + +// Standard CUDA headers +#include +#include +#include +#include + +#endif // USE_HIP diff --git a/internal/cusparse_compat.h b/internal/cusparse_compat.h index 9eaecb2..c84aeca 100644 --- a/internal/cusparse_compat.h +++ b/internal/cusparse_compat.h @@ -1,5 +1,17 @@ #pragma once +// On HIP builds, cuda_to_hip.h handles the cusparse -> hipsparse mapping. +// Include it first so the defines are active when we check for SpMVOp. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +// hipSPARSE does not have SpMVOp; force the standard SpMV path. +#define CUPDLPX_HAS_SPMVOP 0 + +// Provide fallback typedefs for compilation (never used at runtime on HIP). +typedef void *cusparseSpMVOpDescr_t; +typedef void *cusparseSpMVOpPlan_t; + +#else // CUDA build + #include // cusparseSpMVOp_bufferSize was introduced in cuSPARSE 12.7.3 (CUDA 13.1 Update 1). @@ -19,3 +31,5 @@ typedef void *cusparseSpMVOpDescr_t; typedef void *cusparseSpMVOpPlan_t; #endif #endif + +#endif // USE_HIP diff --git a/internal/internal_types.h b/internal/internal_types.h index 1302430..6786ea2 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -17,9 +17,17 @@ limitations under the License. #pragma once #include "cupdlpx_types.h" + +// Include cuda_to_hip.h first to map CUDA -> HIP symbols when building for ROCm. +#include "cuda_to_hip.h" #include "cusparse_compat.h" + +// On CUDA, include the standard headers; on HIP, cuda_to_hip.h already included them. +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include +#endif + #include #include diff --git a/internal/utils.h b/internal/utils.h index c31fd62..0293e78 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -16,11 +16,18 @@ limitations under the License. #pragma once +// Include cuda_to_hip.h first to map CUDA -> HIP symbols when building for ROCm. +// It must come before any CUDA headers. +#include "cuda_to_hip.h" #include "cusparse_compat.h" #include "internal_types.h" + +// On CUDA, include the standard headers; on HIP, cuda_to_hip.h already included them. +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include #include +#endif #include #include #include diff --git a/src/feasibility_polish.cu b/src/feasibility_polish.cu index 879d241..d1304d3 100644 --- a/src/feasibility_polish.cu +++ b/src/feasibility_polish.cu @@ -16,9 +16,13 @@ limitations under the License. #include "feasibility_polish.h" #include "utils.h" + +// CUDA/HIP headers - cuda_to_hip.h in utils.h handles the mapping +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include #include +#endif #include #include #include diff --git a/src/preconditioner.cu b/src/preconditioner.cu index 8883c8c..3e41d3b 100644 --- a/src/preconditioner.cu +++ b/src/preconditioner.cu @@ -16,9 +16,15 @@ limitations under the License. #include "preconditioner.h" #include "utils.h" + +// CUB / hipCUB - cuda_to_hip.h aliases cub namespace to hipcub on HIP builds +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#include +#else #include #include #include +#endif #include #include #include diff --git a/src/solver.cu b/src/solver.cu index 7ad9301..8ab80ff 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -21,9 +21,13 @@ limitations under the License. #include "presolve.h" #include "solver.h" #include "utils.h" + +// CUDA/HIP headers - cuda_to_hip.h in utils.h handles the mapping +#if !defined(USE_HIP) && !defined(__HIP_PLATFORM_AMD__) #include #include #include +#endif #include #include #include diff --git a/test/test_interface.c b/test/test_interface.c index 5a23077..52a9512 100644 --- a/test/test_interface.c +++ b/test/test_interface.c @@ -120,7 +120,6 @@ int main() A_dense.m = m; A_dense.n = n; A_dense.fmt = matrix_dense; - A_dense.zero_tolerance = 0.0; A_dense.data.dense.A = &A[0][0]; // A as a CSR matrix @@ -133,7 +132,6 @@ int main() A_csr.m = m; A_csr.n = n; A_csr.fmt = matrix_csr; - A_csr.zero_tolerance = 0.0; A_csr.data.csr.nnz = 5; A_csr.data.csr.row_ptr = csr_row_ptr; A_csr.data.csr.col_ind = csr_col_ind; @@ -149,7 +147,6 @@ int main() A_csc.m = m; A_csc.n = n; A_csc.fmt = matrix_csc; - A_csc.zero_tolerance = 0.0; A_csc.data.csc.nnz = 5; A_csc.data.csc.col_ptr = csc_col_ptr; A_csc.data.csc.row_ind = csc_row_ind; @@ -165,7 +162,6 @@ int main() A_coo.m = m; A_coo.n = n; A_coo.fmt = matrix_coo; - A_coo.zero_tolerance = 0.0; A_coo.data.coo.nnz = 5; A_coo.data.coo.row_ind = coo_row_ind; A_coo.data.coo.col_ind = coo_col_ind; @@ -226,5 +222,30 @@ int main() test_warm_start("Test 7: CSC Matrix", &A_csc, c, l, u); test_warm_start("Test 8: COO Matrix", &A_coo, c, l, u); + // Test 9: GPU solver path (presolve disabled) -- forces hipBLAS/hipSPARSE execution + printf("\n=== Test 9: CSR Matrix (presolve disabled, GPU solver) ===\n"); + { + lp_problem_t *prob9 = create_lp_problem(c, &A_csr, l, u, NULL, NULL, NULL); + if (!prob9) + { + fprintf(stderr, "[test] create_lp_problem failed for Test 9.\n"); + return 1; + } + pdhg_parameters_t params9; + set_default_parameters(¶ms9); + params9.presolve = false; + params9.verbose = true; + cupdlpx_result_t *res9 = solve_lp_problem(prob9, ¶ms9); + lp_problem_free(prob9); + if (!res9) + { + fprintf(stderr, "[test] solve_lp_problem failed for Test 9.\n"); + return 1; + } + print_vec("x", res9->primal_solution, res9->num_variables); + print_vec("y", res9->dual_solution, res9->num_constraints); + cupdlpx_result_free(res9); + } + return 0; }