From 310ed03ec19316a47fa67f5b2873e3774bf4c43a Mon Sep 17 00:00:00 2001 From: Lhongpei <1453244320@qq.com> Date: Mon, 17 Nov 2025 02:58:01 +0000 Subject: [PATCH 1/4] Fused kernel for optimizing efficiency on sparse cases --- internal/internal_types.h | 11 ++ internal/utils.h | 4 + src/solver.cu | 219 ++++++++++++++++++++++++++++++++++++++ src/utils.cu | 24 +++++ 4 files changed, 258 insertions(+) diff --git a/internal/internal_types.h b/internal/internal_types.h index e66600d..20fe8b5 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -31,6 +31,12 @@ typedef struct double *val; } cu_sparse_matrix_csr_t; +typedef enum +{ + CUSPARSE_UPDATE, + FUSED_UPDATE, +} pdhg_update_algorithm_t; + typedef struct { int num_variables; @@ -122,6 +128,9 @@ typedef struct double *ones_primal_d; double *ones_dual_d; + + pdhg_update_algorithm_t primal_update_algorithm; + pdhg_update_algorithm_t dual_update_algorithm; } pdhg_solver_state_t; typedef struct @@ -133,3 +142,5 @@ typedef struct double obj_vec_rescale; double rescaling_time_sec; } rescale_info_t; + + diff --git a/internal/utils.h b/internal/utils.h index 62da072..f84466c 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -125,6 +125,10 @@ extern "C" int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, double **vals, int *nnz_out); + int calculate_max_nnz_row(int m, const int* matA_row_ptr); + + int calculate_max_nnz_col(int n, const int* matAt_row_ptr); + #ifdef __cplusplus } diff --git a/src/solver.cu b/src/solver.cu index 2880e9a..6fd1fa3 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -60,7 +60,10 @@ __global__ void compute_delta_solution_kernel( const double *initial_primal, const double *pdhg_primal, double *delta_primal, const double *initial_dual, const double *pdhg_dual, double *delta_dual, int n_vars, int n_cons); + +static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state); +static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state); static void halpern_update(pdhg_solver_state_t *state, double reflection_coefficient); @@ -78,6 +81,8 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state); void lp_problem_free(lp_problem_t *prob); void pdhg_solver_state_free(pdhg_solver_state_t *state); void rescale_info_free(rescale_info_t *info); +static void decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params); cupdlpx_result_t *optimize(const pdhg_parameters_t *params, const lp_problem_t *original_problem) @@ -87,6 +92,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, pdhg_solver_state_t *state = initialize_solver_state(original_problem, rescale_info); + decide_fused_update_usage(state, params); rescale_info_free(rescale_info); initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); @@ -575,6 +581,11 @@ __global__ void compute_delta_solution_kernel( static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) { + if (state->dual_update_algorithm == FUSED_UPDATE) + { + fused_compute_next_pdhg_primal_solution(state); + return; + } CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol, state->current_dual_solution)); CUSPARSE_CHECK( @@ -612,6 +623,11 @@ static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) { + if (state->primal_update_algorithm == FUSED_UPDATE) + { + fused_compute_next_pdhg_dual_solution(state); + return; + } CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol, state->reflected_primal_solution)); CUSPARSE_CHECK( @@ -951,4 +967,207 @@ void set_default_parameters(pdhg_parameters_t *params) params->restart_params.k_i = 0.01; params->restart_params.k_d = 0.0; params->restart_params.i_smooth = 0.3; +} + +__global__ void fused_compute_next_pdhg_primal_solution_kernel( + const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, + double *dual_solution, double *dual_product, + double *current_primal, double *reflected_primal, + const double *objective, const double *var_lb, const double *var_ub, double step_size, + int n_vars) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_vars) + { + //Compute dual product + double sum = 0.0; + int row_start = matAt_row_ptr[i]; + int row_end = matAt_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) + { + int col = matAt_col_ind[j]; + double val = matAt_val[j]; + sum += val * dual_solution[col]; + } + double dual_prod = sum; + //Compute PDHG primal solution + double temp = current_primal[i] - step_size * (objective[i] - dual_prod); + double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); + reflected_primal[i] = 2.0 * temp_proj - current_primal[i]; + dual_product[i] = dual_prod; + } + return; +} + +__global__ void fused_compute_next_pdhg_primal_solution_major_kernel( + const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, + double *dual_solution, double *dual_product, + double *current_primal, double *reflected_primal, double *pdhg_primal, double *dual_slack, + const double *objective, const double *var_lb, const double *var_ub, double step_size, + int n_vars) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_vars) + { + //Compute dual product + double sum = 0.0; + int row_start = matAt_row_ptr[i]; + int row_end = matAt_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) + { + int col = matAt_col_ind[j]; + double val = matAt_val[j]; + sum += val * dual_solution[col]; + } + double dual_prod = sum; + //Compute PDHG primal solution + double temp = current_primal[i] - step_size * (objective[i] - dual_prod); + double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); + reflected_primal[i] = 2.0 * temp_proj - current_primal[i]; + dual_slack[i] = (temp_proj - temp) / step_size; + pdhg_primal[i] = temp_proj; + dual_product[i] = dual_prod; + } + return; +} + +__global__ void fused_compute_next_pdhg_dual_solution_kernel( + const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, + double *primal_solution, double *primal_product, + double *current_dual, double *reflected_dual, + const double *const_lb, const double *const_ub, double step_size, + int n_cons) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_cons) + { + //Compute primal product + double sum = 0.0; + int row_start = matA_row_ptr[i]; + int row_end = matA_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) + { + int col = matA_col_ind[j]; + double val = matA_val[j]; + sum += val * primal_solution[col]; + } + double primal_prod = sum; + //Compute PDHG dual solution + double temp = current_dual[i] / step_size - primal_prod; + double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); + reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; + primal_product[i] = primal_prod; + } + return; +} + +__global__ void fused_compute_next_pdhg_dual_major_solution( + const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, + double *primal_solution, double *primal_product, + double *current_dual, double *reflected_dual, double *pdhg_dual, + const double *const_lb, const double *const_ub, double step_size, + int n_cons) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_cons) + { + //Compute primal product + double sum = 0.0; + int row_start = matA_row_ptr[i]; + int row_end = matA_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) + { + int col = matA_col_ind[j]; + double val = matA_val[j]; + sum += val * primal_solution[col]; + } + double primal_prod = sum; + //Compute PDHG dual solution + double temp = current_dual[i] / step_size - primal_prod; + double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); + pdhg_dual[i] = (temp - temp_proj) * step_size; + reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i]; + primal_product[i] = primal_prod; + } + return; +} + +static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) +{ + double step_size = state->step_size / state->primal_weight; + + if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0) + { + fused_compute_next_pdhg_primal_solution_major_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, + state->current_dual_solution, state->dual_product, + state->current_primal_solution, state->reflected_primal_solution, + state->pdhg_primal_solution, state->dual_slack, + state->objective_vector, state->variable_lower_bound, + state->variable_upper_bound, step_size, + state->num_variables); + } + else + { + fused_compute_next_pdhg_primal_solution_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( + state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val, + state->current_dual_solution, state->dual_product, + state->current_primal_solution, state->reflected_primal_solution, + state->objective_vector, state->variable_lower_bound, + state->variable_upper_bound, step_size, + state->num_variables); + } +} + +static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) +{ + double step_size = state->step_size * state->primal_weight; + + if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0) + { + fused_compute_next_pdhg_dual_major_solution<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, + state->reflected_primal_solution, state->primal_product, + state->current_dual_solution, state->reflected_dual_solution, state->pdhg_dual_solution, + state->constraint_lower_bound, state->constraint_upper_bound, + step_size, + state->num_constraints); + } + else + { + fused_compute_next_pdhg_dual_solution_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( + state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, + state->reflected_primal_solution, state->primal_product, + state->current_dual_solution, state->reflected_dual_solution, + state->constraint_lower_bound, state->constraint_upper_bound, + step_size, + state->num_constraints); + } +} + +static void decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) +{ + // Heuristic to decide whether to use fused kernels or not + // Currently, we use fused kernels when the number of non-zeros is less than a threshold + int n_cons = state->num_constraints; + int n_vars = state->num_variables; + int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr); + int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr); + int fusion_nnz_threshold = 100; + if (max_nnz_A_row > fusion_nnz_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE; + else state->dual_update_algorithm = FUSED_UPDATE; + if (max_nnz_At_row > fusion_nnz_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE; + else state->primal_update_algorithm = FUSED_UPDATE; + if (params->verbose) + { + if (state->primal_update_algorithm == FUSED_UPDATE) + printf("Using fused primal update kernel.\n"); + else + printf("Using cuSPARSE primal update kernel.\n"); + if (state->dual_update_algorithm == FUSED_UPDATE) + printf("Using fused dual update kernel.\n"); + else + printf("Using cuSPARSE dual update kernel.\n"); + } } \ No newline at end of file diff --git a/src/utils.cu b/src/utils.cu index 83ba194..a22929c 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -961,3 +961,27 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind, *nnz_out = nnz; return 0; } + +int calculate_max_nnz_row(int m, const int* matA_row_ptr) { + int max_nnz = 0; + int* host_ptr = (int*)safe_malloc((size_t)(m + 1) * sizeof(int)); + CUDA_CHECK(cudaMemcpy(host_ptr, matA_row_ptr, (size_t)(m + 1) * sizeof(int), cudaMemcpyDeviceToHost)); + for (int i = 0; i < m; ++i) { + int row_nnz = host_ptr[i + 1] - host_ptr[i]; + if (row_nnz > max_nnz) max_nnz = row_nnz; + } + free(host_ptr); + return max_nnz; +} + +int calculate_max_nnz_col(int n, const int* matAt_row_ptr) { + int max_nnz = 0; + int* host_ptr = (int*)safe_malloc((size_t)(n + 1) * sizeof(int)); + CUDA_CHECK(cudaMemcpy(host_ptr, matAt_row_ptr, (size_t)(n + 1) * sizeof(int), cudaMemcpyDeviceToHost)); + for (int j = 0; j < n; ++j) { + int col_nnz = host_ptr[j + 1] - host_ptr[j]; + if (col_nnz > max_nnz) max_nnz = col_nnz; + } + free(host_ptr); + return max_nnz; +} \ No newline at end of file From 0e9b04498be423021d0566c3cf265f0e0b168808 Mon Sep 17 00:00:00 2001 From: Lhongpei <1453244320@qq.com> Date: Mon, 17 Nov 2025 14:43:01 +0000 Subject: [PATCH 2/4] Improve kernel efficiency --- src/solver.cu | 119 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 38 deletions(-) diff --git a/src/solver.cu b/src/solver.cu index 6fd1fa3..2454ef0 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -581,7 +581,7 @@ __global__ void compute_delta_solution_kernel( static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) { - if (state->dual_update_algorithm == FUSED_UPDATE) + if (state->primal_update_algorithm == FUSED_UPDATE) { fused_compute_next_pdhg_primal_solution(state); return; @@ -623,7 +623,7 @@ static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) { - if (state->primal_update_algorithm == FUSED_UPDATE) + if (state->dual_update_algorithm == FUSED_UPDATE) { fused_compute_next_pdhg_dual_solution(state); return; @@ -970,10 +970,18 @@ void set_default_parameters(pdhg_parameters_t *params) } __global__ void fused_compute_next_pdhg_primal_solution_kernel( - const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, - double *dual_solution, double *dual_product, - double *current_primal, double *reflected_primal, - const double *objective, const double *var_lb, const double *var_ub, double step_size, + const int * __restrict__ matAt_row_ptr, + const int * __restrict__ matAt_col_ind, + const double * __restrict__ matAt_val, + const double * __restrict__ dual_solution, + double * __restrict__ dual_product, + const double * __restrict__ current_primal, + double * __restrict__ reflected_primal, + const double * __restrict__ objective, + const double * __restrict__ var_lb, + const double * __restrict__ var_ub, + double step_size, + double inv_step_size, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -987,9 +995,10 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel( { int col = matAt_col_ind[j]; double val = matAt_val[j]; - sum += val * dual_solution[col]; + sum += val * __ldg(&dual_solution[col]); } double dual_prod = sum; + //Compute PDHG primal solution double temp = current_primal[i] - step_size * (objective[i] - dual_prod); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); @@ -1000,10 +1009,20 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel( } __global__ void fused_compute_next_pdhg_primal_solution_major_kernel( - const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, - double *dual_solution, double *dual_product, - double *current_primal, double *reflected_primal, double *pdhg_primal, double *dual_slack, - const double *objective, const double *var_lb, const double *var_ub, double step_size, + const int * __restrict__ matAt_row_ptr, + const int * __restrict__ matAt_col_ind, + const double * __restrict__ matAt_val, + const double * __restrict__ dual_solution, + double * __restrict__ dual_product, + const double * __restrict__ current_primal, + double * __restrict__ reflected_primal, + double * __restrict__ pdhg_primal, + double * __restrict__ dual_slack, + const double * __restrict__ objective, + const double * __restrict__ var_lb, + const double * __restrict__ var_ub, + double step_size, + double inv_step_size, int n_vars) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -1017,25 +1036,36 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel( { int col = matAt_col_ind[j]; double val = matAt_val[j]; - sum += val * dual_solution[col]; + sum += val * __ldg(&dual_solution[col]); } double dual_prod = sum; + //Compute PDHG primal solution double temp = current_primal[i] - step_size * (objective[i] - dual_prod); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); reflected_primal[i] = 2.0 * temp_proj - current_primal[i]; - dual_slack[i] = (temp_proj - temp) / step_size; + + dual_slack[i] = (temp_proj - temp) * inv_step_size; + pdhg_primal[i] = temp_proj; dual_product[i] = dual_prod; } return; } -__global__ void fused_compute_next_pdhg_dual_solution_kernel( - const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, - double *primal_solution, double *primal_product, - double *current_dual, double *reflected_dual, - const double *const_lb, const double *const_ub, double step_size, +__global__ void fused_compute_next_pdhg_dual_solution_major_kernel( + const int * __restrict__ matA_row_ptr, + const int * __restrict__ matA_col_ind, + const double * __restrict__ matA_val, + const double * __restrict__ primal_solution, + double * __restrict__ primal_product, + const double * __restrict__ current_dual, + double * __restrict__ reflected_dual, + double * __restrict__ pdhg_dual, + const double * __restrict__ const_lb, + const double * __restrict__ const_ub, + double step_size, + double inv_step_size, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -1049,23 +1079,31 @@ __global__ void fused_compute_next_pdhg_dual_solution_kernel( { int col = matA_col_ind[j]; double val = matA_val[j]; - sum += val * primal_solution[col]; + sum += val * __ldg(&primal_solution[col]); } double primal_prod = sum; //Compute PDHG dual solution - double temp = current_dual[i] / step_size - primal_prod; + double temp = current_dual[i]* inv_step_size - primal_prod; double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); - reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; + pdhg_dual[i] = (temp - temp_proj) * step_size; + reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i]; primal_product[i] = primal_prod; } return; } -__global__ void fused_compute_next_pdhg_dual_major_solution( - const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, - double *primal_solution, double *primal_product, - double *current_dual, double *reflected_dual, double *pdhg_dual, - const double *const_lb, const double *const_ub, double step_size, +__global__ void fused_compute_next_pdhg_dual_solution_kernel( + const int * __restrict__ matA_row_ptr, + const int * __restrict__ matA_col_ind, + const double * __restrict__ matA_val, + const double * __restrict__ primal_solution, + double * __restrict__ primal_product, + const double * __restrict__ current_dual, + double * __restrict__ reflected_dual, + const double * __restrict__ const_lb, + const double * __restrict__ const_ub, + double step_size, + double inv_step_size, int n_cons) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -1079,14 +1117,13 @@ __global__ void fused_compute_next_pdhg_dual_major_solution( { int col = matA_col_ind[j]; double val = matA_val[j]; - sum += val * primal_solution[col]; + sum += val * __ldg(&primal_solution[col]); } double primal_prod = sum; //Compute PDHG dual solution - double temp = current_dual[i] / step_size - primal_prod; + double temp = current_dual[i] * inv_step_size - primal_prod; double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); - pdhg_dual[i] = (temp - temp_proj) * step_size; - reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i]; + reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; primal_product[i] = primal_prod; } return; @@ -1095,7 +1132,7 @@ __global__ void fused_compute_next_pdhg_dual_major_solution( static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) { double step_size = state->step_size / state->primal_weight; - + double inv_step_size = 1.0 / step_size; if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0) { fused_compute_next_pdhg_primal_solution_major_kernel<<num_blocks_primal, THREADS_PER_BLOCK>>>( @@ -1104,7 +1141,7 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) state->current_primal_solution, state->reflected_primal_solution, state->pdhg_primal_solution, state->dual_slack, state->objective_vector, state->variable_lower_bound, - state->variable_upper_bound, step_size, + state->variable_upper_bound, step_size, inv_step_size, state->num_variables); } else @@ -1114,7 +1151,7 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) state->current_dual_solution, state->dual_product, state->current_primal_solution, state->reflected_primal_solution, state->objective_vector, state->variable_lower_bound, - state->variable_upper_bound, step_size, + state->variable_upper_bound, step_size, inv_step_size, state->num_variables); } } @@ -1122,15 +1159,16 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state) static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) { double step_size = state->step_size * state->primal_weight; + double inv_step_size = 1.0 / step_size; if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0) { - fused_compute_next_pdhg_dual_major_solution<<num_blocks_dual, THREADS_PER_BLOCK>>>( + fused_compute_next_pdhg_dual_solution_major_kernel<<num_blocks_dual, THREADS_PER_BLOCK>>>( state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val, state->reflected_primal_solution, state->primal_product, state->current_dual_solution, state->reflected_dual_solution, state->pdhg_dual_solution, state->constraint_lower_bound, state->constraint_upper_bound, - step_size, + step_size, inv_step_size, state->num_constraints); } else @@ -1140,7 +1178,7 @@ static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) state->reflected_primal_solution, state->primal_product, state->current_dual_solution, state->reflected_dual_solution, state->constraint_lower_bound, state->constraint_upper_bound, - step_size, + step_size, inv_step_size, state->num_constraints); } } @@ -1155,9 +1193,14 @@ static void decide_fused_update_usage(pdhg_solver_state_t *state, int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr); int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr); int fusion_nnz_threshold = 100; - if (max_nnz_A_row > fusion_nnz_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE; + double fusion_density_threshold = 0.01; + int primal_threshold = fmin(fusion_nnz_threshold, + (int)(fusion_density_threshold * n_cons)); + int dual_threshold = fmin(fusion_nnz_threshold, + (int)(fusion_density_threshold * n_vars)); + if (max_nnz_A_row > dual_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE; else state->dual_update_algorithm = FUSED_UPDATE; - if (max_nnz_At_row > fusion_nnz_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE; + if (max_nnz_At_row > primal_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE; else state->primal_update_algorithm = FUSED_UPDATE; if (params->verbose) { From 99b6da5b1833039ec1aaf3ec453e9bef41341340 Mon Sep 17 00:00:00 2001 From: Lhongpei <1453244320@qq.com> Date: Thu, 4 Dec 2025 01:54:26 +0000 Subject: [PATCH 3/4] New Selecting Method: select update algorithm through benchmarking --- include/cupdlpx_types.h | 9 ++ internal/internal_types.h | 1 + internal/utils.h | 2 + src/solver.cu | 183 ++++++++++++++++++++++++++++++++++---- src/utils.cu | 7 ++ 5 files changed, 185 insertions(+), 17 deletions(-) diff --git a/include/cupdlpx_types.h b/include/cupdlpx_types.h index d17a939..211d64a 100644 --- a/include/cupdlpx_types.h +++ b/include/cupdlpx_types.h @@ -77,6 +77,14 @@ extern "C" int iteration_limit; } termination_criteria_t; + + typedef enum { + PDHG_TUNING_HEURISTIC, + PDHG_TUNING_BENCHMARK, + PDHG_CUSPARSE_FIX, + PDHG_FUSED_FIX, + } pdhg_tuning_method_t; + typedef struct { int l_inf_ruiz_iterations; @@ -89,6 +97,7 @@ extern "C" restart_parameters_t restart_params; double reflection_coefficient; bool feasibility_polishing; + pdhg_tuning_method_t tuning_method; } pdhg_parameters_t; typedef struct diff --git a/internal/internal_types.h b/internal/internal_types.h index 9d4b0ec..ebe3239 100644 --- a/internal/internal_types.h +++ b/internal/internal_types.h @@ -37,6 +37,7 @@ typedef enum FUSED_UPDATE, } pdhg_update_algorithm_t; + typedef struct { int num_variables; diff --git a/internal/utils.h b/internal/utils.h index 34fc559..f9a140f 100644 --- a/internal/utils.h +++ b/internal/utils.h @@ -104,6 +104,8 @@ extern "C" bool verbose, termination_reason_t termination_reason); + void print_table_head(const pdhg_parameters_t *params); + void display_iteration_stats(const pdhg_solver_state_t *solver_state, bool verbose); const char *termination_reason_to_string(termination_reason_t reason); diff --git a/src/solver.cu b/src/solver.cu index 89b4302..7d0a031 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -111,6 +111,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params, initialize_step_size_and_primal_weight(state, params); clock_t start_time = clock(); bool do_restart = false; + print_table_head(params); while (state->termination_reason == TERMINATION_REASON_UNSPECIFIED) { if ((state->is_this_major_iteration || state->total_count == 0) || @@ -992,6 +993,8 @@ void set_default_parameters(pdhg_parameters_t *params) params->restart_params.k_i = 0.01; params->restart_params.k_d = 0.0; params->restart_params.i_smooth = 0.3; + params->feasibility_polishing = false; + params->tuning_method = PDHG_TUNING_BENCHMARK; } __global__ void fused_compute_next_pdhg_primal_solution_kernel( @@ -1113,6 +1116,9 @@ __global__ void fused_compute_next_pdhg_dual_solution_major_kernel( pdhg_dual[i] = (temp - temp_proj) * step_size; reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i]; primal_product[i] = primal_prod; + } + return; +} //Feasibility Polishing void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state) { @@ -1259,6 +1265,10 @@ __global__ void fused_compute_next_pdhg_dual_solution_kernel( double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; primal_product[i] = primal_prod; + } + return; +} + void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state) { print_initial_feas_polish_info(false, params); @@ -1359,37 +1369,176 @@ static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state) } } -static void decide_fused_update_usage(pdhg_solver_state_t *state, - const pdhg_parameters_t *params) +static void fix_cusparse_decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) +{ + state->primal_update_algorithm = CUSPARSE_UPDATE; + state->dual_update_algorithm = CUSPARSE_UPDATE; + + if (params->verbose) { + printf("[Auto-Tuning] Strategy: FIXED CUSPARSE\n"); + printf(" Primal: cuSPARSE (Forced)\n"); + printf(" Dual : cuSPARSE (Forced)\n"); + } +} + +static void fix_fused_decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) +{ + state->primal_update_algorithm = FUSED_UPDATE; + state->dual_update_algorithm = FUSED_UPDATE; + + if (params->verbose) { + printf("[Auto-Tuning] Strategy: FIXED FUSED\n"); + printf(" Primal: FUSED (Forced)\n"); + printf(" Dual : FUSED (Forced)\n"); + } +} + +static void reset_solver_state_after_benchmark(pdhg_solver_state_t *state) { + + CUDA_CHECK(cudaMemcpy(state->current_primal_solution, state->initial_primal_solution, + state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution, state->initial_primal_solution, + state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->reflected_primal_solution, state->initial_primal_solution, + state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice)); + + CUDA_CHECK(cudaMemcpy(state->current_dual_solution, state->initial_dual_solution, + state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution, state->initial_dual_solution, + state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(state->reflected_dual_solution, state->initial_dual_solution, + state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice)); +} + + +static void heur_decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) { - // Heuristic to decide whether to use fused kernels or not - // Currently, we use fused kernels when the number of non-zeros is less than a threshold + // Heuristic: based on Non-Zeros (NNZ) and Density int n_cons = state->num_constraints; int n_vars = state->num_variables; + int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr); int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr); + int fusion_nnz_threshold = 100; double fusion_density_threshold = 0.01; - int primal_threshold = fmin(fusion_nnz_threshold, - (int)(fusion_density_threshold * n_cons)); - int dual_threshold = fmin(fusion_nnz_threshold, - (int)(fusion_density_threshold * n_vars)); + + int primal_threshold = fmin(fusion_nnz_threshold, (int)(fusion_density_threshold * n_cons)); + int dual_threshold = fmin(fusion_nnz_threshold, (int)(fusion_density_threshold * n_vars)); + + // Dual Update Uses Matrix A if (max_nnz_A_row > dual_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE; else state->dual_update_algorithm = FUSED_UPDATE; + + // Primal Update Uses Matrix A^T if (max_nnz_At_row > primal_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE; else state->primal_update_algorithm = FUSED_UPDATE; - if (params->verbose) + + if (params->verbose) { + printf("[Auto-Tuning] Strategy: HEURISTIC\n"); + printf(" Primal: %s (Threshold: %d, MaxNNZ: %d)\n", + (state->primal_update_algorithm == FUSED_UPDATE ? "FUSED" : "cuSPARSE"), primal_threshold, max_nnz_At_row); + printf(" Dual : %s (Threshold: %d, MaxNNZ: %d)\n", + (state->dual_update_algorithm == FUSED_UPDATE ? "FUSED" : "cuSPARSE"), dual_threshold, max_nnz_A_row); + } +} + +static void bench_decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) +{ + if (params->verbose) printf("[Auto-Tuning] Strategy: BENCHMARK (Running tests...)\n"); + + const int WARMUP = 5; + const int ITER = 10; + float t_fused, t_cusparse; + + cudaEvent_t start, stop; + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&stop)); + + + state->primal_update_algorithm = FUSED_UPDATE; + for(int i=0; iprimal_update_algorithm = CUSPARSE_UPDATE; + for(int i=0; iprimal_update_algorithm = FUSED_UPDATE; + if (params->verbose) printf(" Primal: Selected FUSED (%.3f ms) < cuSPARSE (%.3f ms)\n", t_fused, t_cusparse); + } else { + state->primal_update_algorithm = CUSPARSE_UPDATE; + if (params->verbose) printf(" Primal: Selected cuSPARSE (%.3f ms) < FUSED (%.3f ms)\n", t_cusparse, t_fused); + } + + + state->dual_update_algorithm = FUSED_UPDATE; + for(int i=0; idual_update_algorithm = CUSPARSE_UPDATE; + for(int i=0; idual_update_algorithm = FUSED_UPDATE; + if (params->verbose) printf(" Dual : Selected FUSED (%.3f ms) < cuSPARSE (%.3f ms)\n", t_fused, t_cusparse); + } else { + state->dual_update_algorithm = CUSPARSE_UPDATE; + if (params->verbose) printf(" Dual : Selected cuSPARSE (%.3f ms) < FUSED (%.3f ms)\n", t_cusparse, t_fused); + } + + CUDA_CHECK(cudaEventDestroy(start)); + CUDA_CHECK(cudaEventDestroy(stop)); + + reset_solver_state_after_benchmark(state); +} + +static void decide_fused_update_usage(pdhg_solver_state_t *state, + const pdhg_parameters_t *params) +{ + switch (params->tuning_method) { - if (state->primal_update_algorithm == FUSED_UPDATE) - printf("Using fused primal update kernel.\n"); - else - printf("Using cuSPARSE primal update kernel.\n"); - if (state->dual_update_algorithm == FUSED_UPDATE) - printf("Using fused dual update kernel.\n"); - else - printf("Using cuSPARSE dual update kernel.\n"); + case PDHG_TUNING_BENCHMARK: + bench_decide_fused_update_usage(state, params); + break; + + case PDHG_CUSPARSE_FIX: + fix_cusparse_decide_fused_update_usage(state, params); + break; + + case PDHG_FUSED_FIX: + fix_fused_decide_fused_update_usage(state, params); + break; + + case PDHG_TUNING_HEURISTIC: + default: + heur_decide_fused_update_usage(state, params); + break; } } + static pdhg_solver_state_t *initialize_primal_feas_polish_state( const pdhg_solver_state_t *original_state) { diff --git a/src/utils.cu b/src/utils.cu index 62f6bac..c453374 100644 --- a/src/utils.cu +++ b/src/utils.cu @@ -349,7 +349,14 @@ void print_initial_info(const pdhg_parameters_t *params, params->termination_criteria.eps_feasible_relative); printf(" eps_infeas_detect : %.1e\n", params->termination_criteria.eps_infeasible); +} +void print_table_head(const pdhg_parameters_t *params) +{ + if (!params->verbose) + { + return; + } printf("---------------------------------------------------------------------" "------------------\n"); printf("%s | %s | %s | %s \n", " runtime ", " objective ", From cb01fe4f96727035e180f5def99924b27b458735 Mon Sep 17 00:00:00 2001 From: Lhongpei <1453244320@qq.com> Date: Wed, 10 Dec 2025 05:04:49 +0000 Subject: [PATCH 4/4] Use fma and Kahan summation for numerical stability --- src/solver.cu | 121 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 49 deletions(-) diff --git a/src/solver.cu b/src/solver.cu index 7d0a031..473291c 100644 --- a/src/solver.cu +++ b/src/solver.cu @@ -1015,25 +1015,30 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel( int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { - //Compute dual product double sum = 0.0; + double c = 0.0; + int row_start = matAt_row_ptr[i]; int row_end = matAt_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) { int col = matAt_col_ind[j]; double val = matAt_val[j]; - sum += val * __ldg(&dual_solution[col]); + + double y = fma(val, __ldg(&dual_solution[col]), -c); + + double t = sum + y; + c = (t - sum) - y; + sum = t; } double dual_prod = sum; - //Compute PDHG primal solution double temp = current_primal[i] - step_size * (objective[i] - dual_prod); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); reflected_primal[i] = 2.0 * temp_proj - current_primal[i]; dual_product[i] = dual_prod; } - return; } __global__ void fused_compute_next_pdhg_primal_solution_major_kernel( @@ -1056,19 +1061,25 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel( int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_vars) { - //Compute dual product double sum = 0.0; + double c = 0.0; + int row_start = matAt_row_ptr[i]; int row_end = matAt_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) { int col = matAt_col_ind[j]; double val = matAt_val[j]; - sum += val * __ldg(&dual_solution[col]); + + double y = fma(val, __ldg(&dual_solution[col]), -c); + + double t = sum + y; + c = (t - sum) - y; + sum = t; } double dual_prod = sum; - //Compute PDHG primal solution double temp = current_primal[i] - step_size * (objective[i] - dual_prod); double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i])); reflected_primal[i] = 2.0 * temp_proj - current_primal[i]; @@ -1078,7 +1089,49 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel( pdhg_primal[i] = temp_proj; dual_product[i] = dual_prod; } - return; +} + +__global__ void fused_compute_next_pdhg_dual_solution_kernel( + const int * __restrict__ matA_row_ptr, + const int * __restrict__ matA_col_ind, + const double * __restrict__ matA_val, + const double * __restrict__ primal_solution, + double * __restrict__ primal_product, + const double * __restrict__ current_dual, + double * __restrict__ reflected_dual, + const double * __restrict__ const_lb, + const double * __restrict__ const_ub, + double step_size, + double inv_step_size, + int n_cons) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n_cons) + { + double sum = 0.0; + double c = 0.0; + + int row_start = matA_row_ptr[i]; + int row_end = matA_row_ptr[i + 1]; + + for (int j = row_start; j < row_end; ++j) + { + int col = matA_col_ind[j]; + double val = matA_val[j]; + + double y = fma(val, __ldg(&primal_solution[col]), -c); + + double t = sum + y; + c = (t - sum) - y; + sum = t; + } + double primal_prod = sum; + + double temp = current_dual[i] * inv_step_size - primal_prod; + double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); + reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; + primal_product[i] = primal_prod; + } } __global__ void fused_compute_next_pdhg_dual_solution_major_kernel( @@ -1099,26 +1152,33 @@ __global__ void fused_compute_next_pdhg_dual_solution_major_kernel( int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n_cons) { - //Compute primal product double sum = 0.0; + double c = 0.0; + int row_start = matA_row_ptr[i]; int row_end = matA_row_ptr[i + 1]; + for (int j = row_start; j < row_end; ++j) { int col = matA_col_ind[j]; double val = matA_val[j]; - sum += val * __ldg(&primal_solution[col]); + + double y = fma(val, __ldg(&primal_solution[col]), -c); + + double t = sum + y; + c = (t - sum) - y; + sum = t; } double primal_prod = sum; - //Compute PDHG dual solution + double temp = current_dual[i]* inv_step_size - primal_prod; double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); pdhg_dual[i] = (temp - temp_proj) * step_size; reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i]; primal_product[i] = primal_prod; } - return; } + //Feasibility Polishing void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state) { @@ -1232,43 +1292,6 @@ void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_stat return; } -__global__ void fused_compute_next_pdhg_dual_solution_kernel( - const int * __restrict__ matA_row_ptr, - const int * __restrict__ matA_col_ind, - const double * __restrict__ matA_val, - const double * __restrict__ primal_solution, - double * __restrict__ primal_product, - const double * __restrict__ current_dual, - double * __restrict__ reflected_dual, - const double * __restrict__ const_lb, - const double * __restrict__ const_ub, - double step_size, - double inv_step_size, - int n_cons) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n_cons) - { - //Compute primal product - double sum = 0.0; - int row_start = matA_row_ptr[i]; - int row_end = matA_row_ptr[i + 1]; - for (int j = row_start; j < row_end; ++j) - { - int col = matA_col_ind[j]; - double val = matA_val[j]; - sum += val * __ldg(&primal_solution[col]); - } - double primal_prod = sum; - //Compute PDHG dual solution - double temp = current_dual[i] * inv_step_size - primal_prod; - double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i])); - reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i]; - primal_product[i] = primal_prod; - } - return; -} - void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state) { print_initial_feas_polish_info(false, params);