From 310ed03ec19316a47fa67f5b2873e3774bf4c43a Mon Sep 17 00:00:00 2001
From: Lhongpei <1453244320@qq.com>
Date: Mon, 17 Nov 2025 02:58:01 +0000
Subject: [PATCH 1/4] Fused kernel for optimizing efficiency on sparse cases

---
 internal/internal_types.h |  11 ++
 internal/utils.h          |   4 +
 src/solver.cu             | 219 ++++++++++++++++++++++++++++++++++++++
 src/utils.cu              |  24 +++++
 4 files changed, 258 insertions(+)

diff --git a/internal/internal_types.h b/internal/internal_types.h
index e66600d..20fe8b5 100644
--- a/internal/internal_types.h
+++ b/internal/internal_types.h
@@ -31,6 +31,12 @@ typedef struct
 	double *val;
 } cu_sparse_matrix_csr_t;
 
+typedef enum
+{
+	CUSPARSE_UPDATE,
+	FUSED_UPDATE,
+} pdhg_update_algorithm_t;
+
 typedef struct
 {
 	int num_variables;
@@ -122,6 +128,9 @@ typedef struct
 
 	double *ones_primal_d;
 	double *ones_dual_d;
+
+	pdhg_update_algorithm_t primal_update_algorithm;
+	pdhg_update_algorithm_t dual_update_algorithm;
 } pdhg_solver_state_t;
 
 typedef struct
@@ -133,3 +142,5 @@ typedef struct
 	double obj_vec_rescale;
 	double rescaling_time_sec;
 } rescale_info_t;
+
+
diff --git a/internal/utils.h b/internal/utils.h
index 62da072..f84466c 100644
--- a/internal/utils.h
+++ b/internal/utils.h
@@ -125,6 +125,10 @@ extern "C"
     int coo_to_csr(const matrix_desc_t *desc,
                    int **row_ptr, int **col_ind, double **vals, int *nnz_out);
 
+    int calculate_max_nnz_row(int m, const int* matA_row_ptr);
+
+    int calculate_max_nnz_col(int n, const int* matAt_row_ptr);
+
 #ifdef __cplusplus
 }
 
diff --git a/src/solver.cu b/src/solver.cu
index 2880e9a..6fd1fa3 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -60,7 +60,10 @@ __global__ void compute_delta_solution_kernel(
     const double *initial_primal, const double *pdhg_primal,
     double *delta_primal, const double *initial_dual, const double *pdhg_dual,
     double *delta_dual, int n_vars, int n_cons);
+
+static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state);
 static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state);
+static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state);
 static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state);
 static void halpern_update(pdhg_solver_state_t *state,
                            double reflection_coefficient);
@@ -78,6 +81,8 @@ static void compute_fixed_point_error(pdhg_solver_state_t *state);
 void lp_problem_free(lp_problem_t *prob);
 void pdhg_solver_state_free(pdhg_solver_state_t *state);
 void rescale_info_free(rescale_info_t *info);
+static void decide_fused_update_usage(pdhg_solver_state_t *state,
+                                 const pdhg_parameters_t *params);
 
 cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
                            const lp_problem_t *original_problem)
@@ -87,6 +92,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
     pdhg_solver_state_t *state =
         initialize_solver_state(original_problem, rescale_info);
 
+    decide_fused_update_usage(state, params);
     rescale_info_free(rescale_info);
     initialize_step_size_and_primal_weight(state, params);
     clock_t start_time = clock();
@@ -575,6 +581,11 @@ __global__ void compute_delta_solution_kernel(
 
 static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 {
+    if (state->dual_update_algorithm == FUSED_UPDATE)
+    {
+        fused_compute_next_pdhg_primal_solution(state);
+        return;
+    }
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_dual_sol,
                                           state->current_dual_solution));
     CUSPARSE_CHECK(
@@ -612,6 +623,11 @@ static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 
 static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
 {
+    if (state->primal_update_algorithm == FUSED_UPDATE)
+    {
+        fused_compute_next_pdhg_dual_solution(state);
+        return;
+    }
     CUSPARSE_CHECK(cusparseDnVecSetValues(state->vec_primal_sol,
                                           state->reflected_primal_solution));
     CUSPARSE_CHECK(
@@ -951,4 +967,207 @@ void set_default_parameters(pdhg_parameters_t *params)
     params->restart_params.k_i = 0.01;
     params->restart_params.k_d = 0.0;
     params->restart_params.i_smooth = 0.3;
+}
+
+__global__ void fused_compute_next_pdhg_primal_solution_kernel(
+    const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, 
+    double *dual_solution, double *dual_product,
+    double *current_primal, double *reflected_primal,
+    const double *objective, const double *var_lb, const double *var_ub, double step_size,
+    int n_vars)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_vars)
+    {
+        //Compute dual product
+        double sum = 0.0;
+        int row_start = matAt_row_ptr[i];
+        int row_end = matAt_row_ptr[i + 1];
+        for (int j = row_start; j < row_end; ++j)
+        {
+            int col = matAt_col_ind[j];
+            double val = matAt_val[j];
+            sum += val * dual_solution[col];
+        }
+        double dual_prod = sum;
+        //Compute PDHG primal solution
+        double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
+        double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
+        reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
+        dual_product[i] = dual_prod;
+    }
+    return;
+}
+
+__global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
+    const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, 
+    double *dual_solution, double *dual_product,
+    double *current_primal, double *reflected_primal, double *pdhg_primal, double *dual_slack,
+    const double *objective, const double *var_lb, const double *var_ub, double step_size,
+    int n_vars)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_vars)
+    {
+        //Compute dual product
+        double sum = 0.0;
+        int row_start = matAt_row_ptr[i];
+        int row_end = matAt_row_ptr[i + 1];
+        for (int j = row_start; j < row_end; ++j)
+        {
+            int col = matAt_col_ind[j];
+            double val = matAt_val[j];
+            sum += val * dual_solution[col];
+        }
+        double dual_prod = sum;
+        //Compute PDHG primal solution
+        double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
+        double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
+        reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
+        dual_slack[i] = (temp_proj - temp) / step_size;
+        pdhg_primal[i] = temp_proj;
+        dual_product[i] = dual_prod;
+    }
+    return;
+}
+
+__global__ void fused_compute_next_pdhg_dual_solution_kernel(
+    const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, 
+    double *primal_solution, double *primal_product,
+    double *current_dual, double *reflected_dual,
+    const double *const_lb, const double *const_ub, double step_size,
+    int n_cons)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_cons)
+    {
+        //Compute primal product
+        double sum = 0.0;
+        int row_start = matA_row_ptr[i];
+        int row_end = matA_row_ptr[i + 1];
+        for (int j = row_start; j < row_end; ++j)
+        {
+            int col = matA_col_ind[j];
+            double val = matA_val[j];
+            sum += val * primal_solution[col];
+        }
+        double primal_prod = sum;
+        //Compute PDHG dual solution
+        double temp = current_dual[i] / step_size - primal_prod;
+        double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
+        reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
+        primal_product[i] = primal_prod;
+    }
+    return;
+}
+
+__global__ void fused_compute_next_pdhg_dual_major_solution(
+    const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, 
+    double *primal_solution, double *primal_product,
+    double *current_dual, double *reflected_dual, double *pdhg_dual,
+    const double *const_lb, const double *const_ub, double step_size,
+    int n_cons)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_cons)
+    {
+        //Compute primal product
+        double sum = 0.0;
+        int row_start = matA_row_ptr[i];
+        int row_end = matA_row_ptr[i + 1];
+        for (int j = row_start; j < row_end; ++j)
+        {
+            int col = matA_col_ind[j];
+            double val = matA_val[j];
+            sum += val * primal_solution[col];
+        }
+        double primal_prod = sum;
+        //Compute PDHG dual solution
+        double temp = current_dual[i] / step_size - primal_prod;
+        double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
+        pdhg_dual[i] = (temp - temp_proj) * step_size;
+        reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
+        primal_product[i] = primal_prod;
+    }
+    return;
+}
+
+static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
+{
+    double step_size = state->step_size / state->primal_weight;
+
+    if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0)
+    {
+        fused_compute_next_pdhg_primal_solution_major_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val,
+            state->current_dual_solution, state->dual_product,
+            state->current_primal_solution, state->reflected_primal_solution,
+            state->pdhg_primal_solution, state->dual_slack,
+            state->objective_vector, state->variable_lower_bound,
+            state->variable_upper_bound, step_size,
+            state->num_variables);
+    }
+    else
+    {
+        fused_compute_next_pdhg_primal_solution_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix_t->row_ptr, state->constraint_matrix_t->col_ind, state->constraint_matrix_t->val,
+            state->current_dual_solution,  state->dual_product,
+            state->current_primal_solution, state->reflected_primal_solution,
+            state->objective_vector, state->variable_lower_bound,
+            state->variable_upper_bound, step_size,
+            state->num_variables);
+    }
+}
+
+static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
+{
+    double step_size = state->step_size * state->primal_weight;
+
+    if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0)
+    {
+        fused_compute_next_pdhg_dual_major_solution<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val,
+            state->reflected_primal_solution, state->primal_product,
+            state->current_dual_solution, state->reflected_dual_solution, state->pdhg_dual_solution,
+            state->constraint_lower_bound, state->constraint_upper_bound,
+            step_size,
+            state->num_constraints);
+    }
+    else
+    {
+        fused_compute_next_pdhg_dual_solution_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+            state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val,
+            state->reflected_primal_solution, state->primal_product,
+            state->current_dual_solution, state->reflected_dual_solution,
+            state->constraint_lower_bound, state->constraint_upper_bound,
+            step_size,
+            state->num_constraints);
+    }
+}
+
+static void decide_fused_update_usage(pdhg_solver_state_t *state,
+                                 const pdhg_parameters_t *params)
+{
+    // Heuristic to decide whether to use fused kernels or not
+    // Currently, we use fused kernels when the number of non-zeros is less than a threshold
+    int n_cons = state->num_constraints;
+    int n_vars = state->num_variables;
+    int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr);
+    int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr);
+    int fusion_nnz_threshold = 100;
+    if (max_nnz_A_row > fusion_nnz_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE;
+    else state->dual_update_algorithm = FUSED_UPDATE;
+    if (max_nnz_At_row > fusion_nnz_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE;
+    else state->primal_update_algorithm = FUSED_UPDATE;
+    if (params->verbose)
+    {
+        if (state->primal_update_algorithm == FUSED_UPDATE)
+            printf("Using fused primal update kernel.\n");
+        else
+            printf("Using cuSPARSE primal update kernel.\n");
+        if (state->dual_update_algorithm == FUSED_UPDATE)
+            printf("Using fused dual update kernel.\n");
+        else
+            printf("Using cuSPARSE dual update kernel.\n");
+    }
 }
\ No newline at end of file
diff --git a/src/utils.cu b/src/utils.cu
index 83ba194..a22929c 100644
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -961,3 +961,27 @@ int coo_to_csr(const matrix_desc_t *desc, int **row_ptr, int **col_ind,
     *nnz_out = nnz;
     return 0;
 }
+
+int calculate_max_nnz_row(int m, const int* matA_row_ptr) {
+    int max_nnz = 0;
+    int* host_ptr = (int*)safe_malloc((size_t)(m + 1) * sizeof(int));
+    CUDA_CHECK(cudaMemcpy(host_ptr, matA_row_ptr, (size_t)(m + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+    for (int i = 0; i < m; ++i) {
+        int row_nnz = host_ptr[i + 1] - host_ptr[i];
+        if (row_nnz > max_nnz) max_nnz = row_nnz;
+    }
+    free(host_ptr);
+    return max_nnz;
+}
+
+int calculate_max_nnz_col(int n, const int* matAt_row_ptr) {
+    int max_nnz = 0;
+    int* host_ptr = (int*)safe_malloc((size_t)(n + 1) * sizeof(int));
+    CUDA_CHECK(cudaMemcpy(host_ptr, matAt_row_ptr, (size_t)(n + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+    for (int j = 0; j < n; ++j) {
+        int col_nnz = host_ptr[j + 1] - host_ptr[j];
+        if (col_nnz > max_nnz) max_nnz = col_nnz;
+    }
+    free(host_ptr);
+    return max_nnz;
+}
\ No newline at end of file

From 0e9b04498be423021d0566c3cf265f0e0b168808 Mon Sep 17 00:00:00 2001
From: Lhongpei <1453244320@qq.com>
Date: Mon, 17 Nov 2025 14:43:01 +0000
Subject: [PATCH 2/4] Improve kernel efficiency

---
 src/solver.cu | 119 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 38 deletions(-)

diff --git a/src/solver.cu b/src/solver.cu
index 6fd1fa3..2454ef0 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -581,7 +581,7 @@ __global__ void compute_delta_solution_kernel(
 
 static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 {
-    if (state->dual_update_algorithm == FUSED_UPDATE)
+    if (state->primal_update_algorithm == FUSED_UPDATE)
     {
         fused_compute_next_pdhg_primal_solution(state);
         return;
@@ -623,7 +623,7 @@ static void compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 
 static void compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
 {
-    if (state->primal_update_algorithm == FUSED_UPDATE)
+    if (state->dual_update_algorithm == FUSED_UPDATE)
     {
         fused_compute_next_pdhg_dual_solution(state);
         return;
@@ -970,10 +970,18 @@ void set_default_parameters(pdhg_parameters_t *params)
 }
 
 __global__ void fused_compute_next_pdhg_primal_solution_kernel(
-    const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, 
-    double *dual_solution, double *dual_product,
-    double *current_primal, double *reflected_primal,
-    const double *objective, const double *var_lb, const double *var_ub, double step_size,
+    const int * __restrict__ matAt_row_ptr, 
+    const int * __restrict__ matAt_col_ind, 
+    const double * __restrict__ matAt_val, 
+    const double * __restrict__ dual_solution, 
+    double * __restrict__ dual_product,
+    const double * __restrict__ current_primal, 
+    double * __restrict__ reflected_primal,
+    const double * __restrict__ objective, 
+    const double * __restrict__ var_lb, 
+    const double * __restrict__ var_ub, 
+    double step_size, 
+    double inv_step_size,
     int n_vars)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -987,9 +995,10 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel(
         {
             int col = matAt_col_ind[j];
             double val = matAt_val[j];
-            sum += val * dual_solution[col];
+            sum += val * __ldg(&dual_solution[col]); 
         }
         double dual_prod = sum;
+
         //Compute PDHG primal solution
         double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
@@ -1000,10 +1009,20 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel(
 }
 
 __global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
-    const int *matAt_row_ptr, const int *matAt_col_ind, const double *matAt_val, 
-    double *dual_solution, double *dual_product,
-    double *current_primal, double *reflected_primal, double *pdhg_primal, double *dual_slack,
-    const double *objective, const double *var_lb, const double *var_ub, double step_size,
+    const int * __restrict__ matAt_row_ptr, 
+    const int * __restrict__ matAt_col_ind, 
+    const double * __restrict__ matAt_val, 
+    const double * __restrict__ dual_solution,
+    double * __restrict__ dual_product,
+    const double * __restrict__ current_primal, 
+    double * __restrict__ reflected_primal, 
+    double * __restrict__ pdhg_primal, 
+    double * __restrict__ dual_slack,
+    const double * __restrict__ objective, 
+    const double * __restrict__ var_lb, 
+    const double * __restrict__ var_ub, 
+    double step_size, 
+    double inv_step_size, 
     int n_vars)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1017,25 +1036,36 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
         {
             int col = matAt_col_ind[j];
             double val = matAt_val[j];
-            sum += val * dual_solution[col];
+            sum += val * __ldg(&dual_solution[col]);
         }
         double dual_prod = sum;
+
         //Compute PDHG primal solution
         double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
         reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
-        dual_slack[i] = (temp_proj - temp) / step_size;
+        
+        dual_slack[i] = (temp_proj - temp) * inv_step_size; 
+        
         pdhg_primal[i] = temp_proj;
         dual_product[i] = dual_prod;
     }
     return;
 }
 
-__global__ void fused_compute_next_pdhg_dual_solution_kernel(
-    const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, 
-    double *primal_solution, double *primal_product,
-    double *current_dual, double *reflected_dual,
-    const double *const_lb, const double *const_ub, double step_size,
+__global__ void fused_compute_next_pdhg_dual_solution_major_kernel(
+    const int * __restrict__ matA_row_ptr, 
+    const int * __restrict__ matA_col_ind, 
+    const double * __restrict__ matA_val, 
+    const double * __restrict__ primal_solution,
+    double * __restrict__ primal_product,
+    const double * __restrict__ current_dual, 
+    double * __restrict__ reflected_dual, 
+    double * __restrict__ pdhg_dual,
+    const double * __restrict__ const_lb, 
+    const double * __restrict__ const_ub, 
+    double step_size,    
+    double inv_step_size, 
     int n_cons)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1049,23 +1079,31 @@ __global__ void fused_compute_next_pdhg_dual_solution_kernel(
         {
             int col = matA_col_ind[j];
             double val = matA_val[j];
-            sum += val * primal_solution[col];
+            sum += val * __ldg(&primal_solution[col]);
         }
         double primal_prod = sum;
         //Compute PDHG dual solution
-        double temp = current_dual[i] / step_size - primal_prod;
+        double temp = current_dual[i]* inv_step_size - primal_prod;
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
-        reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
+        pdhg_dual[i] = (temp - temp_proj) * step_size;
+        reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
         primal_product[i] = primal_prod;
     }
     return;
 }
 
-__global__ void fused_compute_next_pdhg_dual_major_solution(
-    const int *matA_row_ptr, const int *matA_col_ind, const double *matA_val, 
-    double *primal_solution, double *primal_product,
-    double *current_dual, double *reflected_dual, double *pdhg_dual,
-    const double *const_lb, const double *const_ub, double step_size,
+__global__ void fused_compute_next_pdhg_dual_solution_kernel(
+    const int * __restrict__ matA_row_ptr, 
+    const int * __restrict__ matA_col_ind, 
+    const double * __restrict__ matA_val, 
+    const double * __restrict__ primal_solution,
+    double * __restrict__ primal_product,
+    const double * __restrict__ current_dual, 
+    double * __restrict__ reflected_dual, 
+    const double * __restrict__ const_lb, 
+    const double * __restrict__ const_ub, 
+    double step_size,    
+    double inv_step_size, 
     int n_cons)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -1079,14 +1117,13 @@ __global__ void fused_compute_next_pdhg_dual_major_solution(
         {
             int col = matA_col_ind[j];
             double val = matA_val[j];
-            sum += val * primal_solution[col];
+            sum += val * __ldg(&primal_solution[col]);
         }
         double primal_prod = sum;
         //Compute PDHG dual solution
-        double temp = current_dual[i] / step_size - primal_prod;
+        double temp = current_dual[i] * inv_step_size - primal_prod;
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
-        pdhg_dual[i] = (temp - temp_proj) * step_size;
-        reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
+        reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
         primal_product[i] = primal_prod;
     }
     return;
@@ -1095,7 +1132,7 @@ __global__ void fused_compute_next_pdhg_dual_major_solution(
 static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 {
     double step_size = state->step_size / state->primal_weight;
-
+    double inv_step_size = 1.0 / step_size;
     if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0)
     {
         fused_compute_next_pdhg_primal_solution_major_kernel<<<state->num_blocks_primal, THREADS_PER_BLOCK>>>(
@@ -1104,7 +1141,7 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
             state->current_primal_solution, state->reflected_primal_solution,
             state->pdhg_primal_solution, state->dual_slack,
             state->objective_vector, state->variable_lower_bound,
-            state->variable_upper_bound, step_size,
+            state->variable_upper_bound, step_size, inv_step_size,
             state->num_variables);
     }
     else
@@ -1114,7 +1151,7 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
             state->current_dual_solution,  state->dual_product,
             state->current_primal_solution, state->reflected_primal_solution,
             state->objective_vector, state->variable_lower_bound,
-            state->variable_upper_bound, step_size,
+            state->variable_upper_bound, step_size, inv_step_size,
             state->num_variables);
     }
 }
@@ -1122,15 +1159,16 @@ static void fused_compute_next_pdhg_primal_solution(pdhg_solver_state_t *state)
 static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
 {
     double step_size = state->step_size * state->primal_weight;
+    double inv_step_size = 1.0 / step_size;
 
     if (state->is_this_major_iteration || ((state->total_count + 2) % get_print_frequency(state->total_count + 2)) == 0)
     {
-        fused_compute_next_pdhg_dual_major_solution<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
+        fused_compute_next_pdhg_dual_solution_major_kernel<<<state->num_blocks_dual, THREADS_PER_BLOCK>>>(
             state->constraint_matrix->row_ptr, state->constraint_matrix->col_ind, state->constraint_matrix->val,
             state->reflected_primal_solution, state->primal_product,
             state->current_dual_solution, state->reflected_dual_solution, state->pdhg_dual_solution,
             state->constraint_lower_bound, state->constraint_upper_bound,
-            step_size,
+            step_size, inv_step_size,
             state->num_constraints);
     }
     else
@@ -1140,7 +1178,7 @@ static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
             state->reflected_primal_solution, state->primal_product,
             state->current_dual_solution, state->reflected_dual_solution,
             state->constraint_lower_bound, state->constraint_upper_bound,
-            step_size,
+            step_size, inv_step_size,
             state->num_constraints);
     }
 }
@@ -1155,9 +1193,14 @@ static void decide_fused_update_usage(pdhg_solver_state_t *state,
     int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr);
     int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr);
     int fusion_nnz_threshold = 100;
-    if (max_nnz_A_row > fusion_nnz_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE;
+    double fusion_density_threshold = 0.01;
+    int primal_threshold = fmin(fusion_nnz_threshold,
+                                      (int)(fusion_density_threshold * n_cons));
+    int dual_threshold = fmin(fusion_nnz_threshold,
+                                    (int)(fusion_density_threshold * n_vars));
+    if (max_nnz_A_row > dual_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE;
     else state->dual_update_algorithm = FUSED_UPDATE;
-    if (max_nnz_At_row > fusion_nnz_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE;
+    if (max_nnz_At_row > primal_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE;
     else state->primal_update_algorithm = FUSED_UPDATE;
     if (params->verbose)
     {

From 99b6da5b1833039ec1aaf3ec453e9bef41341340 Mon Sep 17 00:00:00 2001
From: Lhongpei <1453244320@qq.com>
Date: Thu, 4 Dec 2025 01:54:26 +0000
Subject: [PATCH 3/4] New Selecting Method: select update algorithm through
 benchmarking

---
 include/cupdlpx_types.h   |   9 ++
 internal/internal_types.h |   1 +
 internal/utils.h          |   2 +
 src/solver.cu             | 183 ++++++++++++++++++++++++++++++++++----
 src/utils.cu              |   7 ++
 5 files changed, 185 insertions(+), 17 deletions(-)

diff --git a/include/cupdlpx_types.h b/include/cupdlpx_types.h
index d17a939..211d64a 100644
--- a/include/cupdlpx_types.h
+++ b/include/cupdlpx_types.h
@@ -77,6 +77,14 @@ extern "C"
 		int iteration_limit;
 	} termination_criteria_t;
 
+
+	typedef enum {
+		PDHG_TUNING_HEURISTIC,
+		PDHG_TUNING_BENCHMARK,
+		PDHG_CUSPARSE_FIX,
+		PDHG_FUSED_FIX,
+	} pdhg_tuning_method_t;
+
 	typedef struct
 	{
 		int l_inf_ruiz_iterations;
@@ -89,6 +97,7 @@ extern "C"
 		restart_parameters_t restart_params;
 		double reflection_coefficient;
 		bool feasibility_polishing;
+		pdhg_tuning_method_t tuning_method;
 	} pdhg_parameters_t;
 
 	typedef struct
diff --git a/internal/internal_types.h b/internal/internal_types.h
index 9d4b0ec..ebe3239 100644
--- a/internal/internal_types.h
+++ b/internal/internal_types.h
@@ -37,6 +37,7 @@ typedef enum
 	FUSED_UPDATE,
 } pdhg_update_algorithm_t;
 
+
 typedef struct
 {
 	int num_variables;
diff --git a/internal/utils.h b/internal/utils.h
index 34fc559..f9a140f 100644
--- a/internal/utils.h
+++ b/internal/utils.h
@@ -104,6 +104,8 @@ extern "C"
         bool verbose,
         termination_reason_t termination_reason);
 
+    void print_table_head(const pdhg_parameters_t *params);
+
     void display_iteration_stats(const pdhg_solver_state_t *solver_state, bool verbose);
 
     const char *termination_reason_to_string(termination_reason_t reason);
diff --git a/src/solver.cu b/src/solver.cu
index 89b4302..7d0a031 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -111,6 +111,7 @@ cupdlpx_result_t *optimize(const pdhg_parameters_t *params,
     initialize_step_size_and_primal_weight(state, params);
     clock_t start_time = clock();
     bool do_restart = false;
+    print_table_head(params);
     while (state->termination_reason == TERMINATION_REASON_UNSPECIFIED)
     {
         if ((state->is_this_major_iteration || state->total_count == 0) ||
@@ -992,6 +993,8 @@ void set_default_parameters(pdhg_parameters_t *params)
     params->restart_params.k_i = 0.01;
     params->restart_params.k_d = 0.0;
     params->restart_params.i_smooth = 0.3;
+    params->feasibility_polishing = false;
+    params->tuning_method = PDHG_TUNING_BENCHMARK;
 }
 
 __global__ void fused_compute_next_pdhg_primal_solution_kernel(
@@ -1113,6 +1116,9 @@ __global__ void fused_compute_next_pdhg_dual_solution_major_kernel(
         pdhg_dual[i] = (temp - temp_proj) * step_size;
         reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
         primal_product[i] = primal_prod;
+    }
+    return;
+}
 //Feasibility Polishing
 void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state)
 {
@@ -1259,6 +1265,10 @@ __global__ void fused_compute_next_pdhg_dual_solution_kernel(
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
         reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
         primal_product[i] = primal_prod;
+    }
+    return;
+}
+
 void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state)
 {
     print_initial_feas_polish_info(false, params);
@@ -1359,37 +1369,176 @@ static void fused_compute_next_pdhg_dual_solution(pdhg_solver_state_t *state)
     }
 }
 
-static void decide_fused_update_usage(pdhg_solver_state_t *state,
-                                 const pdhg_parameters_t *params)
+static void fix_cusparse_decide_fused_update_usage(pdhg_solver_state_t *state,
+                                                   const pdhg_parameters_t *params)
+{
+    state->primal_update_algorithm = CUSPARSE_UPDATE;
+    state->dual_update_algorithm = CUSPARSE_UPDATE;
+
+    if (params->verbose) {
+        printf("[Auto-Tuning] Strategy: FIXED CUSPARSE\n");
+        printf("  Primal: cuSPARSE (Forced)\n");
+        printf("  Dual  : cuSPARSE (Forced)\n");
+    }
+}
+
+static void fix_fused_decide_fused_update_usage(pdhg_solver_state_t *state,
+                                                   const pdhg_parameters_t *params)
+{
+    state->primal_update_algorithm = FUSED_UPDATE;
+    state->dual_update_algorithm = FUSED_UPDATE;
+
+    if (params->verbose) {
+        printf("[Auto-Tuning] Strategy: FIXED FUSED\n");
+        printf("  Primal: FUSED (Forced)\n");
+        printf("  Dual  : FUSED (Forced)\n");
+    }
+}
+
+static void reset_solver_state_after_benchmark(pdhg_solver_state_t *state) {
+
+    CUDA_CHECK(cudaMemcpy(state->current_primal_solution, state->initial_primal_solution, 
+                          state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->pdhg_primal_solution, state->initial_primal_solution, 
+                          state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->reflected_primal_solution, state->initial_primal_solution, 
+                          state->num_variables * sizeof(double), cudaMemcpyDeviceToDevice));
+    
+    CUDA_CHECK(cudaMemcpy(state->current_dual_solution, state->initial_dual_solution, 
+                          state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->pdhg_dual_solution, state->initial_dual_solution, 
+                          state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
+    CUDA_CHECK(cudaMemcpy(state->reflected_dual_solution, state->initial_dual_solution, 
+                          state->num_constraints * sizeof(double), cudaMemcpyDeviceToDevice));
+}
+
+
+static void heur_decide_fused_update_usage(pdhg_solver_state_t *state,
+                                           const pdhg_parameters_t *params)
 {
-    // Heuristic to decide whether to use fused kernels or not
-    // Currently, we use fused kernels when the number of non-zeros is less than a threshold
+    // Heuristic: based on Non-Zeros (NNZ) and Density
     int n_cons = state->num_constraints;
     int n_vars = state->num_variables;
+    
     int max_nnz_A_row = calculate_max_nnz_row(n_cons, state->constraint_matrix->row_ptr);
     int max_nnz_At_row = calculate_max_nnz_row(n_vars, state->constraint_matrix_t->row_ptr);
+    
     int fusion_nnz_threshold = 100;
     double fusion_density_threshold = 0.01;
-    int primal_threshold = fmin(fusion_nnz_threshold,
-                                      (int)(fusion_density_threshold * n_cons));
-    int dual_threshold = fmin(fusion_nnz_threshold,
-                                    (int)(fusion_density_threshold * n_vars));
+    
+    int primal_threshold = fmin(fusion_nnz_threshold, (int)(fusion_density_threshold * n_cons));
+    int dual_threshold = fmin(fusion_nnz_threshold, (int)(fusion_density_threshold * n_vars));
+
+    // Dual Update Uses Matrix A
     if (max_nnz_A_row > dual_threshold) state->dual_update_algorithm = CUSPARSE_UPDATE;
     else state->dual_update_algorithm = FUSED_UPDATE;
+
+    // Primal Update Uses Matrix A^T
     if (max_nnz_At_row > primal_threshold) state->primal_update_algorithm = CUSPARSE_UPDATE;
     else state->primal_update_algorithm = FUSED_UPDATE;
-    if (params->verbose)
+
+    if (params->verbose) {
+        printf("[Auto-Tuning] Strategy: HEURISTIC\n");
+        printf("  Primal: %s (Threshold: %d, MaxNNZ: %d)\n", 
+            (state->primal_update_algorithm == FUSED_UPDATE ? "FUSED" : "cuSPARSE"), primal_threshold, max_nnz_At_row);
+        printf("  Dual  : %s (Threshold: %d, MaxNNZ: %d)\n", 
+            (state->dual_update_algorithm == FUSED_UPDATE ? "FUSED" : "cuSPARSE"), dual_threshold, max_nnz_A_row);
+    }
+}
+
+static void bench_decide_fused_update_usage(pdhg_solver_state_t *state,
+                                            const pdhg_parameters_t *params)
+{
+    if (params->verbose) printf("[Auto-Tuning] Strategy: BENCHMARK (Running tests...)\n");
+
+    const int WARMUP = 5;
+    const int ITER = 10;
+    float t_fused, t_cusparse;
+    
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+
+
+    state->primal_update_algorithm = FUSED_UPDATE;
+    for(int i=0; i<WARMUP; i++) compute_next_pdhg_primal_solution(state);
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    for(int i=0; i<ITER; i++) compute_next_pdhg_primal_solution(state);
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&t_fused, start, stop));
+    
+    state->primal_update_algorithm = CUSPARSE_UPDATE;
+    for(int i=0; i<WARMUP; i++) compute_next_pdhg_primal_solution(state);
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    for(int i=0; i<ITER; i++) compute_next_pdhg_primal_solution(state);
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&t_cusparse, start, stop));
+
+    if (t_fused <= t_cusparse) {
+        state->primal_update_algorithm = FUSED_UPDATE;
+        if (params->verbose) printf("  Primal: Selected FUSED (%.3f ms) < cuSPARSE (%.3f ms)\n", t_fused, t_cusparse);
+    } else {
+        state->primal_update_algorithm = CUSPARSE_UPDATE;
+        if (params->verbose) printf("  Primal: Selected cuSPARSE (%.3f ms) < FUSED (%.3f ms)\n", t_cusparse, t_fused);
+    }
+
+
+    state->dual_update_algorithm = FUSED_UPDATE;
+    for(int i=0; i<WARMUP; i++) compute_next_pdhg_dual_solution(state);
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    for(int i=0; i<ITER; i++) compute_next_pdhg_dual_solution(state);
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&t_fused, start, stop));
+    
+    state->dual_update_algorithm = CUSPARSE_UPDATE;
+    for(int i=0; i<WARMUP; i++) compute_next_pdhg_dual_solution(state);
+    CUDA_CHECK(cudaEventRecord(start, 0));
+    for(int i=0; i<ITER; i++) compute_next_pdhg_dual_solution(state);
+    CUDA_CHECK(cudaEventRecord(stop, 0));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+    CUDA_CHECK(cudaEventElapsedTime(&t_cusparse, start, stop));
+
+    if (t_fused <= t_cusparse) {
+        state->dual_update_algorithm = FUSED_UPDATE;
+        if (params->verbose) printf("  Dual  : Selected FUSED (%.3f ms) < cuSPARSE (%.3f ms)\n", t_fused, t_cusparse);
+    } else {
+        state->dual_update_algorithm = CUSPARSE_UPDATE;
+        if (params->verbose) printf("  Dual  : Selected cuSPARSE (%.3f ms) < FUSED (%.3f ms)\n", t_cusparse, t_fused);
+    }
+
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    reset_solver_state_after_benchmark(state);
+}
+
+static void decide_fused_update_usage(pdhg_solver_state_t *state,
+                                      const pdhg_parameters_t *params)
+{
+    switch (params->tuning_method) 
     {
-        if (state->primal_update_algorithm == FUSED_UPDATE)
-            printf("Using fused primal update kernel.\n");
-        else
-            printf("Using cuSPARSE primal update kernel.\n");
-        if (state->dual_update_algorithm == FUSED_UPDATE)
-            printf("Using fused dual update kernel.\n");
-        else
-            printf("Using cuSPARSE dual update kernel.\n");
+        case PDHG_TUNING_BENCHMARK:
+            bench_decide_fused_update_usage(state, params);
+            break;
+            
+        case PDHG_CUSPARSE_FIX:
+            fix_cusparse_decide_fused_update_usage(state, params);
+            break;
+            
+        case PDHG_FUSED_FIX:
+            fix_fused_decide_fused_update_usage(state, params);
+            break;
+
+        case PDHG_TUNING_HEURISTIC:
+        default:
+            heur_decide_fused_update_usage(state, params);
+            break;
     }
 }
+
 static pdhg_solver_state_t *initialize_primal_feas_polish_state(
     const pdhg_solver_state_t *original_state)
 {
diff --git a/src/utils.cu b/src/utils.cu
index 62f6bac..c453374 100644
--- a/src/utils.cu
+++ b/src/utils.cu
@@ -349,7 +349,14 @@ void print_initial_info(const pdhg_parameters_t *params,
            params->termination_criteria.eps_feasible_relative);
     printf("  eps_infeas_detect  : %.1e\n",
            params->termination_criteria.eps_infeasible);
+}
 
+void print_table_head(const pdhg_parameters_t *params)
+{
+    if (!params->verbose)
+    {
+        return;
+    }
     printf("---------------------------------------------------------------------"
            "------------------\n");
     printf("%s | %s | %s | %s \n", "   runtime    ", "    objective     ",

From cb01fe4f96727035e180f5def99924b27b458735 Mon Sep 17 00:00:00 2001
From: Lhongpei <1453244320@qq.com>
Date: Wed, 10 Dec 2025 05:04:49 +0000
Subject: [PATCH 4/4] Use fma and Kahan summation for numerical stability

---
 src/solver.cu | 121 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 72 insertions(+), 49 deletions(-)

diff --git a/src/solver.cu b/src/solver.cu
index 7d0a031..473291c 100644
--- a/src/solver.cu
+++ b/src/solver.cu
@@ -1015,25 +1015,30 @@ __global__ void fused_compute_next_pdhg_primal_solution_kernel(
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
     {
-        //Compute dual product
         double sum = 0.0;
+        double c = 0.0; 
+
         int row_start = matAt_row_ptr[i];
         int row_end = matAt_row_ptr[i + 1];
+        
         for (int j = row_start; j < row_end; ++j)
         {
             int col = matAt_col_ind[j];
             double val = matAt_val[j];
-            sum += val * __ldg(&dual_solution[col]); 
+
+            double y = fma(val, __ldg(&dual_solution[col]), -c);
+            
+            double t = sum + y;
+            c = (t - sum) - y;
+            sum = t;
         }
         double dual_prod = sum;
 
-        //Compute PDHG primal solution
         double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
         reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
         dual_product[i] = dual_prod;
     }
-    return;
 }
 
 __global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
@@ -1056,19 +1061,25 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_vars)
     {
-        //Compute dual product
         double sum = 0.0;
+        double c = 0.0; 
+
         int row_start = matAt_row_ptr[i];
         int row_end = matAt_row_ptr[i + 1];
+        
         for (int j = row_start; j < row_end; ++j)
         {
             int col = matAt_col_ind[j];
             double val = matAt_val[j];
-            sum += val * __ldg(&dual_solution[col]);
+            
+            double y = fma(val, __ldg(&dual_solution[col]), -c);
+            
+            double t = sum + y;
+            c = (t - sum) - y;
+            sum = t;
         }
         double dual_prod = sum;
 
-        //Compute PDHG primal solution
         double temp = current_primal[i] - step_size * (objective[i] - dual_prod);
         double temp_proj = fmax(var_lb[i], fmin(temp, var_ub[i]));
         reflected_primal[i] = 2.0 * temp_proj - current_primal[i];
@@ -1078,7 +1089,49 @@ __global__ void fused_compute_next_pdhg_primal_solution_major_kernel(
         pdhg_primal[i] = temp_proj;
         dual_product[i] = dual_prod;
     }
-    return;
+}
+
+__global__ void fused_compute_next_pdhg_dual_solution_kernel(
+    const int * __restrict__ matA_row_ptr, 
+    const int * __restrict__ matA_col_ind, 
+    const double * __restrict__ matA_val, 
+    const double * __restrict__ primal_solution,
+    double * __restrict__ primal_product,
+    const double * __restrict__ current_dual, 
+    double * __restrict__ reflected_dual, 
+    const double * __restrict__ const_lb, 
+    const double * __restrict__ const_ub, 
+    double step_size,    
+    double inv_step_size, 
+    int n_cons)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n_cons)
+    {
+        double sum = 0.0;
+        double c = 0.0; 
+
+        int row_start = matA_row_ptr[i];
+        int row_end = matA_row_ptr[i + 1];
+        
+        for (int j = row_start; j < row_end; ++j)
+        {
+            int col = matA_col_ind[j];
+            double val = matA_val[j];
+            
+            double y = fma(val, __ldg(&primal_solution[col]), -c);
+            
+            double t = sum + y;
+            c = (t - sum) - y;
+            sum = t;
+        }
+        double primal_prod = sum;
+
+        double temp = current_dual[i] * inv_step_size - primal_prod;
+        double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
+        reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
+        primal_product[i] = primal_prod;
+    }
 }
 
 __global__ void fused_compute_next_pdhg_dual_solution_major_kernel(
@@ -1099,26 +1152,33 @@ __global__ void fused_compute_next_pdhg_dual_solution_major_kernel(
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_cons)
     {
-        //Compute primal product
         double sum = 0.0;
+        double c = 0.0;
+
         int row_start = matA_row_ptr[i];
         int row_end = matA_row_ptr[i + 1];
+        
         for (int j = row_start; j < row_end; ++j)
         {
             int col = matA_col_ind[j];
             double val = matA_val[j];
-            sum += val * __ldg(&primal_solution[col]);
+            
+            double y = fma(val, __ldg(&primal_solution[col]), -c);
+            
+            double t = sum + y;
+            c = (t - sum) - y;
+            sum = t;
         }
         double primal_prod = sum;
-        //Compute PDHG dual solution
+
         double temp = current_dual[i]* inv_step_size - primal_prod;
         double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
         pdhg_dual[i] = (temp - temp_proj) * step_size;
         reflected_dual[i] = 2.0 * pdhg_dual[i] - current_dual[i];
         primal_product[i] = primal_prod;
     }
-    return;
 }
+
 //Feasibility Polishing
 void feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state)
 {
@@ -1232,43 +1292,6 @@ void primal_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_stat
     return;
 }
 
-__global__ void fused_compute_next_pdhg_dual_solution_kernel(
-    const int * __restrict__ matA_row_ptr, 
-    const int * __restrict__ matA_col_ind, 
-    const double * __restrict__ matA_val, 
-    const double * __restrict__ primal_solution,
-    double * __restrict__ primal_product,
-    const double * __restrict__ current_dual, 
-    double * __restrict__ reflected_dual, 
-    const double * __restrict__ const_lb, 
-    const double * __restrict__ const_ub, 
-    double step_size,    
-    double inv_step_size, 
-    int n_cons)
-{
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n_cons)
-    {
-        //Compute primal product
-        double sum = 0.0;
-        int row_start = matA_row_ptr[i];
-        int row_end = matA_row_ptr[i + 1];
-        for (int j = row_start; j < row_end; ++j)
-        {
-            int col = matA_col_ind[j];
-            double val = matA_val[j];
-            sum += val * __ldg(&primal_solution[col]);
-        }
-        double primal_prod = sum;
-        //Compute PDHG dual solution
-        double temp = current_dual[i] * inv_step_size - primal_prod;
-        double temp_proj = fmax(-const_ub[i], fmin(temp, -const_lb[i]));
-        reflected_dual[i] = 2.0 * (temp - temp_proj) * step_size - current_dual[i];
-        primal_product[i] = primal_prod;
-    }
-    return;
-}
-
 void dual_feasibility_polish(const pdhg_parameters_t *params, pdhg_solver_state_t *state, const pdhg_solver_state_t *ori_state)
 {
     print_initial_feas_polish_info(false, params);