@@ -393,6 +393,23 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
393393 parallel :: stopCluster(cl ) # stop the cluster after parallel execution
394394 unregister_dopar() # Stop Dopar from running in the background
395395
396+ # Store models in a named list
397+ models <- list (
398+ BAG = fit.treebag ,
399+ RF = fit.rf ,
400+ C50 = fit.c50 ,
401+ # GLM = fit.glm,
402+ # LDA = fit.lda,
403+ GLMNET = fit.glmnet ,
404+ KNN = fit.knn ,
405+ CART = fit.cart ,
406+ LASSO = fit.lasso ,
407+ RIDGE = fit.ridge ,
408+ SVM_radial = fit.svm_radial ,
409+ SVM_linear = fit.svm_linear ,
410+ XGboost = fit.xgbTree
411+ )
412+
396413 }else {
397414
398415 # Custom fold construction
@@ -415,21 +432,21 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
415432 # ## Extract the file names of the folds
416433 # result_files <- list.files("Results", pattern = "^fold_.*\\.rds$", full.names = TRUE)
417434 # fold_data = vector("list", length(result_files))
418-
435+
419436 # Initialize master list to store everything in memory
420437 models_all_folds <- vector(" list" , length(result_files ))
421-
438+
422439 # Iterate across folds and inside each subfold corresponding to each param combination (if exist)
423440 for (fold_i in seq_along(result_files )) { # ## number of folds (k_fold x n_rep)
424441
425- # result = readRDS(result_files[[fold_i]])
426- result = result_files [[fold_i ]]
427-
442+ # result = readRDS(result_files[[fold_i]])
443+ result = result_files [[fold_i ]]
444+
428445 # Each fold contains multiple parameter sets (list of lists) --> fold_construction_args_tunable != NULL
429446 if (! is.null(fold_construction_args_tunable )) {
430-
447+
431448 models_all_params <- vector(" list" , length(result ))
432-
449+
433450 for (parameter_i in seq_along(result )) { # ## number of parameter combinations
434451 train_data_i = result [[parameter_i ]][[" train_data" ]]
435452 test_data_i = result [[parameter_i ]][[" test_data" ]]
@@ -442,8 +459,8 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
442459 result [[parameter_i ]][[" test_data" ]] = test_data_i [, setdiff(colnames(train_data_i ), " target" )]
443460
444461 models = lapply(
445- c(" treebag" , " rf" , " C5.0" ,
446- " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
462+ c(" treebag" , " rf" , " C5.0" ,
463+ " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
447464 " svmRadial" , " svmLinear" , " xgbTree" ),
448465 function (method ){
449466 tune_grid = NULL
@@ -466,14 +483,14 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
466483
467484 # Store models for this parameter set
468485 models_all_params [[parameter_i ]] <- models
469-
486+
470487 }
471-
488+
472489 # Store all parameter results for this fold
473490 models_all_folds [[fold_i ]] <- models_all_params
474-
491+
475492 # saveRDS(models_all_params, paste0("Results/models_fold_", fold_i, ".rds"))
476-
493+
477494 }else { # Custom function does not have hyperparams to tune --> fold_construction_args_tunable != NULL
478495 train_data_i <- result [[" train_data" ]]
479496 test_data_i <- result [[" test_data" ]]
@@ -486,8 +503,8 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
486503 result [[" test_data" ]] = test_data_i [, setdiff(colnames(train_data_i ), " target" )]
487504
488505 models = lapply(
489- c(" treebag" , " rf" , " C5.0" ,
490- " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
506+ c(" treebag" , " rf" , " C5.0" ,
507+ " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
491508 " svmRadial" , " svmLinear" , " xgbTree" ),
492509 function (method ){
493510 tune_grid = NULL
@@ -500,26 +517,26 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
500517 tune_grid = get_tune_grid(" ridge" )
501518 method = " glmnet" # still call wrapper with glmnet model type for ridge
502519 }
503-
520+
504521 # Custom CV validation and hyperparameter tuning
505522 do.call(compute_custom_k_fold_CV ,
506523 list (processed_folds = result [[parameter_i ]], ml_method = method , tuneGrid = tune_grid , ncores = ncores ))
507-
524+
508525 }
509526 )
510-
527+
511528 models_all_folds [[fold_i ]] <- models
512529 # saveRDS(models, paste0("Results/models_fold_", fold_i, ".rds"))
513530 }
514531 }
515532
516533 # result_files <- list.files("Results",pattern = "^models_.*\\.rds$", full.names = TRUE)
517- agg <- aggregate_results(models_all_folds )
518-
519- methods <- c(" treebag" , " rf" , " C5.0" ,
520- " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
534+ agg <- aggregate_results(models_all_folds )
535+
536+ methods <- c(" treebag" , " rf" , " C5.0" ,
537+ " glmnet" , " knn" , " rpart" , " glmnet" , " glmnet" ,
521538 " svmRadial" , " svmLinear" , " xgbTree" )
522-
539+
523540 optimized_models <- lapply(seq_along(methods ), function (i ) {
524541 wrapper_train_best_hyperparams(
525542 agg [[i ]], # model-specific aggregated results
@@ -528,7 +545,7 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
528545 fold_construction_args_fixed
529546 )
530547 })
531-
548+
532549 model_names <- c(
533550 BAG = " treebag" ,
534551 RF = " rf" ,
@@ -542,20 +559,20 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
542559 SVM_linear = " svmLinear" ,
543560 XGboost = " xgbTree"
544561 )
545-
562+
546563 # Split components across lists
547564 training_sets <- lapply(optimized_models , `[[` , " training_set" )
548565 custom_outputs <- lapply(optimized_models , `[[` , " custom_output" )
549566 models <- lapply(optimized_models , `[[` , " Model" )
550-
567+
551568 # Assign pretty names
552569 names(training_sets ) <- names(model_names )
553570 names(custom_outputs ) <- names(model_names )
554571 names(models ) <- names(model_names )
555572
556573 }
557574
558-
575+
559576 # ###### Optimized based on metric (only AUC or Accuracy available)
560577 if (metric == " AUROC" || metric == " AUPRC" ){
561578
@@ -1270,7 +1287,7 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
12701287
12711288 train_data = processed_folds [[" train_data" ]]
12721289 test_data = processed_folds [[" test_data" ]]
1273-
1290+
12741291 all_preds <- list ()
12751292
12761293 # # Train once to get grid
@@ -1318,7 +1335,7 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
13181335 for (grid_row in seq(nrow(grid ))) {
13191336 # Extract hyperparameters
13201337 hp <- grid [grid_row , , drop = FALSE ]
1321-
1338+
13221339 # Train model
13231340 model <- caret :: train(
13241341 target ~ . ,
@@ -1328,34 +1345,34 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
13281345 tuneGrid = hp ,
13291346 metric = " Accuracy"
13301347 )
1331-
1348+
13321349 # Predict
13331350 test_data <- test_data [, colnames(test_data ) %in% model $ coefnames ]
13341351 probs <- stats :: predict(model , newdata = test_data , type = " prob" )
13351352 preds <- stats :: predict(model , newdata = test_data )
1336-
1353+
13371354 # Prepare results
13381355 rownames(hp ) <- NULL
13391356 hp <- tibble :: as_tibble(hp )
1340-
1357+
13411358 pred_df <- dplyr :: tibble(
13421359 rowIndex = processed_folds $ rowIndex ,
13431360 Resample = processed_folds $ fold_name ,
13441361 obs = processed_folds $ obs_test ,
13451362 pred = preds
13461363 ) %> %
13471364 dplyr :: bind_cols(hp , probs )
1348-
1365+
13491366 if (! is.null(processed_folds $ params )) {
13501367 pred_df <- dplyr :: bind_cols(pred_df , processed_folds $ params )
13511368 }
1352-
1369+
13531370 all_preds [[grid_row ]] = pred_df
13541371 }
13551372
13561373 # # Combine predictions
13571374 pred_df_all <- do.call(rbind , all_preds )
1358-
1375+
13591376 return (list (pred_df_all , names(grid )))
13601377}
13611378
@@ -3283,47 +3300,47 @@ wrapper_train_best_hyperparams <- function(optimized, ml_method, fold_constructi
32833300}
32843301
32853302aggregate_results <- function (all_loaded ) {
3286-
3303+
32873304 # Load all files (each file = one fold)
32883305 # all_loaded <- lapply(result_files, readRDS)
3289-
3306+
32903307 # Dimensions:
32913308 # all_loaded -> folds
32923309 # all_loaded[[fold]] -> param combinations
32933310 # all_loaded[[fold]][[param]] -> ML models (list of 11)
3294-
3311+
32953312 n_folds <- length(all_loaded )
32963313 n_params <- length(all_loaded [[1 ]])
32973314 n_models <- length(all_loaded [[1 ]][[1 ]])
3298-
3315+
32993316 results <- vector(" list" , n_models )
3300-
3317+
33013318 # Iterate over models
33023319 for (m in seq_len(n_models )) {
33033320
33043321 all_preds <- NULL
33053322 hp_cols_all <- character ()
3306-
3323+
33073324 # Nested loops: avoid overcharging R list and giving error
33083325 for (f in seq_len(n_folds )) {
33093326 for (p in seq_len(n_params )) {
33103327 preds <- all_loaded [[f ]][[p ]][[m ]][[1 ]]
33113328 hp <- all_loaded [[f ]][[p ]][[m ]][[2 ]]
3312-
3329+
33133330 # Bind iteratively to avoid large lists in memory
33143331 if (is.null(all_preds )) {
33153332 all_preds <- preds
33163333 } else {
33173334 all_preds <- dplyr :: bind_rows(all_preds , preds )
33183335 }
3319-
3336+
33203337 hp_cols_all <- union(hp_cols_all , hp )
33213338 }
33223339 }
3323-
3340+
33243341
33253342 rownames(all_preds ) <- NULL
3326-
3343+
33273344 # Add any extra columns if present
33283345 extra_cols <- setdiff(
33293346 names(all_preds ),
@@ -3332,7 +3349,7 @@ aggregate_results <- function(all_loaded) {
33323349 if (length(extra_cols ) > 0 ) {
33333350 hp_cols_all <- c(hp_cols_all , extra_cols )
33343351 }
3335-
3352+
33363353 # Compute metrics per resample
33373354 results_matrix <- all_preds %> %
33383355 dplyr :: group_by(dplyr :: across(dplyr :: all_of(hp_cols_all )), Resample ) %> %
@@ -3349,15 +3366,15 @@ aggregate_results <- function(all_loaded) {
33493366 KappaSD = stats :: mad(.data $ Kappa_resample ),
33503367 .groups = " keep"
33513368 )
3352-
3369+
33533370 # Select best hyperparams
33543371 best_row <- results_matrix %> %
33553372 dplyr :: ungroup() %> %
33563373 dplyr :: arrange(dplyr :: desc(Accuracy )) %> %
33573374 dplyr :: slice_max(Accuracy , n = 1 , with_ties = FALSE )
3358-
3375+
33593376 besttune <- best_row %> % dplyr :: select(dplyr :: all_of(hp_cols_all ))
3360-
3377+
33613378 # Compute resample summaries for besttune
33623379 resample_df <- all_preds %> %
33633380 dplyr :: inner_join(besttune , by = hp_cols_all ) %> %
@@ -3373,7 +3390,7 @@ aggregate_results <- function(all_loaded) {
33733390 ) %> %
33743391 dplyr :: select(Accuracy , Kappa , Resample ) %> %
33753392 dplyr :: arrange(Resample )
3376-
3393+
33773394 # Store results for this model
33783395 results [[m ]] <- list (
33793396 Prediction_folds = all_preds ,
@@ -3382,7 +3399,7 @@ aggregate_results <- function(all_loaded) {
33823399 Resample_matrix = resample_df
33833400 )
33843401 }
3385-
3402+
33863403 return (results )
33873404}
33883405
0 commit comments