Skip to content

Commit 82f6119

Browse files
committed
Fix
1 parent 6a2cf2f commit 82f6119

File tree

1 file changed

+68
-51
lines changed

1 file changed

+68
-51
lines changed

R/machine_learning.R

Lines changed: 68 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,23 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
393393
parallel::stopCluster(cl) # stop the cluster after parallel execution
394394
unregister_dopar() #Stop Dopar from running in the background
395395

396+
# Store models in a named list
397+
models <- list(
398+
BAG = fit.treebag,
399+
RF = fit.rf,
400+
C50 = fit.c50,
401+
#GLM = fit.glm,
402+
#LDA = fit.lda,
403+
GLMNET = fit.glmnet,
404+
KNN = fit.knn,
405+
CART = fit.cart,
406+
LASSO = fit.lasso,
407+
RIDGE = fit.ridge,
408+
SVM_radial = fit.svm_radial,
409+
SVM_linear = fit.svm_linear,
410+
XGboost = fit.xgbTree
411+
)
412+
396413
}else{
397414

398415
# Custom fold construction
@@ -415,21 +432,21 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
415432
### Extract the file names of the folds
416433
#result_files <- list.files("Results", pattern = "^fold_.*\\.rds$", full.names = TRUE)
417434
#fold_data = vector("list", length(result_files))
418-
435+
419436
# Initialize master list to store everything in memory
420437
models_all_folds <- vector("list", length(result_files))
421-
438+
422439
# Iterate across folds and inside each subfold corresponding to each param combination (if exist)
423440
for (fold_i in seq_along(result_files)) { ### number of folds (k_fold x n_rep)
424441

425-
#result = readRDS(result_files[[fold_i]])
426-
result = result_files[[fold_i]]
427-
442+
#result = readRDS(result_files[[fold_i]])
443+
result = result_files[[fold_i]]
444+
428445
# Each fold contains multiple parameter sets (list of lists) --> fold_construction_args_tunable != NULL
429446
if (!is.null(fold_construction_args_tunable)) {
430-
447+
431448
models_all_params <- vector("list", length(result))
432-
449+
433450
for (parameter_i in seq_along(result)) { ### number of parameter combinations
434451
train_data_i = result[[parameter_i]][["train_data"]]
435452
test_data_i = result[[parameter_i]][["test_data"]]
@@ -442,8 +459,8 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
442459
result[[parameter_i]][["test_data"]] = test_data_i[, setdiff(colnames(train_data_i), "target")]
443460

444461
models = lapply(
445-
c("treebag", "rf", "C5.0",
446-
"glmnet", "knn", "rpart", "glmnet", "glmnet",
462+
c("treebag", "rf", "C5.0",
463+
"glmnet", "knn", "rpart", "glmnet", "glmnet",
447464
"svmRadial", "svmLinear", "xgbTree"),
448465
function(method){
449466
tune_grid = NULL
@@ -466,14 +483,14 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
466483

467484
# Store models for this parameter set
468485
models_all_params[[parameter_i]] <- models
469-
486+
470487
}
471-
488+
472489
# Store all parameter results for this fold
473490
models_all_folds[[fold_i]] <- models_all_params
474-
491+
475492
#saveRDS(models_all_params, paste0("Results/models_fold_", fold_i, ".rds"))
476-
493+
477494
}else { # Custom function does not have hyperparams to tune --> fold_construction_args_tunable != NULL
478495
train_data_i <- result[["train_data"]]
479496
test_data_i <- result[["test_data"]]
@@ -486,8 +503,8 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
486503
result[["test_data"]] = test_data_i[, setdiff(colnames(train_data_i), "target")]
487504

488505
models = lapply(
489-
c("treebag", "rf", "C5.0",
490-
"glmnet", "knn", "rpart", "glmnet", "glmnet",
506+
c("treebag", "rf", "C5.0",
507+
"glmnet", "knn", "rpart", "glmnet", "glmnet",
491508
"svmRadial", "svmLinear", "xgbTree"),
492509
function(method){
493510
tune_grid = NULL
@@ -500,26 +517,26 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
500517
tune_grid = get_tune_grid("ridge")
501518
method = "glmnet" #still call wrapper with glmnet model type for ridge
502519
}
503-
520+
504521
# Custom CV validation and hyperparameter tuning
505522
do.call(compute_custom_k_fold_CV,
506523
list(processed_folds = result[[parameter_i]], ml_method = method, tuneGrid=tune_grid, ncores=ncores))
507-
524+
508525
}
509526
)
510-
527+
511528
models_all_folds[[fold_i]] <- models
512529
#saveRDS(models, paste0("Results/models_fold_", fold_i, ".rds"))
513530
}
514531
}
515532

516533
#result_files <- list.files("Results",pattern = "^models_.*\\.rds$", full.names = TRUE)
517-
agg <- aggregate_results(models_all_folds)
518-
519-
methods <- c("treebag", "rf", "C5.0",
520-
"glmnet", "knn", "rpart", "glmnet", "glmnet",
534+
agg <- aggregate_results(models_all_folds)
535+
536+
methods <- c("treebag", "rf", "C5.0",
537+
"glmnet", "knn", "rpart", "glmnet", "glmnet",
521538
"svmRadial", "svmLinear", "xgbTree")
522-
539+
523540
optimized_models <- lapply(seq_along(methods), function(i) {
524541
wrapper_train_best_hyperparams(
525542
agg[[i]], # model-specific aggregated results
@@ -528,7 +545,7 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
528545
fold_construction_args_fixed
529546
)
530547
})
531-
548+
532549
model_names <- c(
533550
BAG = "treebag",
534551
RF = "rf",
@@ -542,20 +559,20 @@ compute_k_fold_CV = function(model, k_folds, n_rep, stacking = FALSE, metric = "
542559
SVM_linear = "svmLinear",
543560
XGboost = "xgbTree"
544561
)
545-
562+
546563
# Split components across lists
547564
training_sets <- lapply(optimized_models, `[[`, "training_set")
548565
custom_outputs <- lapply(optimized_models, `[[`, "custom_output")
549566
models <- lapply(optimized_models, `[[`, "Model")
550-
567+
551568
# Assign pretty names
552569
names(training_sets) <- names(model_names)
553570
names(custom_outputs) <- names(model_names)
554571
names(models) <- names(model_names)
555572

556573
}
557574

558-
575+
559576
####### Optimized based on metric (only AUC or Accuracy available)
560577
if(metric == "AUROC" || metric == "AUPRC"){
561578

@@ -1270,7 +1287,7 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
12701287

12711288
train_data = processed_folds[["train_data"]]
12721289
test_data = processed_folds[["test_data"]]
1273-
1290+
12741291
all_preds <- list()
12751292

12761293
## Train once to get grid
@@ -1318,7 +1335,7 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
13181335
for (grid_row in seq(nrow(grid))) {
13191336
# Extract hyperparameters
13201337
hp <- grid[grid_row, , drop = FALSE]
1321-
1338+
13221339
# Train model
13231340
model <- caret::train(
13241341
target ~ .,
@@ -1328,34 +1345,34 @@ compute_custom_k_fold_CV <- function(processed_folds, ml_method, tuneGrid = NULL
13281345
tuneGrid = hp,
13291346
metric = "Accuracy"
13301347
)
1331-
1348+
13321349
# Predict
13331350
test_data <- test_data[, colnames(test_data) %in% model$coefnames]
13341351
probs <- stats::predict(model, newdata = test_data, type = "prob")
13351352
preds <- stats::predict(model, newdata = test_data)
1336-
1353+
13371354
# Prepare results
13381355
rownames(hp) <- NULL
13391356
hp <- tibble::as_tibble(hp)
1340-
1357+
13411358
pred_df <- dplyr::tibble(
13421359
rowIndex = processed_folds$rowIndex,
13431360
Resample = processed_folds$fold_name,
13441361
obs = processed_folds$obs_test,
13451362
pred = preds
13461363
) %>%
13471364
dplyr::bind_cols(hp, probs)
1348-
1365+
13491366
if (!is.null(processed_folds$params)) {
13501367
pred_df <- dplyr::bind_cols(pred_df, processed_folds$params)
13511368
}
1352-
1369+
13531370
all_preds[[grid_row]] = pred_df
13541371
}
13551372

13561373
## Combine predictions
13571374
pred_df_all <- do.call(rbind, all_preds)
1358-
1375+
13591376
return(list(pred_df_all, names(grid)))
13601377
}
13611378

@@ -3283,47 +3300,47 @@ wrapper_train_best_hyperparams <- function(optimized, ml_method, fold_constructi
32833300
}
32843301

32853302
aggregate_results <- function(all_loaded) {
3286-
3303+
32873304
# Load all files (each file = one fold)
32883305
#all_loaded <- lapply(result_files, readRDS)
3289-
3306+
32903307
# Dimensions:
32913308
# all_loaded -> folds
32923309
# all_loaded[[fold]] -> param combinations
32933310
# all_loaded[[fold]][[param]] -> ML models (list of 11)
3294-
3311+
32953312
n_folds <- length(all_loaded)
32963313
n_params <- length(all_loaded[[1]])
32973314
n_models <- length(all_loaded[[1]][[1]])
3298-
3315+
32993316
results <- vector("list", n_models)
3300-
3317+
33013318
# Iterate over models
33023319
for (m in seq_len(n_models)) {
33033320

33043321
all_preds <- NULL
33053322
hp_cols_all <- character()
3306-
3323+
33073324
# Nested loops: avoid overcharging R list and giving error
33083325
for (f in seq_len(n_folds)) {
33093326
for (p in seq_len(n_params)) {
33103327
preds <- all_loaded[[f]][[p]][[m]][[1]]
33113328
hp <- all_loaded[[f]][[p]][[m]][[2]]
3312-
3329+
33133330
# Bind iteratively to avoid large lists in memory
33143331
if (is.null(all_preds)) {
33153332
all_preds <- preds
33163333
} else {
33173334
all_preds <- dplyr::bind_rows(all_preds, preds)
33183335
}
3319-
3336+
33203337
hp_cols_all <- union(hp_cols_all, hp)
33213338
}
33223339
}
3323-
3340+
33243341

33253342
rownames(all_preds) <- NULL
3326-
3343+
33273344
# Add any extra columns if present
33283345
extra_cols <- setdiff(
33293346
names(all_preds),
@@ -3332,7 +3349,7 @@ aggregate_results <- function(all_loaded) {
33323349
if (length(extra_cols) > 0) {
33333350
hp_cols_all <- c(hp_cols_all, extra_cols)
33343351
}
3335-
3352+
33363353
# Compute metrics per resample
33373354
results_matrix <- all_preds %>%
33383355
dplyr::group_by(dplyr::across(dplyr::all_of(hp_cols_all)), Resample) %>%
@@ -3349,15 +3366,15 @@ aggregate_results <- function(all_loaded) {
33493366
KappaSD = stats::mad(.data$Kappa_resample),
33503367
.groups = "keep"
33513368
)
3352-
3369+
33533370
# Select best hyperparams
33543371
best_row <- results_matrix %>%
33553372
dplyr::ungroup() %>%
33563373
dplyr::arrange(dplyr::desc(Accuracy)) %>%
33573374
dplyr::slice_max(Accuracy, n = 1, with_ties = FALSE)
3358-
3375+
33593376
besttune <- best_row %>% dplyr::select(dplyr::all_of(hp_cols_all))
3360-
3377+
33613378
# Compute resample summaries for besttune
33623379
resample_df <- all_preds %>%
33633380
dplyr::inner_join(besttune, by = hp_cols_all) %>%
@@ -3373,7 +3390,7 @@ aggregate_results <- function(all_loaded) {
33733390
) %>%
33743391
dplyr::select(Accuracy, Kappa, Resample) %>%
33753392
dplyr::arrange(Resample)
3376-
3393+
33773394
# Store results for this model
33783395
results[[m]] <- list(
33793396
Prediction_folds = all_preds,
@@ -3382,7 +3399,7 @@ aggregate_results <- function(all_loaded) {
33823399
Resample_matrix = resample_df
33833400
)
33843401
}
3385-
3402+
33863403
return(results)
33873404
}
33883405

0 commit comments

Comments
 (0)