|
| 1 | +--- |
| 2 | +title: "Analyse en composantes principales" |
| 3 | +author: "Guyliann Engels & Philippe Grosjean" |
| 4 | +description: "**SDD II Module 7** : Analyse en Composantes Principales (ACP)" |
| 5 | +tutorial: |
| 6 | + id: "B07La_pca" |
| 7 | + version: 0.0.9999/___ |
| 8 | +output: |
| 9 | + learnr::tutorial: |
| 10 | + progressive: true |
| 11 | +runtime: shiny_prerendered |
| 12 | +--- |
| 13 | + |
| 14 | +```{r setup, include=FALSE} |
| 15 | +BioDataScience2::learnr_setup() |
| 16 | +# Package utile |
| 17 | +SciViews::R() |
| 18 | +# Preparation of the dataset ------ |
| 19 | +penguins <- read("penguins", package = "palmerpenguins", lang = "fr") |
| 20 | +
|
| 21 | +# Functions utiles |
| 22 | +# pca for SciViews, version 1.0.0 |
| 23 | +# Copyright (c) 2020, Philippe Grosjean (phgrosjean@sciviews.org) |
| 24 | +library(broom) |
| 25 | +
|
| 26 | +# broom implements only methods for prcomp objects, not princomp, while pcomp |
| 27 | +# is compatible with princomp... but prcomp is simpler. So, conversion is easy |
| 28 | +as.prcomp <- function(x, ...) |
| 29 | + UseMethod("as.prcomp") |
| 30 | +
|
| 31 | +as.prcomp.default <- function(x, ...) |
| 32 | + stop("No method to convert this object into a 'prcomp'") |
| 33 | +
|
| 34 | +as.prcomp.prcomp <- function(x, ...) |
| 35 | + x |
| 36 | +
|
| 37 | +as.prcomp.princomp <- function(x, ...) |
| 38 | + structure(list(sdev = as.numeric(x$sdev), rotation = unclass(x$loadings), |
| 39 | + center = x$center, scale = x$scale, x = as.matrix(x$scores)), |
| 40 | + class = "prcomp") |
| 41 | +
|
| 42 | +# Comparison of pcomp() -> as.prcomp() with prcomp() directly |
| 43 | +# Almost the same, only no rownames for x (is it important?) |
| 44 | +#iris_prcomp_pcomp <- as.prcomp(pcomp(iris[, -5], scale = TRUE)) |
| 45 | +#iris_prcomp <- prcomp(iris[, -5], scale = TRUE) |
| 46 | +
|
| 47 | +# Now, broom methods can be defined simply by converting into prcomp objects |
| 48 | +augment.princomp <- function(x, data = NULL, newdata, ...) |
| 49 | + if (missing(newdata)) { |
| 50 | + augment(as.prcomp(x), data = data, ...) |
| 51 | + } else { |
| 52 | + augment(as.prcomp(x), data = data, newdata = newdata, ...) |
| 53 | + } |
| 54 | +
|
| 55 | +tidy.princomp <- function(x, matrix = "u", ...) |
| 56 | + tidy(as.prcomp(x), matrix = matrix, ...) |
| 57 | +
|
| 58 | +# There is no glance.prcomp() method |
| 59 | +
|
| 60 | +# There is a problem with pcomp() that returns a data.frame in scores, |
| 61 | +# while it is a matrix in the original princomp object. pca() corrects this |
| 62 | +pca <- function(x, ...) { |
| 63 | + res <- SciViews::pcomp(x, ...) |
| 64 | + # Change scores into a matrix |
| 65 | + res$scores <- as.matrix(res$scores) |
| 66 | + res |
| 67 | +} |
| 68 | +
|
| 69 | +scale_axes <- function(data, aspect.ratio = 1) { |
| 70 | + range_x <- range(data[, 1]) |
| 71 | + span_x <- abs(max(range_x) - min(range_x)) |
| 72 | + range_y <- range(data[, 2]) |
| 73 | + span_y <- abs(max(range_y) - min(range_y)) |
| 74 | + if ((span_y / aspect.ratio) > span_x) { |
| 75 | + # Adjust range_x |
| 76 | + span_x_2 <- span_y / aspect.ratio / 2 |
| 77 | + range_x_mid <- sum(range_x) / 2 |
| 78 | + range_x <- c(range_x_mid - span_x_2, range_x_mid + span_x_2) |
| 79 | + } else { |
| 80 | + # Adjust range_y |
| 81 | + span_y_2 <- span_x * aspect.ratio / 2 |
| 82 | + range_y_mid <- sum(range_y) / 2 |
| 83 | + range_y <- c(range_y_mid - span_y_2, range_y_mid + span_y_2) |
| 84 | + } |
| 85 | + list(x = range_x, y = range_y) |
| 86 | +} |
| 87 | +
|
| 88 | +autoplot.pcomp <- function(object, |
| 89 | +type = c("screeplot", "altscreeplot", "loadings", "correlations", "scores", "biplot"), |
| 90 | +choices = 1L:2L, name = deparse(substitute(object)), ar.length = 0.1, |
| 91 | +circle.col = "gray", col = "black", fill = "gray", scale = 1, aspect.ratio = 1, |
| 92 | +repel = FALSE, labels, title, xlab, ylab, ...) { |
| 93 | + type = match.arg(type) |
| 94 | +
|
| 95 | + if (missing(title)) |
| 96 | + title <- paste(name, type, sep = " - ") |
| 97 | +
|
| 98 | + contribs <- paste0(names(object$sdev), " (", |
| 99 | + round((object$sdev^2/object$totdev^2) * 100, digits = 1), "%)")[choices] |
| 100 | +
|
| 101 | + scores <- as.data.frame(object$scores[, choices]) |
| 102 | + names(scores) <- c("x", "y") |
| 103 | + if (!missing(labels)) { |
| 104 | + if (length(labels) != nrow(scores)) |
| 105 | + stop("You must provide a character vector of length ", nrow(scores), |
| 106 | + " for 'labels'") |
| 107 | + scores$labels <- labels |
| 108 | + } else {# Default labels are row numbers |
| 109 | + scores$labels <- 1:nrow(scores) |
| 110 | + } |
| 111 | +
|
| 112 | + lims <- scale_axes(scores, aspect.ratio = aspect.ratio) |
| 113 | +
|
| 114 | + if (!missing(col)) { |
| 115 | + if (length(col) != nrow(scores)) |
| 116 | + stop("You must provide a vector of length ", nrow(scores), " for 'col'") |
| 117 | + scores$color <- col |
| 118 | + scores_formula <- y ~ x %col=% color %label=% labels |
| 119 | + } else { |
| 120 | + if (missing(labels)) { |
| 121 | + scores_formula <- y ~ x %label=% labels |
| 122 | + } else { |
| 123 | + scores_formula <- y ~ x %col=% labels %label=% labels |
| 124 | + } |
| 125 | + } |
| 126 | +
|
| 127 | + res <- switch(type, |
| 128 | + screeplot = object %>.% # Classical screeplot |
| 129 | + tidy(., "pcs") %>.% |
| 130 | + chart(data = ., std.dev^2 ~ PC) + |
| 131 | + geom_col(col = col, fill = fill) + |
| 132 | + labs(y = "Variances", title = title), |
| 133 | +
|
| 134 | + altscreeplot = object %>.% # screeplot represented by dots and lines |
| 135 | + tidy(., "pcs") %>.% |
| 136 | + chart(data = ., std.dev^2 ~ PC) + |
| 137 | + geom_line(col = col) + |
| 138 | + geom_point(col = "white", fill = col, size = 2, shape = 21, stroke = 3) + |
| 139 | + labs(y = "Variances", title = title), |
| 140 | +
|
| 141 | + loadings = object %>.% # Plots of the variables |
| 142 | + tidy(., "variables") %>.% |
| 143 | + spread(., key = PC, value = value) %>.% |
| 144 | + #rename_if(., is.numeric, function(x) paste0("PC", x)) %>.% |
| 145 | + select(., c(1, choices + 1)) %>.% |
| 146 | + set_names(., c("labels", "x", "y")) %>.% |
| 147 | + chart(data = ., y ~ x %xend=% 0 %yend=% 0 %label=% labels) + |
| 148 | + annotate("path", col = circle.col, |
| 149 | + x = cos(seq(0, 2*pi, length.out = 100)), |
| 150 | + y = sin(seq(0, 2*pi, length.out = 100))) + |
| 151 | + geom_hline(yintercept = 0, col = circle.col) + |
| 152 | + geom_vline(xintercept = 0, col = circle.col) + |
| 153 | + geom_segment(arrow = arrow(length = unit(ar.length, "inches"), |
| 154 | + ends = "first")) + |
| 155 | + ggrepel::geom_text_repel(hjust = "outward", vjust = "outward") + |
| 156 | + coord_fixed(ratio = 1) + |
| 157 | + labs(x = contribs[1], y = contribs[2], title = title), |
| 158 | +
|
| 159 | + correlations = object %>.% # Correlations plot |
| 160 | + Correlation(.) %>.% |
| 161 | + as_tibble(., rownames = "labels") %>.% |
| 162 | + select(., c(1, choices + 1)) %>.% |
| 163 | + set_names(., c("labels", "x", "y")) %>.% |
| 164 | + chart(data = ., y ~ x %xend=% 0 %yend=% 0 %label=% labels) + |
| 165 | + annotate("path", col = circle.col, |
| 166 | + x = cos(seq(0, 2*pi, length.out = 100)), |
| 167 | + y = sin(seq(0, 2*pi, length.out = 100))) + |
| 168 | + geom_hline(yintercept = 0, col = circle.col) + |
| 169 | + geom_vline(xintercept = 0, col = circle.col) + |
| 170 | + geom_segment(arrow = arrow(length = unit(ar.length, "inches"), |
| 171 | + ends = "first")) + |
| 172 | + ggrepel::geom_text_repel(hjust = "outward", vjust = "outward") + |
| 173 | + coord_fixed(ratio = 1) + |
| 174 | + labs(x = contribs[1], y = contribs[2], title = title), |
| 175 | +
|
| 176 | + scores = scores %>.% # Plot of the individuals |
| 177 | + chart(data = ., scores_formula) + |
| 178 | + geom_hline(yintercept = 0, col = circle.col) + |
| 179 | + geom_vline(xintercept = 0, col = circle.col) + |
| 180 | + coord_fixed(ratio = 1, xlim = lims$x, ylim = lims$y, expand = TRUE) + |
| 181 | + labs(x = contribs[1], y = contribs[2], title = title) + |
| 182 | + theme(legend.position = "none"), |
| 183 | +
|
| 184 | + biplot = object %>.% # Biplot using ggfortify function |
| 185 | + as.prcomp(.) %>.% |
| 186 | + ggfortify:::autoplot.prcomp(., x = choices[1], y = choices[2], |
| 187 | + scale = scale, size = -1, label = TRUE, loadings = TRUE, |
| 188 | + loadings.label = TRUE) + |
| 189 | + geom_hline(yintercept = 0, col = circle.col) + |
| 190 | + geom_vline(xintercept = 0, col = circle.col) + |
| 191 | + theme_sciviews() + |
| 192 | + labs(x = contribs[1], y = contribs[2], title = title), |
| 193 | +
|
| 194 | + stop("Unrecognized type, must be 'screeplot', 'altscreeplot', loadings', 'correlations', 'scores' or 'biplot'") |
| 195 | + ) |
| 196 | +
|
| 197 | + if (type == "scores") { |
| 198 | + if (isTRUE(repel)) { |
| 199 | + res <- res + geom_point() + ggrepel::geom_text_repel() |
| 200 | + } else {# Use text |
| 201 | + res <- res + geom_text() |
| 202 | + } |
| 203 | + } |
| 204 | +
|
| 205 | + if (!missing(xlab)) |
| 206 | + res <- res + xlab(xlab) |
| 207 | + if (!missing(ylab)) |
| 208 | + res <- res + ylab(ylab) |
| 209 | + res |
| 210 | +} |
| 211 | +
|
| 212 | +chart.pcomp <- function(data, choices = 1L:2L, name = deparse(substitute(data)), |
| 213 | +..., type = NULL, env = parent.frame()) |
| 214 | + autoplot.pcomp(data, choices = choices, name = name, ..., type = type, env = env) |
| 215 | +class(chart.pcomp) <- c("function", "subsettable_type") |
| 216 | +
|
| 217 | +
|
| 218 | +``` |
| 219 | + |
| 220 | +```{r, echo=FALSE} |
| 221 | +BioDataScience2::learnr_banner() |
| 222 | +``` |
| 223 | + |
| 224 | +```{r, context="server"} |
| 225 | +BioDataScience2::learnr_server(input, output, session) |
| 226 | +``` |
| 227 | + |
| 228 | +---- |
| 229 | + |
| 230 | +## Objectif |
| 231 | + |
| 232 | +L'Analyse en Composantes Principales (ACP) est une méthode des statistiques exploratoires très utilisée dans le domaine de la biologie et de l'écologie. Il est donc primordial de comprendre cette analyse. |
| 233 | + |
| 234 | +Le tutoriel learnr sur l'analyse en composantes principales que vous vous apprêtez à réaliser vous permettra de\ : |
| 235 | + |
| 236 | +- ___ |
| 237 | + |
| 238 | +Avant toute chose, assurez vous d'avoir bien compris le contenu du [module 7](https://wp.sciviews.org/sdd-umons2/?iframe=wp.sciviews.org/sdd-umons2-2020/acp-afc.html) du cours et en particulier la [section 7.1](https://wp.sciviews.org/sdd-umons2/?iframe=wp.sciviews.org/sdd-umons2-2020/analyse-en-composantes-principales.html). |
| 239 | + |
| 240 | +## Manchôts de l'Antarctique |
| 241 | + |
| 242 | +```{r} |
| 243 | +penguins <- read("penguins", package = "palmerpenguins") # importation des données |
| 244 | +skimr::skim(penguins) |
| 245 | +``` |
| 246 | + |
| 247 | +```{r} |
| 248 | +naniar::vis_miss(penguins) # Visualisation des données |
| 249 | +penguins <- drop_na(penguins) # Eliminer les lignes vides |
| 250 | +``` |
| 251 | + |
| 252 | +```{r corplot_h2, exercise=TRUE} |
| 253 | +peng_corr <- ___(penguins[___:___]) |
| 254 | +plot(___) |
| 255 | +``` |
| 256 | + |
| 257 | +```{r corplot_h2-hint-1} |
| 258 | +peng_corr <- correlation(penguins[___:___]) |
| 259 | +plot(peng_corr) |
| 260 | +
|
| 261 | +#### ATTENTION: Hint suivant = solution !#### |
| 262 | +``` |
| 263 | + |
| 264 | +```{r corplot_h2-solution} |
| 265 | +peng_corr <- correlation(penguins[3:6]) |
| 266 | +plot(peng_corr) |
| 267 | +``` |
| 268 | + |
| 269 | +```{r corplot_h2-check} |
| 270 | +grade_code("Ce graphique est très intéressant pour visualiser les corrélations entre les différentes variables numériques. Par défaut, la fonction utilise la méthode de Pearson qui met en avant les corrélations linéaires.") |
| 271 | +``` |
| 272 | + |
| 273 | +```{r pca_h2, exercise=TRUE} |
| 274 | +___ %>.% |
| 275 | + ___(., ___:___) %>.% |
| 276 | + pca(., scale = ___) -> penguins_pca |
| 277 | +summary(penguins_pca) |
| 278 | +``` |
| 279 | + |
| 280 | +```{r pca_h2-hint-1} |
| 281 | +___ %>.% |
| 282 | + select(., ___:___) %>.% |
| 283 | + pca(., scale = TRUE) -> penguins_pca |
| 284 | +summary(penguins_pca) |
| 285 | +#### ATTENTION: Hint suivant = solution !#### |
| 286 | +``` |
| 287 | + |
| 288 | +```{r pca_h2-solution} |
| 289 | +penguins %>.% |
| 290 | + select(., 3:6) %>.% |
| 291 | + pca(., scale = TRUE) -> penguins_pca |
| 292 | +summary(penguins_pca) |
| 293 | +``` |
| 294 | + |
| 295 | +```{r pca_h2-check} |
| 296 | +grade_code("Ce graphique est très intéressant pour visualiser les corrélations entre les différentes variables numériques. Par défaut, la fonction utilise la méthode de Pearson qui met en avant les corrélations linéaires.") |
| 297 | +``` |
| 298 | + |
| 299 | +```{r variante_quiz} |
| 300 | +question("Quelle est la proportion cumulée de la variance des deux premières composantes principales de cette analyse en composante principale ?", |
| 301 | + answer("0.686"), |
| 302 | + answer("0.0922"), |
| 303 | + answer("0.881", correct = TRUE), |
| 304 | + allow_retry = TRUE, |
| 305 | + correct = "C'est exact ! La variance cumulée des deux premiers axes correspond à 0.881. Ces deux premiers axes proposent une bonne part de la variance.", |
| 306 | + incorrect = "La proportion de la variance et la proportion de la variance cumulée se trouve dans le tableau `Importance of components`." |
| 307 | + ) |
| 308 | +``` |
| 309 | + |
| 310 | +```{r, eval=FALSE, echo = TRUE} |
| 311 | +chart$scree(penguins_pca) |
| 312 | +chart$loadings(penguins_pca, choices = c(1, 2)) |
| 313 | +chart$scores(penguins_pca, choices = c(1, 2)) |
| 314 | +
|
| 315 | +chart$scores(peng_pca, choices = c(1, 2), labels = peng$species) + |
| 316 | + stat_ellipse() |
| 317 | +
|
| 318 | +chart$scores(peng_pca, choices = c(2, 3), labels = peng$species) + |
| 319 | + stat_ellipse() |
| 320 | +``` |
| 321 | + |
| 322 | + |
| 323 | +## Interprétation de l'ACP |
| 324 | + |
| 325 | + |
| 326 | +## Conclusion |
| 327 | + |
| 328 | +Vous êtes arrivé à la fin de auto-évaluation relative aux cartes auto-adaptative. Vous avez acquis de nouveaux outils vous permettant l'analyse et l'interprétation d'un jeu de données multivariées. Essayez maintenant d'appliquer ces techniques dans une assignation GitHub. |
| 329 | + |
| 330 | +```{r comm_noscore, echo=FALSE} |
| 331 | +question_text( |
| 332 | + "Laissez-nous vos impressions sur cet outil pédagogique", |
| 333 | + answer("", TRUE, message = "Pas de commentaires... C'est bien aussi."), |
| 334 | + incorrect = "Vos commentaires sont enregistrés.", |
| 335 | + placeholder = "Entrez vos commentaires ici...", |
| 336 | + allow_retry = TRUE |
| 337 | +) |
| 338 | +``` |
0 commit comments