Skip to content

Commit b4fddb4

Browse files
First step B07La_pca
1 parent 4dd5b3d commit b4fddb4

File tree

1 file changed

+338
-0
lines changed

1 file changed

+338
-0
lines changed
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
---
2+
title: "Analyse en composantes principales"
3+
author: "Guyliann Engels & Philippe Grosjean"
4+
description: "**SDD II Module 7** : Analyse en Composantes Principales (ACP)"
5+
tutorial:
6+
id: "B07La_pca"
7+
version: 0.0.9999/___
8+
output:
9+
learnr::tutorial:
10+
progressive: true
11+
runtime: shiny_prerendered
12+
---
13+
14+
```{r setup, include=FALSE}
15+
BioDataScience2::learnr_setup()
16+
# Package utile
17+
SciViews::R()
18+
# Preparation of the dataset ------
19+
penguins <- read("penguins", package = "palmerpenguins", lang = "fr")
20+
21+
# Functions utiles
22+
# pca for SciViews, version 1.0.0
23+
# Copyright (c) 2020, Philippe Grosjean (phgrosjean@sciviews.org)
24+
library(broom)
25+
26+
# broom implements only methods for prcomp objects, not princomp, while pcomp
27+
# is compatible with princomp... but prcomp is simpler. So, conversion is easy
28+
as.prcomp <- function(x, ...)
29+
UseMethod("as.prcomp")
30+
31+
as.prcomp.default <- function(x, ...)
32+
stop("No method to convert this object into a 'prcomp'")
33+
34+
as.prcomp.prcomp <- function(x, ...)
35+
x
36+
37+
as.prcomp.princomp <- function(x, ...)
38+
structure(list(sdev = as.numeric(x$sdev), rotation = unclass(x$loadings),
39+
center = x$center, scale = x$scale, x = as.matrix(x$scores)),
40+
class = "prcomp")
41+
42+
# Comparison of pcomp() -> as.prcomp() with prcomp() directly
43+
# Almost the same, only no rownames for x (is it important?)
44+
#iris_prcomp_pcomp <- as.prcomp(pcomp(iris[, -5], scale = TRUE))
45+
#iris_prcomp <- prcomp(iris[, -5], scale = TRUE)
46+
47+
# Now, broom methods can be defined simply by converting into prcomp objects
48+
augment.princomp <- function(x, data = NULL, newdata, ...)
49+
if (missing(newdata)) {
50+
augment(as.prcomp(x), data = data, ...)
51+
} else {
52+
augment(as.prcomp(x), data = data, newdata = newdata, ...)
53+
}
54+
55+
tidy.princomp <- function(x, matrix = "u", ...)
56+
tidy(as.prcomp(x), matrix = matrix, ...)
57+
58+
# There is no glance.prcomp() method
59+
60+
# There is a problem with pcomp() that returns a data.frame in scores,
61+
# while it is a matrix in the original princomp object. pca() corrects this
62+
pca <- function(x, ...) {
63+
res <- SciViews::pcomp(x, ...)
64+
# Change scores into a matrix
65+
res$scores <- as.matrix(res$scores)
66+
res
67+
}
68+
69+
scale_axes <- function(data, aspect.ratio = 1) {
70+
range_x <- range(data[, 1])
71+
span_x <- abs(max(range_x) - min(range_x))
72+
range_y <- range(data[, 2])
73+
span_y <- abs(max(range_y) - min(range_y))
74+
if ((span_y / aspect.ratio) > span_x) {
75+
# Adjust range_x
76+
span_x_2 <- span_y / aspect.ratio / 2
77+
range_x_mid <- sum(range_x) / 2
78+
range_x <- c(range_x_mid - span_x_2, range_x_mid + span_x_2)
79+
} else {
80+
# Adjust range_y
81+
span_y_2 <- span_x * aspect.ratio / 2
82+
range_y_mid <- sum(range_y) / 2
83+
range_y <- c(range_y_mid - span_y_2, range_y_mid + span_y_2)
84+
}
85+
list(x = range_x, y = range_y)
86+
}
87+
88+
autoplot.pcomp <- function(object,
89+
type = c("screeplot", "altscreeplot", "loadings", "correlations", "scores", "biplot"),
90+
choices = 1L:2L, name = deparse(substitute(object)), ar.length = 0.1,
91+
circle.col = "gray", col = "black", fill = "gray", scale = 1, aspect.ratio = 1,
92+
repel = FALSE, labels, title, xlab, ylab, ...) {
93+
type = match.arg(type)
94+
95+
if (missing(title))
96+
title <- paste(name, type, sep = " - ")
97+
98+
contribs <- paste0(names(object$sdev), " (",
99+
round((object$sdev^2/object$totdev^2) * 100, digits = 1), "%)")[choices]
100+
101+
scores <- as.data.frame(object$scores[, choices])
102+
names(scores) <- c("x", "y")
103+
if (!missing(labels)) {
104+
if (length(labels) != nrow(scores))
105+
stop("You must provide a character vector of length ", nrow(scores),
106+
" for 'labels'")
107+
scores$labels <- labels
108+
} else {# Default labels are row numbers
109+
scores$labels <- 1:nrow(scores)
110+
}
111+
112+
lims <- scale_axes(scores, aspect.ratio = aspect.ratio)
113+
114+
if (!missing(col)) {
115+
if (length(col) != nrow(scores))
116+
stop("You must provide a vector of length ", nrow(scores), " for 'col'")
117+
scores$color <- col
118+
scores_formula <- y ~ x %col=% color %label=% labels
119+
} else {
120+
if (missing(labels)) {
121+
scores_formula <- y ~ x %label=% labels
122+
} else {
123+
scores_formula <- y ~ x %col=% labels %label=% labels
124+
}
125+
}
126+
127+
res <- switch(type,
128+
screeplot = object %>.% # Classical screeplot
129+
tidy(., "pcs") %>.%
130+
chart(data = ., std.dev^2 ~ PC) +
131+
geom_col(col = col, fill = fill) +
132+
labs(y = "Variances", title = title),
133+
134+
altscreeplot = object %>.% # screeplot represented by dots and lines
135+
tidy(., "pcs") %>.%
136+
chart(data = ., std.dev^2 ~ PC) +
137+
geom_line(col = col) +
138+
geom_point(col = "white", fill = col, size = 2, shape = 21, stroke = 3) +
139+
labs(y = "Variances", title = title),
140+
141+
loadings = object %>.% # Plots of the variables
142+
tidy(., "variables") %>.%
143+
spread(., key = PC, value = value) %>.%
144+
#rename_if(., is.numeric, function(x) paste0("PC", x)) %>.%
145+
select(., c(1, choices + 1)) %>.%
146+
set_names(., c("labels", "x", "y")) %>.%
147+
chart(data = ., y ~ x %xend=% 0 %yend=% 0 %label=% labels) +
148+
annotate("path", col = circle.col,
149+
x = cos(seq(0, 2*pi, length.out = 100)),
150+
y = sin(seq(0, 2*pi, length.out = 100))) +
151+
geom_hline(yintercept = 0, col = circle.col) +
152+
geom_vline(xintercept = 0, col = circle.col) +
153+
geom_segment(arrow = arrow(length = unit(ar.length, "inches"),
154+
ends = "first")) +
155+
ggrepel::geom_text_repel(hjust = "outward", vjust = "outward") +
156+
coord_fixed(ratio = 1) +
157+
labs(x = contribs[1], y = contribs[2], title = title),
158+
159+
correlations = object %>.% # Correlations plot
160+
Correlation(.) %>.%
161+
as_tibble(., rownames = "labels") %>.%
162+
select(., c(1, choices + 1)) %>.%
163+
set_names(., c("labels", "x", "y")) %>.%
164+
chart(data = ., y ~ x %xend=% 0 %yend=% 0 %label=% labels) +
165+
annotate("path", col = circle.col,
166+
x = cos(seq(0, 2*pi, length.out = 100)),
167+
y = sin(seq(0, 2*pi, length.out = 100))) +
168+
geom_hline(yintercept = 0, col = circle.col) +
169+
geom_vline(xintercept = 0, col = circle.col) +
170+
geom_segment(arrow = arrow(length = unit(ar.length, "inches"),
171+
ends = "first")) +
172+
ggrepel::geom_text_repel(hjust = "outward", vjust = "outward") +
173+
coord_fixed(ratio = 1) +
174+
labs(x = contribs[1], y = contribs[2], title = title),
175+
176+
scores = scores %>.% # Plot of the individuals
177+
chart(data = ., scores_formula) +
178+
geom_hline(yintercept = 0, col = circle.col) +
179+
geom_vline(xintercept = 0, col = circle.col) +
180+
coord_fixed(ratio = 1, xlim = lims$x, ylim = lims$y, expand = TRUE) +
181+
labs(x = contribs[1], y = contribs[2], title = title) +
182+
theme(legend.position = "none"),
183+
184+
biplot = object %>.% # Biplot using ggfortify function
185+
as.prcomp(.) %>.%
186+
ggfortify:::autoplot.prcomp(., x = choices[1], y = choices[2],
187+
scale = scale, size = -1, label = TRUE, loadings = TRUE,
188+
loadings.label = TRUE) +
189+
geom_hline(yintercept = 0, col = circle.col) +
190+
geom_vline(xintercept = 0, col = circle.col) +
191+
theme_sciviews() +
192+
labs(x = contribs[1], y = contribs[2], title = title),
193+
194+
stop("Unrecognized type, must be 'screeplot', 'altscreeplot', loadings', 'correlations', 'scores' or 'biplot'")
195+
)
196+
197+
if (type == "scores") {
198+
if (isTRUE(repel)) {
199+
res <- res + geom_point() + ggrepel::geom_text_repel()
200+
} else {# Use text
201+
res <- res + geom_text()
202+
}
203+
}
204+
205+
if (!missing(xlab))
206+
res <- res + xlab(xlab)
207+
if (!missing(ylab))
208+
res <- res + ylab(ylab)
209+
res
210+
}
211+
212+
chart.pcomp <- function(data, choices = 1L:2L, name = deparse(substitute(data)),
213+
..., type = NULL, env = parent.frame())
214+
autoplot.pcomp(data, choices = choices, name = name, ..., type = type, env = env)
215+
class(chart.pcomp) <- c("function", "subsettable_type")
216+
217+
218+
```
219+
220+
```{r, echo=FALSE}
221+
BioDataScience2::learnr_banner()
222+
```
223+
224+
```{r, context="server"}
225+
BioDataScience2::learnr_server(input, output, session)
226+
```
227+
228+
----
229+
230+
## Objectif
231+
232+
L'Analyse en Composantes Principales (ACP) est une méthode des statistiques exploratoires très utilisée dans le domaine de la biologie et de l'écologie. Il est donc primordial de comprendre cette analyse.
233+
234+
Le tutoriel learnr sur l'analyse en composantes principales que vous vous apprêtez à réaliser vous permettra de\ :
235+
236+
- ___
237+
238+
Avant toute chose, assurez vous d'avoir bien compris le contenu du [module 7](https://wp.sciviews.org/sdd-umons2/?iframe=wp.sciviews.org/sdd-umons2-2020/acp-afc.html) du cours et en particulier la [section 7.1](https://wp.sciviews.org/sdd-umons2/?iframe=wp.sciviews.org/sdd-umons2-2020/analyse-en-composantes-principales.html).
239+
240+
## Manchôts de l'Antarctique
241+
242+
```{r}
243+
penguins <- read("penguins", package = "palmerpenguins") # importation des données
244+
skimr::skim(penguins)
245+
```
246+
247+
```{r}
248+
naniar::vis_miss(penguins) # Visualisation des données
249+
penguins <- drop_na(penguins) # Eliminer les lignes vides
250+
```
251+
252+
```{r corplot_h2, exercise=TRUE}
253+
peng_corr <- ___(penguins[___:___])
254+
plot(___)
255+
```
256+
257+
```{r corplot_h2-hint-1}
258+
peng_corr <- correlation(penguins[___:___])
259+
plot(peng_corr)
260+
261+
#### ATTENTION: Hint suivant = solution !####
262+
```
263+
264+
```{r corplot_h2-solution}
265+
peng_corr <- correlation(penguins[3:6])
266+
plot(peng_corr)
267+
```
268+
269+
```{r corplot_h2-check}
270+
grade_code("Ce graphique est très intéressant pour visualiser les corrélations entre les différentes variables numériques. Par défaut, la fonction utilise la méthode de Pearson qui met en avant les corrélations linéaires.")
271+
```
272+
273+
```{r pca_h2, exercise=TRUE}
274+
___ %>.%
275+
___(., ___:___) %>.%
276+
pca(., scale = ___) -> penguins_pca
277+
summary(penguins_pca)
278+
```
279+
280+
```{r pca_h2-hint-1}
281+
___ %>.%
282+
select(., ___:___) %>.%
283+
pca(., scale = TRUE) -> penguins_pca
284+
summary(penguins_pca)
285+
#### ATTENTION: Hint suivant = solution !####
286+
```
287+
288+
```{r pca_h2-solution}
289+
penguins %>.%
290+
select(., 3:6) %>.%
291+
pca(., scale = TRUE) -> penguins_pca
292+
summary(penguins_pca)
293+
```
294+
295+
```{r pca_h2-check}
296+
grade_code("Ce graphique est très intéressant pour visualiser les corrélations entre les différentes variables numériques. Par défaut, la fonction utilise la méthode de Pearson qui met en avant les corrélations linéaires.")
297+
```
298+
299+
```{r variante_quiz}
300+
question("Quelle est la proportion cumulée de la variance des deux premières composantes principales de cette analyse en composante principale ?",
301+
answer("0.686"),
302+
answer("0.0922"),
303+
answer("0.881", correct = TRUE),
304+
allow_retry = TRUE,
305+
correct = "C'est exact ! La variance cumulée des deux premiers axes correspond à 0.881. Ces deux premiers axes proposent une bonne part de la variance.",
306+
incorrect = "La proportion de la variance et la proportion de la variance cumulée se trouve dans le tableau `Importance of components`."
307+
)
308+
```
309+
310+
```{r, eval=FALSE, echo = TRUE}
311+
chart$scree(penguins_pca)
312+
chart$loadings(penguins_pca, choices = c(1, 2))
313+
chart$scores(penguins_pca, choices = c(1, 2))
314+
315+
chart$scores(peng_pca, choices = c(1, 2), labels = peng$species) +
316+
stat_ellipse()
317+
318+
chart$scores(peng_pca, choices = c(2, 3), labels = peng$species) +
319+
stat_ellipse()
320+
```
321+
322+
323+
## Interprétation de l'ACP
324+
325+
326+
## Conclusion
327+
328+
Vous êtes arrivé à la fin de auto-évaluation relative aux cartes auto-adaptative. Vous avez acquis de nouveaux outils vous permettant l'analyse et l'interprétation d'un jeu de données multivariées. Essayez maintenant d'appliquer ces techniques dans une assignation GitHub.
329+
330+
```{r comm_noscore, echo=FALSE}
331+
question_text(
332+
"Laissez-nous vos impressions sur cet outil pédagogique",
333+
answer("", TRUE, message = "Pas de commentaires... C'est bien aussi."),
334+
incorrect = "Vos commentaires sont enregistrés.",
335+
placeholder = "Entrez vos commentaires ici...",
336+
allow_retry = TRUE
337+
)
338+
```

0 commit comments

Comments
 (0)