Code
library(tidyverse)
library(corrplot)
library(qqplotr)
library(yardstick) # métricas de desempeño
Ejemplo Zarigüeyas
library(tidyverse)
library(corrplot)
library(qqplotr)
library(yardstick) # métricas de desempeño
<- read_csv("../datos/zarigueyas.csv")
datos datos
<-
cor_pearson |>
datos select(edad:vientre) |>
cor(method = "pearson")
corrplot(
cor_pearson, diag = FALSE,
type = "lower",
tl.srt = 45
)
<-
cor_spearman |>
datos select(edad:vientre) |>
cor(method = "spearman")
corrplot(
cor_spearman, diag = FALSE,
type = "lower",
tl.srt = 45
)
|>
datos select(edad:vientre) |>
pivot_longer(cols = everything()) |>
ggplot(aes(x = value)) +
facet_wrap(~ name, scales = "free") +
geom_histogram(color = "black")
|>
datos select(edad:vientre) |>
pivot_longer(cols = everything()) |>
ggplot(aes(x = value)) +
facet_wrap(~ name, scales = "free") +
geom_density()
|>
datos select(edad:vientre) |>
pivot_longer(cols = everything()) |>
ggplot(aes(sample = value)) +
facet_wrap(~ name, scales = "free") +
stat_qq_band() +
stat_qq_point() +
stat_qq_line()
|>
datos select(edad:vientre) |>
pivot_longer(cols = -edad) |>
ggplot(aes(x = value, y = edad)) +
facet_wrap(~ name, scales = "free_x") +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_smooth(se = FALSE, color = "red")
# Regresión lineal simple
<- lm(edad ~ longitud_cabeza, data = datos)
modelo |> summary() modelo
Call:
lm(formula = edad ~ longitud_cabeza, data = datos)
Residuals:
Min 1Q Median 3Q Max
-3.6815 -1.3815 -0.2424 1.1479 5.0761
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -12.80894 4.79274 -2.673 0.008804 **
longitud_cabeza 0.17934 0.05165 3.472 0.000766 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.817 on 99 degrees of freedom
Multiple R-squared: 0.1086, Adjusted R-squared: 0.09957
F-statistic: 12.06 on 1 and 99 DF, p-value: 0.0007661
<-
datos_modelo |>
datos select(edad, longitud_cabeza)
datos_modelo
# La predicción de la primera fila
-12.80894 + (0.17934 * 94.1)
[1] 4.066954
8 - 4.066954
[1] 3.933046
$residuals modelo
1 2 3 4 5 6
3.932640377 2.219591318 1.950574811 2.094050281 -1.601064344 -2.888015285
7 8 9 10 11 12
-2.282572829 1.807099340 5.058181413 2.345132354 5.076115847 0.789164906
13 14 15 16 17 18
0.753296039 -1.300507263 1.147853582 0.381001222 -3.174966226 -1.959753020
19 20 21 22 23 24
0.878837075 -0.192900660 -1.390179432 -1.461917167 0.219591318 -2.121162925
25 26 27 28 29 30
-1.372244998 2.591886135 -1.421720007 -0.013556322 -0.834211984 -1.708670947
31 32 33 34 35 36
-0.834211984 -0.103228491 -0.583129911 -1.439654440 -0.121162925 3.076115847
37 38 39 40 41 42
-1.206506801 3.237525751 -1.381522846 -0.511392175 0.510870551 -1.332047838
43 44 45 46 47 48
-0.457588874 -1.349982271 0.125590856 -0.300507263 1.381001222 0.663623870
49 50 51 52 53 54
1.304935194 -0.888015285 2.430476231 -3.681458665 -2.107556783 -0.246703961
55 56 57 58 59 60
-1.139097358 -3.573852062 -1.565195477 2.645689436 -0.565195477 2.309263487
61 62 63 64 65 66
-1.569523769 1.040246980 -0.403785573 0.076115847 0.932640377 0.233197459
67 68 69 70 71 72
3.327197920 2.165788016 -1.596736052 -0.986965303 0.901099802 1.986443678
73 74 75 76 77 78
1.237525751 2.022312546 -1.704342655 -1.578801618 -1.740211522 -0.439654440
79 80 81 82 83 84
1.385329514 -0.332047838 -0.045096897 -0.242375669 -0.009228029 -2.856474710
85 86 87 88 89 90
-1.260310102 1.287000760 -0.798343116 -1.730933674 -1.457588874 -0.188572367
91 92 93 94 95 96
3.345132354 0.381001222 -0.192900660 -0.511392175 1.094050281 -0.923884153
97 98 99 100 101
-2.242375669 -2.080965764 2.237525751 0.398935656 -0.977687454
$fitted.values modelo
1 2 3 4 5 6 7 8
4.067360 3.780409 4.049425 3.905950 3.601064 3.888015 4.282573 4.192901
9 10 11 12 13 14 15 16
3.941819 3.654868 3.923884 4.210835 4.246704 4.300507 3.852146 3.618999
17 18 19 20 21 22 23 24
4.174966 3.959753 4.121163 4.192901 4.390179 4.461917 3.780409 4.121163
25 26 27 28 29 30 31 32
4.372245 4.408114 3.421720 4.013556 3.834212 3.708671 3.834212 4.103228
33 34 35 36 37 38 39 40
3.583130 3.439654 4.121163 3.923884 3.206507 3.762474 2.381523 3.511392
41 42 43 44 45 46 47 48
2.489129 3.332048 3.457589 3.349982 4.874409 4.300507 3.618999 4.336376
49 50 51 52 53 54 55 56
4.695065 3.888015 4.569524 5.681459 5.107557 4.246704 4.139097 5.573852
57 58 59 60 61 62 63 64
3.565195 4.354311 3.565195 3.690737 4.569524 3.959753 3.403786 3.923884
65 66 67 68 69 70 71 72
4.067360 4.766803 3.672802 3.834212 2.596736 1.986965 3.098900 4.013556
73 74 75 76 77 78 79 80
3.762474 3.977687 2.704343 2.578802 2.740212 3.439654 2.614670 3.332048
81 82 83 84 85 86 87 88
3.045097 3.242376 3.009228 4.856475 3.260310 4.712999 3.798343 4.730934
89 90 91 92 93 94 95 96
3.457589 3.188572 3.654868 3.618999 4.192901 3.511392 3.905950 3.923884
97 98 99 100 101
3.242376 3.080966 3.762474 3.601064 3.977687
$residuals |> summary() modelo
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.6815 -1.3815 -0.2424 0.0000 1.1479 5.0761
<-
df_diagnosticos |>
datos_modelo mutate(predicho = modelo$fitted.values,
residual = edad - predicho)
df_diagnosticos
|>
df_diagnosticos ggplot(aes(x = residual)) +
geom_histogram(color = "black")
|>
df_diagnosticos ggplot(aes(x = residual)) +
geom_density(color = "black")
|>
df_diagnosticos ggplot(aes(sample = residual)) +
stat_qq_band() +
stat_qq() +
stat_qq_line()
|>
df_diagnosticos ggplot(aes(x = predicho, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
rsq_trad_vec(truth = df_diagnosticos$edad,
estimate = df_diagnosticos$predicho)
[1] 0.1085734
<-
correlacion cor(df_diagnosticos$edad,
$longitud_cabeza,
df_diagnosticosmethod = "pearson")
^2 correlacion
[1] 0.1085734
rmse_vec(truth = df_diagnosticos$edad,
estimate = df_diagnosticos$predicho)
[1] 1.799252
AIC(modelo)
[1] 411.2746
<- lm(edad ~ vientre, data = datos)
modelo2 summary(modelo2)
Call:
lm(formula = edad ~ vientre, data = datos)
Residuals:
Min 1Q Median 3Q Max
-2.913 -1.407 -0.420 1.340 5.087
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.44667 2.15541 -2.063 0.04173 *
vientre 0.25333 0.06581 3.849 0.00021 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.795 on 99 degrees of freedom
Multiple R-squared: 0.1302, Adjusted R-squared: 0.1214
F-statistic: 14.82 on 1 and 99 DF, p-value: 0.00021
rsq_trad_vec(truth = df_diagnosticos$edad,
estimate = modelo2$fitted.values)
[1] 0.1301884
<-
correlacion2 cor(datos$edad,
$vientre,
datosmethod = "pearson")
^2 correlacion2
[1] 0.1301884
rmse_vec(truth = datos$edad,
estimate = modelo2$fitted.values)
[1] 1.777305
AIC(modelo, modelo2)
par(mfrow = c(2, 2))
plot(modelo2)
<-
df_nuevas data.frame(
longitud_cabeza = c(81.34, 90.89, 101.23),
vientre = c(27.56, 31.45, 37.8)
)
predict(object = modelo, newdata = df_nuevas)
1 2 3
1.778926 3.491664 5.346085
predict(object = modelo2, newdata = df_nuevas)
1 2 3
2.535200 3.520667 5.129333
log10(23)
[1] 1.361728
10 ^ 1.361728
[1] 23.00001
log2(23)
[1] 4.523562
2 ^ 4.523562
[1] 23
log(23)
[1] 3.135494
exp(3.135494)
[1] 23
log1p(23)
[1] 3.178054
expm1(3.178054)
[1] 23
<- lm(edad ~ log(vientre), data = datos) modelo_log1
rmse_vec(truth = datos$edad,
estimate = modelo_log1$fitted.values)
[1] 1.771642
<- lm(log(edad) ~ vientre, data = datos) modelo_log2
rmse_vec(truth = datos$edad,
estimate = modelo_log2$fitted.values)
[1] 3.200328
rmse_vec(truth = datos$edad,
estimate = exp(modelo_log2$fitted.values))
[1] 1.855764
<- lm(log(edad) ~ log(vientre), data = datos) modelo_log3
rmse_vec(truth = datos$edad,
estimate = modelo_log3$fitted.values)
[1] 3.198304
rmse_vec(truth = datos$edad,
estimate = exp(modelo_log3$fitted.values))
[1] 1.844843
<- lm(edad ~ sexo, data = datos)
modelo_cat1 |> summary() modelo_cat1
Call:
lm(formula = edad ~ sexo, data = datos)
Residuals:
Min 1Q Median 3Q Max
-2.9524 -1.7288 -0.7288 1.2712 5.0476
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.9524 0.2965 13.330 <2e-16 ***
sexom -0.2236 0.3879 -0.576 0.566
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.922 on 99 degrees of freedom
Multiple R-squared: 0.003343, Adjusted R-squared: -0.006724
F-statistic: 0.3321 on 1 and 99 DF, p-value: 0.5657
model.matrix(edad ~ sexo, data = datos)
(Intercept) sexom
1 1 1
2 1 0
3 1 0
4 1 0
5 1 0
6 1 0
7 1 1
8 1 0
9 1 0
10 1 0
11 1 0
12 1 0
13 1 1
14 1 1
15 1 1
16 1 1
17 1 0
18 1 1
19 1 0
20 1 0
21 1 0
22 1 1
23 1 0
24 1 1
25 1 1
26 1 1
27 1 0
28 1 1
29 1 0
30 1 0
31 1 1
32 1 0
33 1 1
34 1 1
35 1 1
36 1 1
37 1 0
38 1 1
39 1 0
40 1 0
41 1 1
42 1 0
43 1 1
44 1 1
45 1 1
46 1 1
47 1 0
48 1 0
49 1 1
50 1 0
51 1 1
52 1 1
53 1 1
54 1 0
55 1 1
56 1 1
57 1 0
58 1 1
59 1 0
60 1 0
61 1 0
62 1 0
63 1 0
64 1 1
65 1 1
66 1 1
67 1 0
68 1 1
69 1 1
70 1 1
71 1 0
72 1 1
73 1 1
74 1 1
75 1 1
76 1 1
77 1 1
78 1 1
79 1 0
80 1 0
81 1 1
82 1 1
83 1 0
84 1 1
85 1 0
86 1 1
87 1 1
88 1 1
89 1 1
90 1 1
91 1 1
92 1 1
93 1 1
94 1 1
95 1 1
96 1 0
97 1 1
98 1 1
99 1 0
100 1 1
101 1 0
attr(,"assign")
[1] 0 1
attr(,"contrasts")
attr(,"contrasts")$sexo
[1] "contr.treatment"
<-
nuevos_datos data.frame(sexo = "m")
predict(modelo_cat1, newdata = nuevos_datos)
1
3.728814
\(\hat{y} = \beta_0 + (\beta_1 * Sexo_M)\)
\(\hat{edad} = 3.9524 + (-0.2236 * 1) = 3.728814\)
<-
datos2 |>
datos mutate(longcabeza_cat = cut(longitud_cabeza, breaks = 5))
|> head() datos2
unique(datos2$longcabeza_cat)
[1] (90.7,94.9] (94.9,99] (86.6,90.7] (82.5,86.6] (99,103]
Levels: (82.5,86.6] (86.6,90.7] (90.7,94.9] (94.9,99] (99,103]
<- lm(edad ~ longcabeza_cat, data = datos2)
modelo_cat2 |> summary() modelo_cat2
Call:
lm(formula = edad ~ longcabeza_cat, data = datos2)
Residuals:
Min 1Q Median 3Q Max
-3.5000 -1.4000 -0.3684 0.6667 4.5000
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.7143 0.6200 2.765 0.006827 **
longcabeza_cat(86.6,90.7] 0.6541 0.7253 0.902 0.369358
longcabeza_cat(90.7,94.9] 2.7857 0.6604 4.218 5.58e-05 ***
longcabeza_cat(94.9,99] 2.6857 0.7204 3.728 0.000326 ***
longcabeza_cat(99,103] 0.6190 1.1320 0.547 0.585729
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.64 on 96 degrees of freedom
Multiple R-squared: 0.2957, Adjusted R-squared: 0.2664
F-statistic: 10.08 on 4 and 96 DF, p-value: 7.462e-07
<-
nuevos_datos2 data.frame(longitud_cabeza = 95.3,
longcabeza_cat = "(94.9,99]")
predict(modelo_cat2, newdata = nuevos_datos2)
1
4.4
predict(modelo, newdata = nuevos_datos2)
1
4.282573
rmse_vec(truth = datos2$edad,
estimate = modelo_cat2$fitted.values)
[1] 1.599246
# Variables predictora y respuesta
<- datos$longitud_cabeza
var_predictora <- datos$edad
var_y
# Matriz X
<- cbind(1, var_predictora)
matriz_x
# Transpuesta de X
<- t(matriz_x)
transpuesta_x
# X transpuesta * X
<- transpuesta_x %*% matriz_x
tranx_x
# Inversa de X transpuesta * X
<- solve(tranx_x)
inversa
# Inveresa de X transpuesta * X por transpuesta de X
<- inversa %*% t(matriz_x)
inversa_tranx
# Betas
<- inversa_tranx %*% var_y
betas betas
[,1]
-12.8089426
var_predictora 0.1793443