Code
library(tidyverse)
library(readxl)
library(janitor)
library(skimr)
library(moments)
library(ggpubr)
Estadística
library(tidyverse)
library(readxl)
library(janitor)
library(skimr)
library(moments)
library(ggpubr)
# Load the CSV into df_embalses and preview its first rows.
df_embalses <- read_csv("datos-ejemplos/PorcVoluUtilDiar.csv")
df_embalses |> head()
# Read the Excel workbook (skipping its first 6 rows), normalise the column
# names with clean_names(), and keep only the four listed departments.
df_evas <- read_excel(
  "datos-ejemplos/Base agrícola 2019 - 2023.xlsx",
  skip = 6
) |>
  clean_names() |>
  filter(
    departamento %in% c(
      "Antioquia",
      "Cundinamarca",
      "Valle del Cauca",
      "Córdoba"
    )
  )
# Preview the first rows of the filtered data.
df_evas |> head()
# Arithmetic mean of Value across all rows.
df_embalses$Value |>
  mean()
[1] 0.6342816
# Mean of Value per Name, sorted by ascending mean.
df_embalses |>
  group_by(Name) |>
  reframe(promedio = mean(Value)) |>
  arrange(promedio)
# Weighted mean example: values in `peso`, weights in `frutos`.
peso <- c(47.8, 52.3, 56, 58.3, 17.5)
frutos <- c(500, 780, 890, 960, 10)
weighted.mean(x = peso, w = frutos)
[1] 54.35573
# Median of Value across all rows.
df_embalses$Value |>
  median()
[1] 0.6437
# Median of Value per Name, sorted by ascending median.
df_embalses |>
  group_by(Name) |>
  reframe(mediana = median(Value)) |>
  arrange(mediana)
# Custom function
#' Mode(s) of a vector
#'
#' @param x An atomic vector (numeric, character, ...).
#' @return The distinct value(s) of `x` that occur most often, in order of
#'   first appearance; every tied value is returned. An empty `x` is returned
#'   unchanged.
moda <- function(x) {
  # Guard: max() over an empty count vector would warn and yield -Inf.
  if (length(x) == 0L) {
    return(x)
  }
  ux <- unique(x)
  # match() maps each element to its index in ux; tabulate() counts per index.
  tab <- tabulate(match(x, ux))
  ux[tab == max(tab)]
}
# Example inputs for moda(): one numeric vector and one character vector.
valores_numericos <- c(1, 1, 2, 50, 60, 1, 2, 2, 50, 50)
valores_categoricos <- c("A", "A", "B", "B", "C", "D", "A", "B")
moda(x = valores_numericos)
[1] 1 2 50
moda(x = valores_categoricos)
[1] "A" "B"
# Mode of Value across all rows, using the custom moda() function.
df_embalses$Value |>
  moda()
[1] 1
quantile(x = df_embalses$Value, probs = 0.90)
90%
0.953302
quantile(x = df_embalses$Value, probs = c(0.20, 0.70))
20% 70%
0.407496 0.798140
quantile(x = df_embalses$Value,
probs = seq(from = 0, to = 1, by = 0.01))
0% 1% 2% 3% 4% 5% 6%
-0.0185200 0.0657600 0.1124336 0.1486900 0.1775488 0.1999700 0.2211500
7% 8% 9% 10% 11% 12% 13%
0.2412904 0.2583300 0.2753936 0.2924060 0.3070348 0.3219332 0.3362336
14% 15% 16% 17% 18% 19% 20%
0.3496752 0.3595920 0.3688000 0.3781536 0.3879624 0.3985292 0.4074960
21% 22% 23% 24% 25% 26% 27%
0.4167556 0.4260300 0.4354800 0.4449500 0.4546200 0.4636500 0.4721872
28% 29% 30% 31% 32% 33% 34%
0.4813000 0.4898716 0.4979540 0.5055100 0.5131400 0.5207888 0.5286700
35% 36% 37% 38% 39% 40% 41%
0.5357800 0.5428388 0.5498600 0.5569400 0.5631300 0.5698300 0.5767300
42% 43% 44% 45% 46% 47% 48%
0.5843500 0.5920724 0.5996684 0.6070520 0.6139200 0.6209000 0.6282400
49% 50% 51% 52% 53% 54% 55%
0.6357896 0.6437000 0.6520268 0.6602036 0.6682300 0.6765500 0.6849840
56% 57% 58% 59% 60% 61% 62%
0.6922316 0.7001576 0.7083200 0.7165100 0.7244960 0.7326548 0.7402916
63% 64% 65% 66% 67% 68% 69%
0.7483700 0.7556400 0.7630300 0.7698588 0.7771900 0.7839924 0.7910700
70% 71% 72% 73% 74% 75% 76%
0.7981400 0.8059200 0.8128292 0.8195864 0.8273132 0.8345900 0.8422168
77% 78% 79% 80% 81% 82% 83%
0.8497200 0.8572616 0.8653200 0.8731680 0.8806100 0.8879204 0.8957700
84% 85% 86% 87% 88% 89% 90%
0.9028400 0.9104780 0.9185200 0.9268416 0.9353584 0.9441700 0.9533020
91% 92% 93% 94% 95% 96% 97%
0.9628888 0.9719900 0.9795700 0.9877100 0.9981260 1.0072000 1.0208800
98% 99% 100%
1.0423300 1.0956232 1.2438600
# 15th and 70th percentiles of Value per Name, sorted by the 70th percentile.
df_embalses |>
  group_by(Name) |>
  reframe(percentil_15 = quantile(x = Value, probs = 0.15),
          percentil_70 = quantile(x = Value, probs = 0.70)) |>
  arrange(percentil_70)
quantile(x = df_embalses$Value, probs = seq(from = 0, to = 1, by = 0.1))
0% 10% 20% 30% 40% 50% 60% 70%
-0.018520 0.292406 0.407496 0.497954 0.569830 0.643700 0.724496 0.798140
80% 90% 100%
0.873168 0.953302 1.243860
quantile(x = df_embalses$Value, probs = seq(from = 0, to = 1, by = 0.25))
0% 25% 50% 75% 100%
-0.01852 0.45462 0.64370 0.83459 1.24386
var(x = df_embalses$Value)
[1] 0.06146684
sd(x = df_embalses$Value)
[1] 0.2479251
0.6342816 + (0.2479251 * 1)
[1] 0.8822067
0.6342816 - (0.2479251 * 1)
[1] 0.3863565
# Standard deviation of Value per Name (NAs dropped), sorted ascending.
df_embalses |>
  group_by(Name) |>
  reframe(desviacion = sd(Value, na.rm = TRUE)) |>
  arrange(desviacion)
# Coefficient of variation (%): standard deviation divided by mean, times 100.
(0.2479251 / 0.6342816) * 100
[1] 39.08754
# Mean, standard deviation and coefficient of variation (%) of Value per Name.
df_embalses |>
  group_by(Name) |>
  reframe(promedio = mean(Value),
          desviacion = sd(Value),
          coef_var = (desviacion / promedio) * 100)
range(df_embalses$Value)
[1] -0.01852 1.24386
# Maximum, minimum and range (max - min) of Value per Name.
df_embalses |>
  group_by(Name) |>
  reframe(maximo = max(Value),
          minimo = min(Value),
          rango = maximo - minimo)
IQR(df_embalses$Value)
[1] 0.37997
# Interquartile range of Value per Name.
df_embalses |>
  group_by(Name) |>
  reframe(rango_inter = IQR(Value))
skewness(x = df_embalses$Value)
[1] -0.2493391
# Skewness coefficient of Value per Name.
df_embalses |>
  group_by(Name) |>
  reframe(coef_asimetria = skewness(Value))
kurtosis(x = df_embalses$Value)
[1] 2.313734
# Kurtosis coefficient of Value per Name.
df_embalses |>
  group_by(Name) |>
  reframe(coef_curtosis = kurtosis(Value))
# Base-R summary of every column of df_embalses.
df_embalses |> summary()
Id Name Value Date
Length:89569 Length:89569 Min. :-0.01852 Min. :2014-01-01
Class :character Class :character 1st Qu.: 0.45462 1st Qu.:2016-10-15
Mode :character Mode :character Median : 0.64370 Median :2019-06-17
Mean : 0.63428 Mean :2019-06-10
3rd Qu.: 0.83459 3rd Qu.:2022-02-15
Max. : 1.24386 Max. :2024-09-23
# Per-column summary of df_embalses produced by skim().
df_embalses |>
  skim()
Name | df_embalses |
Number of rows | 89569 |
Number of columns | 4 |
_______________________ | |
Column type frequency: | |
character | 2 |
Date | 1 |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Id | 0 | 1 | 7 | 7 | 0 | 1 | 0 |
Name | 0 | 1 | 4 | 15 | 0 | 24 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
Date | 0 | 1 | 2014-01-01 | 2024-09-23 | 2019-06-17 | 3919 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Value | 0 | 1 | 0.63 | 0.25 | -0.02 | 0.45 | 0.64 | 0.83 | 1.24 | ▂▅▇▇▂ |
# Absolute frequency table of departamento.
tabla_deptos <- table(df_evas$departamento)
tabla_deptos
Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
# Relative frequencies (proportions of the grand total) of departamento.
tabla_deptos_relat <- prop.table(tabla_deptos)
tabla_deptos_relat
Antioquia Córdoba Cundinamarca Valle del Cauca
0.33489483 0.08988587 0.34052253 0.23469676
# Two-way absolute frequency table: departamento (rows) by ano (columns).
tabla_depto_year <- table(df_evas$departamento, df_evas$ano)
tabla_depto_year
2019 2020 2021 2022 2023
Antioquia 1916 2011 2176 2277 2272
Córdoba 509 503 564 642 641
Cundinamarca 1945 1997 2219 2343 2327
Valle del Cauca 1331 1457 1540 1572 1565
# Two-way relative frequencies; each cell is a proportion of the grand total.
tabla_depto_year_relat <- prop.table(tabla_depto_year)
tabla_depto_year_relat
2019 2020 2021 2022 2023
Antioquia 0.06023831 0.06322508 0.06841261 0.07158802 0.07143082
Córdoba 0.01600277 0.01581413 0.01773195 0.02018424 0.02015280
Cundinamarca 0.06115006 0.06278492 0.06976452 0.07366303 0.07316000
Valle del Cauca 0.04184613 0.04580753 0.04841702 0.04942308 0.04920301
# Three-way absolute frequency table: departamento x ano x grupo_cultivo.
tabla3_absoluta <-
  table(df_evas$departamento,
        df_evas$ano,
        df_evas$grupo_cultivo)

tabla3_absoluta
, , = Cereales
2019 2020 2021 2022 2023
Antioquia 248 263 389 384 372
Córdoba 149 152 205 223 228
Cundinamarca 217 211 333 350 340
Valle del Cauca 135 131 205 211 200
, , = Cultivos para condimentos, bebidas medicinales y aromáticas
2019 2020 2021 2022 2023
Antioquia 23 23 21 26 25
Córdoba 1 0 0 0 0
Cundinamarca 40 46 35 38 37
Valle del Cauca 37 36 24 27 28
, , = Cultivos tropicales tradicionales
2019 2020 2021 2022 2023
Antioquia 263 262 260 259 257
Córdoba 25 23 21 28 28
Cundinamarca 157 157 155 156 156
Valle del Cauca 134 139 134 138 137
, , = Frutales
2019 2020 2021 2022 2023
Antioquia 590 655 686 684 700
Córdoba 111 112 113 115 116
Cundinamarca 499 532 630 652 649
Valle del Cauca 421 501 534 536 538
, , = Hortalizas
2019 2020 2021 2022 2023
Antioquia 360 381 393 423 418
Córdoba 83 82 85 96 100
Cundinamarca 418 433 444 495 495
Valle del Cauca 351 401 398 403 404
, , = Leguminosas
2019 2020 2021 2022 2023
Antioquia 204 208 217 234 234
Córdoba 35 33 30 35 34
Cundinamarca 300 302 309 329 326
Valle del Cauca 150 151 146 146 146
, , = Oleaginosas
2019 2020 2021 2022 2023
Antioquia 13 13 9 9 8
Córdoba 18 18 15 19 19
Cundinamarca 6 6 6 6 5
Valle del Cauca 20 18 17 13 15
, , = Raíces y tubérculos
2019 2020 2021 2022 2023
Antioquia 215 206 201 258 258
Córdoba 87 83 95 126 116
Cundinamarca 308 310 307 317 319
Valle del Cauca 83 80 82 98 97
# Three-way relative frequencies; each cell is a proportion of the grand total.
tabla3_relativa <- tabla3_absoluta |> prop.table()
tabla3_relativa
, , = Cereales
2019 2020 2021 2022
Antioquia 7.797026e-03 8.268620e-03 1.223001e-02 1.207281e-02
Córdoba 4.684503e-03 4.778822e-03 6.445122e-03 7.011035e-03
Cundinamarca 6.822398e-03 6.633760e-03 1.046939e-02 1.100387e-02
Valle del Cauca 4.244349e-03 4.118590e-03 6.445122e-03 6.633760e-03
2023
Antioquia 1.169554e-02
Córdoba 7.168233e-03
Cundinamarca 1.068947e-02
Valle del Cauca 6.287924e-03
, , = Cultivos para condimentos, bebidas medicinales y aromáticas
2019 2020 2021 2022
Antioquia 7.231113e-04 7.231113e-04 6.602320e-04 8.174301e-04
Córdoba 3.143962e-05 0.000000e+00 0.000000e+00 0.000000e+00
Cundinamarca 1.257585e-03 1.446223e-03 1.100387e-03 1.194706e-03
Valle del Cauca 1.163266e-03 1.131826e-03 7.545509e-04 8.488697e-04
2023
Antioquia 7.859905e-04
Córdoba 0.000000e+00
Cundinamarca 1.163266e-03
Valle del Cauca 8.803094e-04
, , = Cultivos tropicales tradicionales
2019 2020 2021 2022
Antioquia 8.268620e-03 8.237180e-03 8.174301e-03 8.142862e-03
Córdoba 7.859905e-04 7.231113e-04 6.602320e-04 8.803094e-04
Cundinamarca 4.936020e-03 4.936020e-03 4.873141e-03 4.904581e-03
Valle del Cauca 4.212909e-03 4.370107e-03 4.212909e-03 4.338668e-03
2023
Antioquia 8.079982e-03
Córdoba 8.803094e-04
Cundinamarca 4.904581e-03
Valle del Cauca 4.307228e-03
, , = Frutales
2019 2020 2021 2022
Antioquia 1.854938e-02 2.059295e-02 2.156758e-02 2.150470e-02
Córdoba 3.489798e-03 3.521237e-03 3.552677e-03 3.615556e-03
Cundinamarca 1.568837e-02 1.672588e-02 1.980696e-02 2.049863e-02
Valle del Cauca 1.323608e-02 1.575125e-02 1.678876e-02 1.685164e-02
2023
Antioquia 2.200773e-02
Córdoba 3.646996e-03
Cundinamarca 2.040431e-02
Valle del Cauca 1.691452e-02
, , = Hortalizas
2019 2020 2021 2022
Antioquia 1.131826e-02 1.197850e-02 1.235577e-02 1.329896e-02
Córdoba 2.609488e-03 2.578049e-03 2.672368e-03 3.018204e-03
Cundinamarca 1.314176e-02 1.361336e-02 1.395919e-02 1.556261e-02
Valle del Cauca 1.103531e-02 1.260729e-02 1.251297e-02 1.267017e-02
2023
Antioquia 1.314176e-02
Córdoba 3.143962e-03
Cundinamarca 1.556261e-02
Valle del Cauca 1.270161e-02
, , = Leguminosas
2019 2020 2021 2022
Antioquia 6.413683e-03 6.539441e-03 6.822398e-03 7.356871e-03
Córdoba 1.100387e-03 1.037507e-03 9.431886e-04 1.100387e-03
Cundinamarca 9.431886e-03 9.494765e-03 9.714843e-03 1.034364e-02
Valle del Cauca 4.715943e-03 4.747383e-03 4.590185e-03 4.590185e-03
2023
Antioquia 7.356871e-03
Córdoba 1.068947e-03
Cundinamarca 1.024932e-02
Valle del Cauca 4.590185e-03
, , = Oleaginosas
2019 2020 2021 2022
Antioquia 4.087151e-04 4.087151e-04 2.829566e-04 2.829566e-04
Córdoba 5.659132e-04 5.659132e-04 4.715943e-04 5.973528e-04
Cundinamarca 1.886377e-04 1.886377e-04 1.886377e-04 1.886377e-04
Valle del Cauca 6.287924e-04 5.659132e-04 5.344735e-04 4.087151e-04
2023
Antioquia 2.515170e-04
Córdoba 5.973528e-04
Cundinamarca 1.571981e-04
Valle del Cauca 4.715943e-04
, , = Raíces y tubérculos
2019 2020 2021 2022
Antioquia 6.759518e-03 6.476562e-03 6.319364e-03 8.111422e-03
Córdoba 2.735247e-03 2.609488e-03 2.986764e-03 3.961392e-03
Cundinamarca 9.683403e-03 9.746282e-03 9.651963e-03 9.966360e-03
Valle del Cauca 2.609488e-03 2.515170e-03 2.578049e-03 3.081083e-03
2023
Antioquia 8.111422e-03
Córdoba 3.646996e-03
Cundinamarca 1.002924e-02
Valle del Cauca 3.049643e-03
ftable(df_evas$departamento)
Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
# Grand total of the one-way table.
tabla_deptos |> margin.table()
[1] 31807
# Row totals (margin = 1: first dimension, departamento).
tabla_depto_year |> margin.table(margin = 1)
Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
# Column totals (margin = 2: second dimension, ano).
tabla_depto_year |> margin.table(margin = 2)
2019 2020 2021 2022 2023
5701 5968 6499 6834 6805
# Row totals of the relative two-way table (margin = 1: departamento).
tabla_depto_year_relat |> margin.table(margin = 1)
Antioquia Córdoba Cundinamarca Valle del Cauca
0.33489483 0.08988587 0.34052253 0.23469676
# Column totals of the relative two-way table (margin = 2: ano).
tabla_depto_year_relat |> margin.table(margin = 2)
2019 2020 2021 2022 2023
0.1792373 0.1876317 0.2043261 0.2148584 0.2139466
# Totals over the third dimension (grupo_cultivo) of the absolute 3-way table.
tabla3_absoluta |> margin.table(margin = 3)
Cereales
4946
Cultivos para condimentos, bebidas medicinales y aromáticas
467
Cultivos tropicales tradicionales
2889
Frutales
9374
Hortalizas
6663
Leguminosas
3569
Oleaginosas
253
Raíces y tubérculos
3646
# Totals over the third dimension (grupo_cultivo) of the relative 3-way table.
tabla3_relativa |> margin.table(margin = 3)
Cereales
0.155500362
Cultivos para condimentos, bebidas medicinales y aromáticas
0.014682303
Cultivos tropicales tradicionales
0.090829063
Frutales
0.294715000
Hortalizas
0.209482189
Leguminosas
0.112208005
Oleaginosas
0.007954224
Raíces y tubérculos
0.114628855
# Frequency table for departamento: absolute, cumulative absolute, relative
# and cumulative relative frequencies, ordered by descending count.
tabla_frecuencias1 <-
  df_evas |>
  count(departamento, sort = TRUE, name = "frec_abs") |>
  mutate(
    frec_abs_acum = cumsum(frec_abs),
    frec_rel = frec_abs / sum(frec_abs),
    frec_rel_acum = cumsum(frec_rel)
  )

tabla_frecuencias1
# Frequency table for each departamento/ano combination: absolute, cumulative
# absolute, relative and cumulative relative frequencies, by descending count.
tabla_frecuencias2 <-
  df_evas |>
  count(departamento,
        ano,
        sort = TRUE,
        name = "frec_abs") |>
  mutate(frec_abs_acum = cumsum(frec_abs),
         frec_rel = frec_abs / sum(frec_abs),
         frec_rel_acum = cumsum(frec_rel))

tabla_frecuencias2
# Sum of the relative frequencies for Antioquia in 2022 and 2023.
tabla_frecuencias2 |>
  filter(departamento == "Antioquia") |>
  filter(ano %in% c(2022, 2023)) |>
  reframe(total = sum(frec_rel))
# Subset of df_evas with the rows whose cultivo is "Aguacate".
df_aguacate <-
  df_evas |>
  filter(cultivo == "Aguacate")
ggqqplot(df_aguacate$produccion_t)
ggqqplot(log(df_aguacate$produccion_t))
ggqqplot(df_aguacate$rendimiento_t_ha)
ggqqplot(log(df_aguacate$rendimiento_t_ha))
\[\rho_{(X,Y)} = \frac{Cov_{(X,Y)}}{\sigma_X\times\sigma_Y} = \frac{\frac{1}{n}\sum_{i=1}^{n}(X_i-\mu_X)(Y_i-\mu_Y)}{\sigma_X\times\sigma_Y}\]
cor(x = df_aguacate$produccion_t,
y = df_aguacate$rendimiento_t_ha,
method = "pearson")
[1] 0.236039
cor(x = log1p(df_aguacate$produccion_t),
y = log1p(df_aguacate$rendimiento_t_ha),
method = "pearson")
[1] 0.7158733
\[\rho = 1 - \frac{6\sum D^2}{N (N^2 - 1)}\]
cor(x = df_aguacate$produccion_t,
y = df_aguacate$rendimiento_t_ha,
method = "spearman")
[1] 0.6123416
cor(x = log1p(df_aguacate$produccion_t),
y = log1p(df_aguacate$rendimiento_t_ha),
method = "spearman")
[1] 0.6123416
# Line plot of Value over Date for the rows where Name is "CHUZA".
df_embalses |>
  filter(Name == "CHUZA") |>
  ggplot(aes(x = Date, y = Value)) +
  geom_line() +
  labs(x = "Fecha", y = "%",
       title = "Porcentaje de volumen útil",
       subtitle = "Embalse Chuza (Chingaza) - Bogotá, Colombia") +
  theme_minimal()