Code
library(tidyverse)
library(readxl)
library(janitor)
library(skimr)
library(moments)
library(ggpubr)
Estadística
library(tidyverse)
library(readxl)
library(janitor)
library(skimr)
library(moments)
library(ggpubr)
df_embalses <- read_csv("datos-ejemplos/PorcVoluUtilDiar.csv")
df_embalses |> head()
df_evas <- read_excel("datos-ejemplos/Base agrícola 2019 - 2023.xlsx", skip = 6) |>
clean_names() |>
filter(
departamento %in% c(
"Antioquia",
"Cundinamarca",
"Valle del Cauca",
"Córdoba"
)
)
df_evas |> head()
df_embalses$Value |>
mean()
[1] 0.6342816
df_embalses |>
group_by(Name) |>
reframe(promedio = mean(Value)) |>
arrange(promedio)
peso <- c(47.8, 52.3, 56, 58.3, 17.5)
frutos <- c(500, 780, 890, 960, 10)
weighted.mean(x = peso, w = frutos)
[1] 54.35573
df_embalses$Value |>
median()[1] 0.6437
df_embalses |>
group_by(Name) |>
reframe(mediana = median(Value)) |>
arrange(mediana)
# Función personalizada
#' Mode(s) of a vector
#'
#' Returns every value that reaches the highest frequency in `x`, in order of
#' first appearance, so ties produce more than one value. Works on numeric and
#' character vectors alike.
#'
#' @param x A vector.
#' @param na.rm If `TRUE`, missing values are dropped before counting. With
#'   the default `FALSE`, `NA` is counted like any other value.
#' @return A vector of the same type as `x` with the most frequent value(s).
moda <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  valores <- unique(x)
  # match() maps each element to the position of its unique value, so
  # tabulate() gives one count per unique value, aligned with `valores`.
  conteos <- tabulate(match(x, valores))
  valores[conteos == max(conteos)]
}
valores_numericos <- c(1, 1, 2, 50, 60, 1, 2, 2, 50, 50)
valores_categoricos <- c("A", "A", "B", "B", "C", "D", "A", "B")
moda(x = valores_numericos)
[1] 1 2 50
moda(x = valores_categoricos)
[1] "A" "B"
df_embalses$Value |>
moda()[1] 1
quantile(x = df_embalses$Value, probs = 0.90) 90%
0.953302
quantile(x = df_embalses$Value, probs = c(0.20, 0.70)) 20% 70%
0.407496 0.798140
quantile(x = df_embalses$Value,
probs = seq(from = 0, to = 1, by = 0.01)) 0% 1% 2% 3% 4% 5% 6%
-0.0185200 0.0657600 0.1124336 0.1486900 0.1775488 0.1999700 0.2211500
7% 8% 9% 10% 11% 12% 13%
0.2412904 0.2583300 0.2753936 0.2924060 0.3070348 0.3219332 0.3362336
14% 15% 16% 17% 18% 19% 20%
0.3496752 0.3595920 0.3688000 0.3781536 0.3879624 0.3985292 0.4074960
21% 22% 23% 24% 25% 26% 27%
0.4167556 0.4260300 0.4354800 0.4449500 0.4546200 0.4636500 0.4721872
28% 29% 30% 31% 32% 33% 34%
0.4813000 0.4898716 0.4979540 0.5055100 0.5131400 0.5207888 0.5286700
35% 36% 37% 38% 39% 40% 41%
0.5357800 0.5428388 0.5498600 0.5569400 0.5631300 0.5698300 0.5767300
42% 43% 44% 45% 46% 47% 48%
0.5843500 0.5920724 0.5996684 0.6070520 0.6139200 0.6209000 0.6282400
49% 50% 51% 52% 53% 54% 55%
0.6357896 0.6437000 0.6520268 0.6602036 0.6682300 0.6765500 0.6849840
56% 57% 58% 59% 60% 61% 62%
0.6922316 0.7001576 0.7083200 0.7165100 0.7244960 0.7326548 0.7402916
63% 64% 65% 66% 67% 68% 69%
0.7483700 0.7556400 0.7630300 0.7698588 0.7771900 0.7839924 0.7910700
70% 71% 72% 73% 74% 75% 76%
0.7981400 0.8059200 0.8128292 0.8195864 0.8273132 0.8345900 0.8422168
77% 78% 79% 80% 81% 82% 83%
0.8497200 0.8572616 0.8653200 0.8731680 0.8806100 0.8879204 0.8957700
84% 85% 86% 87% 88% 89% 90%
0.9028400 0.9104780 0.9185200 0.9268416 0.9353584 0.9441700 0.9533020
91% 92% 93% 94% 95% 96% 97%
0.9628888 0.9719900 0.9795700 0.9877100 0.9981260 1.0072000 1.0208800
98% 99% 100%
1.0423300 1.0956232 1.2438600
df_embalses |>
group_by(Name) |>
reframe(percentil_15 = quantile(x = Value, probs = 0.15),
percentil_70 = quantile(x = Value, probs = 0.70)) |>
arrange(percentil_70)quantile(x = df_embalses$Value, probs = seq(from = 0, to = 1, by = 0.1)) 0% 10% 20% 30% 40% 50% 60% 70%
-0.018520 0.292406 0.407496 0.497954 0.569830 0.643700 0.724496 0.798140
80% 90% 100%
0.873168 0.953302 1.243860
quantile(x = df_embalses$Value, probs = seq(from = 0, to = 1, by = 0.25)) 0% 25% 50% 75% 100%
-0.01852 0.45462 0.64370 0.83459 1.24386
var(x = df_embalses$Value)[1] 0.06146684
sd(x = df_embalses$Value)[1] 0.2479251
0.6342816 + (0.2479251 * 1)[1] 0.8822067
0.6342816 - (0.2479251 * 1)[1] 0.3863565
df_embalses |>
group_by(Name) |>
reframe(desviacion = sd(Value, na.rm = TRUE)) |>
arrange(desviacion)(0.2479251 / 0.6342816) * 100[1] 39.08754
df_embalses |>
group_by(Name) |>
reframe(promedio = mean(Value),
desviacion = sd(Value),
coef_var = (desviacion / promedio) * 100)range(df_embalses$Value)[1] -0.01852 1.24386
df_embalses |>
group_by(Name) |>
reframe(maximo = max(Value),
minimo = min(Value),
rango = maximo - minimo)IQR(df_embalses$Value)[1] 0.37997
df_embalses |>
group_by(Name) |>
reframe(rango_inter = IQR(Value))skewness(x = df_embalses$Value)[1] -0.2493391
df_embalses |>
group_by(Name) |>
reframe(coef_asimetria = skewness(Value))kurtosis(x = df_embalses$Value)[1] 2.313734
df_embalses |>
group_by(Name) |>
reframe(coef_curtosis = kurtosis(Value))df_embalses |> summary() Id Name Value Date
Length:89569 Length:89569 Min. :-0.01852 Min. :2014-01-01
Class :character Class :character 1st Qu.: 0.45462 1st Qu.:2016-10-15
Mode :character Mode :character Median : 0.64370 Median :2019-06-17
Mean : 0.63428 Mean :2019-06-10
3rd Qu.: 0.83459 3rd Qu.:2022-02-15
Max. : 1.24386 Max. :2024-09-23
df_embalses |>
skim()| Name | df_embalses |
| Number of rows | 89569 |
| Number of columns | 4 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| Date | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Id | 0 | 1 | 7 | 7 | 0 | 1 | 0 |
| Name | 0 | 1 | 4 | 15 | 0 | 24 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| Date | 0 | 1 | 2014-01-01 | 2024-09-23 | 2019-06-17 | 3919 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Value | 0 | 1 | 0.63 | 0.25 | -0.02 | 0.45 | 0.64 | 0.83 | 1.24 | ▂▅▇▇▂ |
tabla_deptos <- table(df_evas$departamento)
tabla_deptos
Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
tabla_deptos_relat <- prop.table(tabla_deptos)
tabla_deptos_relat
Antioquia Córdoba Cundinamarca Valle del Cauca
0.33489483 0.08988587 0.34052253 0.23469676
tabla_depto_year <- table(df_evas$departamento, df_evas$ano)
tabla_depto_year
2019 2020 2021 2022 2023
Antioquia 1916 2011 2176 2277 2272
Córdoba 509 503 564 642 641
Cundinamarca 1945 1997 2219 2343 2327
Valle del Cauca 1331 1457 1540 1572 1565
tabla_depto_year_relat <- prop.table(tabla_depto_year)
tabla_depto_year_relat
2019 2020 2021 2022 2023
Antioquia 0.06023831 0.06322508 0.06841261 0.07158802 0.07143082
Córdoba 0.01600277 0.01581413 0.01773195 0.02018424 0.02015280
Cundinamarca 0.06115006 0.06278492 0.06976452 0.07366303 0.07316000
Valle del Cauca 0.04184613 0.04580753 0.04841702 0.04942308 0.04920301
tabla3_absoluta <-
table(df_evas$departamento,
df_evas$ano,
df_evas$grupo_cultivo)
tabla3_absoluta, , = Cereales
2019 2020 2021 2022 2023
Antioquia 248 263 389 384 372
Córdoba 149 152 205 223 228
Cundinamarca 217 211 333 350 340
Valle del Cauca 135 131 205 211 200
, , = Cultivos para condimentos, bebidas medicinales y aromáticas
2019 2020 2021 2022 2023
Antioquia 23 23 21 26 25
Córdoba 1 0 0 0 0
Cundinamarca 40 46 35 38 37
Valle del Cauca 37 36 24 27 28
, , = Cultivos tropicales tradicionales
2019 2020 2021 2022 2023
Antioquia 263 262 260 259 257
Córdoba 25 23 21 28 28
Cundinamarca 157 157 155 156 156
Valle del Cauca 134 139 134 138 137
, , = Frutales
2019 2020 2021 2022 2023
Antioquia 590 655 686 684 700
Córdoba 111 112 113 115 116
Cundinamarca 499 532 630 652 649
Valle del Cauca 421 501 534 536 538
, , = Hortalizas
2019 2020 2021 2022 2023
Antioquia 360 381 393 423 418
Córdoba 83 82 85 96 100
Cundinamarca 418 433 444 495 495
Valle del Cauca 351 401 398 403 404
, , = Leguminosas
2019 2020 2021 2022 2023
Antioquia 204 208 217 234 234
Córdoba 35 33 30 35 34
Cundinamarca 300 302 309 329 326
Valle del Cauca 150 151 146 146 146
, , = Oleaginosas
2019 2020 2021 2022 2023
Antioquia 13 13 9 9 8
Córdoba 18 18 15 19 19
Cundinamarca 6 6 6 6 5
Valle del Cauca 20 18 17 13 15
, , = Raíces y tubérculos
2019 2020 2021 2022 2023
Antioquia 215 206 201 258 258
Córdoba 87 83 95 126 116
Cundinamarca 308 310 307 317 319
Valle del Cauca 83 80 82 98 97
tabla3_relativa <- tabla3_absoluta |> prop.table()
tabla3_relativa, , = Cereales
2019 2020 2021 2022
Antioquia 7.797026e-03 8.268620e-03 1.223001e-02 1.207281e-02
Córdoba 4.684503e-03 4.778822e-03 6.445122e-03 7.011035e-03
Cundinamarca 6.822398e-03 6.633760e-03 1.046939e-02 1.100387e-02
Valle del Cauca 4.244349e-03 4.118590e-03 6.445122e-03 6.633760e-03
2023
Antioquia 1.169554e-02
Córdoba 7.168233e-03
Cundinamarca 1.068947e-02
Valle del Cauca 6.287924e-03
, , = Cultivos para condimentos, bebidas medicinales y aromáticas
2019 2020 2021 2022
Antioquia 7.231113e-04 7.231113e-04 6.602320e-04 8.174301e-04
Córdoba 3.143962e-05 0.000000e+00 0.000000e+00 0.000000e+00
Cundinamarca 1.257585e-03 1.446223e-03 1.100387e-03 1.194706e-03
Valle del Cauca 1.163266e-03 1.131826e-03 7.545509e-04 8.488697e-04
2023
Antioquia 7.859905e-04
Córdoba 0.000000e+00
Cundinamarca 1.163266e-03
Valle del Cauca 8.803094e-04
, , = Cultivos tropicales tradicionales
2019 2020 2021 2022
Antioquia 8.268620e-03 8.237180e-03 8.174301e-03 8.142862e-03
Córdoba 7.859905e-04 7.231113e-04 6.602320e-04 8.803094e-04
Cundinamarca 4.936020e-03 4.936020e-03 4.873141e-03 4.904581e-03
Valle del Cauca 4.212909e-03 4.370107e-03 4.212909e-03 4.338668e-03
2023
Antioquia 8.079982e-03
Córdoba 8.803094e-04
Cundinamarca 4.904581e-03
Valle del Cauca 4.307228e-03
, , = Frutales
2019 2020 2021 2022
Antioquia 1.854938e-02 2.059295e-02 2.156758e-02 2.150470e-02
Córdoba 3.489798e-03 3.521237e-03 3.552677e-03 3.615556e-03
Cundinamarca 1.568837e-02 1.672588e-02 1.980696e-02 2.049863e-02
Valle del Cauca 1.323608e-02 1.575125e-02 1.678876e-02 1.685164e-02
2023
Antioquia 2.200773e-02
Córdoba 3.646996e-03
Cundinamarca 2.040431e-02
Valle del Cauca 1.691452e-02
, , = Hortalizas
2019 2020 2021 2022
Antioquia 1.131826e-02 1.197850e-02 1.235577e-02 1.329896e-02
Córdoba 2.609488e-03 2.578049e-03 2.672368e-03 3.018204e-03
Cundinamarca 1.314176e-02 1.361336e-02 1.395919e-02 1.556261e-02
Valle del Cauca 1.103531e-02 1.260729e-02 1.251297e-02 1.267017e-02
2023
Antioquia 1.314176e-02
Córdoba 3.143962e-03
Cundinamarca 1.556261e-02
Valle del Cauca 1.270161e-02
, , = Leguminosas
2019 2020 2021 2022
Antioquia 6.413683e-03 6.539441e-03 6.822398e-03 7.356871e-03
Córdoba 1.100387e-03 1.037507e-03 9.431886e-04 1.100387e-03
Cundinamarca 9.431886e-03 9.494765e-03 9.714843e-03 1.034364e-02
Valle del Cauca 4.715943e-03 4.747383e-03 4.590185e-03 4.590185e-03
2023
Antioquia 7.356871e-03
Córdoba 1.068947e-03
Cundinamarca 1.024932e-02
Valle del Cauca 4.590185e-03
, , = Oleaginosas
2019 2020 2021 2022
Antioquia 4.087151e-04 4.087151e-04 2.829566e-04 2.829566e-04
Córdoba 5.659132e-04 5.659132e-04 4.715943e-04 5.973528e-04
Cundinamarca 1.886377e-04 1.886377e-04 1.886377e-04 1.886377e-04
Valle del Cauca 6.287924e-04 5.659132e-04 5.344735e-04 4.087151e-04
2023
Antioquia 2.515170e-04
Córdoba 5.973528e-04
Cundinamarca 1.571981e-04
Valle del Cauca 4.715943e-04
, , = Raíces y tubérculos
2019 2020 2021 2022
Antioquia 6.759518e-03 6.476562e-03 6.319364e-03 8.111422e-03
Córdoba 2.735247e-03 2.609488e-03 2.986764e-03 3.961392e-03
Cundinamarca 9.683403e-03 9.746282e-03 9.651963e-03 9.966360e-03
Valle del Cauca 2.609488e-03 2.515170e-03 2.578049e-03 3.081083e-03
2023
Antioquia 8.111422e-03
Córdoba 3.646996e-03
Cundinamarca 1.002924e-02
Valle del Cauca 3.049643e-03
ftable(df_evas$departamento) Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
tabla_deptos |> margin.table()[1] 31807
tabla_depto_year |> margin.table(margin = 1)
Antioquia Córdoba Cundinamarca Valle del Cauca
10652 2859 10831 7465
tabla_depto_year |> margin.table(margin = 2)
2019 2020 2021 2022 2023
5701 5968 6499 6834 6805
tabla_depto_year_relat |> margin.table(margin = 1)
Antioquia Córdoba Cundinamarca Valle del Cauca
0.33489483 0.08988587 0.34052253 0.23469676
tabla_depto_year_relat |> margin.table(margin = 2)
2019 2020 2021 2022 2023
0.1792373 0.1876317 0.2043261 0.2148584 0.2139466
tabla3_absoluta |> margin.table(margin = 3)
Cereales
4946
Cultivos para condimentos, bebidas medicinales y aromáticas
467
Cultivos tropicales tradicionales
2889
Frutales
9374
Hortalizas
6663
Leguminosas
3569
Oleaginosas
253
Raíces y tubérculos
3646
tabla3_relativa |> margin.table(margin = 3)
Cereales
0.155500362
Cultivos para condimentos, bebidas medicinales y aromáticas
0.014682303
Cultivos tropicales tradicionales
0.090829063
Frutales
0.294715000
Hortalizas
0.209482189
Leguminosas
0.112208005
Oleaginosas
0.007954224
Raíces y tubérculos
0.114628855
tabla_frecuencias1 <-
df_evas |>
count(departamento, sort = TRUE, name = "frec_abs") |>
mutate(
frec_abs_acum = cumsum(frec_abs),
frec_rel = frec_abs / sum(frec_abs),
frec_rel_acum = cumsum(frec_rel)
)
tabla_frecuencias1tabla_frecuencias2 <-
df_evas |>
count(departamento,
ano,
sort = TRUE,
name = "frec_abs") |>
mutate(frec_abs_acum = cumsum(frec_abs),
frec_rel = frec_abs / sum(frec_abs),
frec_rel_acum = cumsum(frec_rel))
tabla_frecuencias2tabla_frecuencias2 |>
filter(departamento == "Antioquia") |>
filter(ano %in% c(2022, 2023)) |>
reframe(total = sum(frec_rel))
df_aguacate <-
df_evas |>
filter(cultivo == "Aguacate")ggqqplot(df_aguacate$produccion_t)
ggqqplot(log(df_aguacate$produccion_t))
ggqqplot(df_aguacate$rendimiento_t_ha)
ggqqplot(log(df_aguacate$rendimiento_t_ha))
\[\rho_{(X,Y)} = \frac{Cov_{(X,Y)}}{\sigma_X\times\sigma_Y} = \frac{\frac{1}{n}\sum_{i=1}^{n}(X_i-\mu_X)(Y_i-\mu_Y)}{\sigma_X\times\sigma_Y}\]
cor(x = df_aguacate$produccion_t,
y = df_aguacate$rendimiento_t_ha,
method = "pearson")[1] 0.236039
cor(x = log1p(df_aguacate$produccion_t),
y = log1p(df_aguacate$rendimiento_t_ha),
method = "pearson")[1] 0.7158733
\[\rho = 1 - \frac{6\sum D^2}{N (N^2 - 1)}\]
cor(x = df_aguacate$produccion_t,
y = df_aguacate$rendimiento_t_ha,
method = "spearman")[1] 0.6123416
cor(x = log1p(df_aguacate$produccion_t),
y = log1p(df_aguacate$rendimiento_t_ha),
method = "spearman")[1] 0.6123416
df_embalses |>
filter(Name == "CHUZA") |>
ggplot(aes(x = Date, y = Value)) +
geom_line() +
labs(x = "Fecha", y = "%",
title = "Porcentaje de volumen útil",
subtitle = "Embalse Chuza (Chingaza) - Bogotá, Colombia") +
theme_minimal()