Estadística Descriptiva y Exploratoria

Estadística

Author

Edimer David Jaramillo

Bibliotecas

Code
library(tidyverse)
library(gt)

Recursos importantes

Métricas estadísticas

Función Descripción Tipo de variable
mean() Calcular promedio Cuantitativa
weighted.mean() Calcular promedio ponderado Cuantitativa
median() Calcular mediana Cuantitativa
sd() Calcular desviación estándar Cuantitativa
var() Calcular la varianza Cuantitativa
range() Calcular el rango Cuantitativa
IQR() Calcular rango intercuartílico Cuantitativa
quantile() Calcular cuartiles, deciles y percentiles Cuantitativa
min() Valor mínimo Cuantitativa
max() Valor máximo Cuantitativa

Datos

Code
# Read the reservoir CSV, give its columns Spanish snake_case names, and
# derive calendar fields (year, month, quarter, semester) from the date.
datos <-
  read_csv("datos/PorcVoluUtilDiar.csv") |>
  rename(
    id = Id,
    embalse = Name,
    porc_volumen = Value,
    fecha = Date
  ) |>
  mutate(
    year_es = year(fecha),
    mes = month(fecha),
    trimestre = quarter(fecha),
    semestre = semester(fecha)
  )

# Preview the first rows of the prepared data.
head(datos)

Tablas descriptivas

Code
# Descriptive statistics of porc_volumen per year (all reservoirs pooled),
# rendered as a gt table. Every statistic yields exactly one value per year,
# so a one-row-per-group summary is used.
datos |>
  group_by(year_es) |>
  summarise(
    promedio = mean(porc_volumen, na.rm = TRUE),
    mediana = median(porc_volumen, na.rm = TRUE),
    desviacion = sd(porc_volumen, na.rm = TRUE),
    maximo = max(porc_volumen, na.rm = TRUE),
    minimo = min(porc_volumen, na.rm = TRUE),
    # Coefficient of variation, expressed in percent.
    coef_var = 100 * (desviacion / promedio),
    .groups = "drop"
  ) |>
  gt()
year_es promedio mediana desviacion maximo minimo coef_var
2014 0.6256844 0.636290 0.2354465 1.17409 0.00105 37.63023
2015 0.5533925 0.555950 0.2313380 1.17654 -0.01852 41.80360
2016 0.5194375 0.506725 0.2465720 1.13329 0.00051 47.46904
2017 0.7270821 0.756945 0.2194066 1.24386 0.00073 30.17632
2018 0.6906946 0.732250 0.2436080 1.20055 0.00195 35.27000
2019 0.6260755 0.634340 0.2244219 1.20446 0.00031 35.84582
2020 0.5602562 0.562100 0.2373018 1.15110 0.00094 42.35595
2021 0.7486693 0.790890 0.2376802 1.23771 0.01542 31.74703
2022 0.7434982 0.792760 0.2344250 1.19048 0.00031 31.53000
2023 0.6478680 0.660065 0.2194029 1.23769 0.00031 33.86537
2024 0.4418656 0.446140 0.2241496 1.00016 0.00430 50.72802
Code
# Same yearly descriptive statistics as a gt table, restricted to the
# CHUZA reservoir.
filter(datos, embalse == "CHUZA") |>
  group_by(year_es) |>
  summarise(
    promedio = mean(porc_volumen, na.rm = TRUE),
    mediana = median(porc_volumen, na.rm = TRUE),
    desviacion = sd(porc_volumen, na.rm = TRUE),
    maximo = max(porc_volumen, na.rm = TRUE),
    minimo = min(porc_volumen, na.rm = TRUE),
    # Coefficient of variation, expressed in percent.
    coef_var = 100 * (desviacion / promedio),
    .groups = "drop"
  ) |>
  gt()
year_es promedio mediana desviacion maximo minimo coef_var
2014 0.7231294 0.714940 0.20063589 1.01423 0.45242 27.74550
2015 0.6905362 0.673300 0.21891977 1.01315 0.36648 31.70287
2016 0.7395945 0.666800 0.15320788 0.96592 0.55513 20.71512
2017 0.7971378 0.787700 0.11104568 1.00643 0.55881 13.93055
2018 0.7775977 0.744930 0.17932210 1.01234 0.52894 23.06104
2019 0.6924883 0.636320 0.24324467 1.04113 0.42693 35.12618
2020 0.7216357 0.680715 0.18703915 1.03120 0.50704 25.91878
2021 0.7511966 0.735410 0.17560852 1.01481 0.48251 23.37717
2022 0.5813787 0.597430 0.14125597 0.81128 0.30657 24.29672
2023 0.4435066 0.427110 0.09342354 0.64818 0.30252 21.06474
2024 0.2335559 0.222605 0.07154652 0.37840 0.13351 30.63357

Cantidades

  • ¿Cuántos embalses?
Code
# Bar chart of the number of distinct reservoirs present in each year.
datos |>
  group_by(year_es) |>
  summarise(embalses = n_distinct(embalse), .groups = "drop") |>
  ggplot(mapping = aes(x = year_es, y = embalses)) +
  geom_col() +
  # One tick per year.
  scale_x_continuous(breaks = seq(from = 2014, to = 2024, by = 1))

Code
# Same reservoir count per year, now with custom colours, axis labels,
# titles and a minimal theme.
datos |>
  group_by(year_es) |>
  summarise(embalses = n_distinct(embalse), .groups = "drop") |>
  ggplot(mapping = aes(x = year_es, y = embalses)) +
  geom_col(color = "red", fill = "blue", alpha = 0.5) +
  scale_x_continuous(breaks = seq(from = 2014, to = 2024, by = 1)) +
  labs(
    x = "Año",
    y = "Embalses (n)",
    title = "Total de embalses por año",
    subtitle = "Colombia - XM"
  ) +
  theme_minimal()

Code
# Yearly mean volume of the CHUZA reservoir, drawn as points joined by a line.
# porc_volumen is stored as a fraction (the summary tables show values around
# 0-1), so the mean is scaled by 100 to agree with the "Volumen (%)" axis
# label; previously the axis showed fractions under a percent label.
datos |>
  filter(embalse == "CHUZA") |>
  group_by(year_es) |>
  summarise(
    promedio = 100 * mean(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  ggplot(aes(x = year_es, y = promedio)) +
  geom_point(color = "dodgerblue2", size = 3, shape = 19) +
  geom_line(color = "dodgerblue2", linetype = 1) +
  scale_x_continuous(breaks = seq(2014, 2024, 1)) +
  labs(
    x = "Año",
    y = "Volumen (%)",
    title = "Volumen promedio por año",
    subtitle = "Embalse Chuza"
  ) +
  theme_minimal()

Code
# Yearly mean volume for every reservoir, one facet per reservoir with an
# independent y scale.
# porc_volumen is stored as a fraction (the summary tables show values around
# 0-1), so the mean is scaled by 100 to agree with the "Volumen (%)" axis
# label; previously the axes showed fractions under a percent label.
datos |>
  group_by(year_es, embalse) |>
  summarise(
    promedio = 100 * mean(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  ggplot(aes(x = year_es, y = promedio)) +
  facet_wrap(~ embalse, scales = "free_y", ncol = 4) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(2014, 2024, 1)) +
  labs(
    x = "Año",
    y = "Volumen (%)",
    title = "Volumen promedio por año",
    subtitle = "Embalses de Colombia"
  ) +
  theme_minimal() +
  # Rotate the year labels so they do not overlap inside the narrow facets.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Proporciones

  • Para este ejemplo nos preguntamos: ¿cuál fue la proporción de días de cada año en los que el embalse de Chuza superó el 50 % de su volumen?
Code
# Proportion of days per year on which the CHUZA reservoir exceeded 50% of
# its volume.
#   total      - number of records in the year with porc_volumen > 0.5
#   proporcion - share of the year's non-missing records above 0.5
# The denominator is the number of observed records in each year instead of a
# fixed 365.5, so leap years, incomplete years and gaps in the series no
# longer distort the proportion. Missing values are ignored rather than
# turning the whole year into NA.
# NOTE(review): assumes one record per reservoir per day - confirm against
# the source data.
tabla_proporcion <-
  datos |>
  filter(embalse == "CHUZA") |>
  group_by(year_es) |>
  summarise(
    total = sum(porc_volumen > 0.5, na.rm = TRUE),
    proporcion = mean(porc_volumen > 0.5, na.rm = TRUE),
    .groups = "drop"
  )

# Bar chart of the yearly proportion, labelled with the rounded value.
tabla_proporcion |>
  ggplot(aes(x = year_es, y = proporcion)) +
  geom_col() +
  geom_label(aes(label = round(proporcion, digits = 3)),
             size = 3)

Code
# Summing a logical vector counts its TRUE values: here, how many of the
# first five records exceed 0.5.
sum(datos[["porc_volumen"]][seq_len(5)] > 0.5)
[1] 2

Distribuciones

Code
# Histogram of porc_volumen for the CHUZA reservoir (default binning).
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = porc_volumen)) +
  geom_histogram(color = "black")

Code
# Kernel density estimate of porc_volumen for the CHUZA reservoir.
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = porc_volumen)) +
  geom_density(fill = "dodgerblue", alpha = 0.5)

Code
# Single horizontal boxplot of porc_volumen for the CHUZA reservoir.
# The empty-string x places all records in one box; coord_flip() lays it
# on its side.
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = "", y = porc_volumen)) +
  geom_boxplot(fill = "dodgerblue", alpha = 0.5) +
  coord_flip()

Code
# One histogram of porc_volumen per year for CHUZA, stacked in a single
# column, with a dashed red reference line at the 0.5 threshold.
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = porc_volumen)) +
  facet_wrap(vars(year_es), ncol = 1, scales = "free_y") +
  geom_histogram(fill = "dodgerblue", alpha = 0.5, color = "black") +
  geom_vline(xintercept = 0.5, color = "red", linetype = 2)

Code
# Boxplot of porc_volumen per year for CHUZA, with a dashed red reference
# line at the 0.5 threshold. The year is converted to a factor so that each
# year gets its own box.
filter(datos, embalse == "CHUZA") |>
  mutate(year_es = as.factor(year_es)) |>
  ggplot(mapping = aes(x = year_es, y = porc_volumen)) +
  geom_boxplot(fill = "dodgerblue", alpha = 0.5, color = "black") +
  geom_hline(yintercept = 0.5, color = "red", linetype = 2)

Code
# Boxplot of porc_volumen for every reservoir, with a dashed red reference
# line at 0.5 and rotated reservoir names on the x axis.
ggplot(datos, mapping = aes(x = embalse, y = porc_volumen)) +
  geom_boxplot(fill = "dodgerblue", alpha = 0.5, color = "black") +
  geom_hline(yintercept = 0.5, color = "red", linetype = 2) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Incertidumbre

Code
# Mean of porc_volumen per year and semester for CHUZA, shown as points with
# error bars spanning one standard deviation on each side of the mean.
filter(datos, embalse == "CHUZA") |>
  group_by(year_es, semestre) |>
  summarise(
    promedio = mean(porc_volumen, na.rm = TRUE),
    desviacion = sd(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # Treat the semester as a category so it maps to discrete colours.
  mutate(semestre = as.factor(semestre)) |>
  ggplot(mapping = aes(x = year_es, y = promedio, color = semestre)) +
  geom_point() +
  geom_errorbar(
    mapping = aes(
      ymin = promedio - desviacion,
      ymax = promedio + desviacion
    ),
    width = 0.2
  ) +
  theme(legend.position = "top")

Code
# Same mean +/- one standard deviation per year and semester for CHUZA,
# drawn with a single pointrange layer instead of points plus error bars.
filter(datos, embalse == "CHUZA") |>
  group_by(year_es, semestre) |>
  summarise(
    promedio = mean(porc_volumen, na.rm = TRUE),
    desviacion = sd(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # Treat the semester as a category so it maps to discrete colours.
  mutate(semestre = as.factor(semestre)) |>
  ggplot(mapping = aes(x = year_es, y = promedio, color = semestre)) +
  geom_pointrange(
    mapping = aes(
      ymin = promedio - desviacion,
      ymax = promedio + desviacion
    )
  ) +
  theme(legend.position = "top")

Code
# Mean of porc_volumen per year and semester for CHUZA, drawn as a line with
# points and a ribbon spanning one standard deviation on each side.
# porc_volumen is stored as a fraction (the summary tables show values around
# 0-1), so both the mean and the standard deviation are scaled by 100 to
# agree with the "Volumen (%)" axis label; previously the axis showed
# fractions under a percent label.
datos |>
  filter(embalse == "CHUZA") |>
  group_by(year_es, semestre) |>
  summarise(
    promedio = 100 * mean(porc_volumen, na.rm = TRUE),
    desviacion = 100 * sd(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # Treat the semester as a category so it maps to discrete colours/fills.
  mutate(semestre = as.factor(semestre)) |>
  ggplot(aes(x = year_es,
             y = promedio,
             ymin = promedio - desviacion,
             ymax = promedio + desviacion,
             color = semestre,
             fill = semestre)) +
  geom_ribbon(alpha = 0.5) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(2014, 2024, 1)) +
  labs(x = "Año", y = "Volumen (%)", color = "Semestre", fill = "Semestre") +
  theme(legend.position = "bottom")

Gráficos interactivos

Code
library(plotly)

# Build the mean +/- one standard deviation ribbon plot for CHUZA (per year
# and semester) and keep it in `ejemplo` so it can be converted to an
# interactive widget.
# porc_volumen is stored as a fraction (the summary tables show values around
# 0-1), so both the mean and the standard deviation are scaled by 100 to
# agree with the "Volumen (%)" axis label; previously the axis showed
# fractions under a percent label.
ejemplo <-
  datos |>
  filter(embalse == "CHUZA") |>
  group_by(year_es, semestre) |>
  summarise(
    promedio = 100 * mean(porc_volumen, na.rm = TRUE),
    desviacion = 100 * sd(porc_volumen, na.rm = TRUE),
    .groups = "drop"
  ) |>
  # Treat the semester as a category so it maps to discrete colours/fills.
  mutate(semestre = as.factor(semestre)) |>
  ggplot(aes(x = year_es,
             y = promedio,
             ymin = promedio - desviacion,
             ymax = promedio + desviacion,
             color = semestre,
             fill = semestre)) +
  geom_ribbon(alpha = 0.5) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq(2014, 2024, 1)) +
  labs(x = "Año", y = "Volumen (%)", color = "Semestre", fill = "Semestre") +
  theme(legend.position = "bottom")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(ejemplo)

Relación X vs Y: Y vs tiempo

Code
# Time series of porc_volumen for the CHUZA reservoir as a line.
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = fecha, y = porc_volumen)) +
  geom_line()

Code
# Time series of porc_volumen for the CHUZA reservoir as a filled area.
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = fecha, y = porc_volumen)) +
  geom_area(color = "forestgreen", fill = "forestgreen", alpha = 0.5)

Code
# Time series of porc_volumen for CHUZA with a smoothed trend overlaid
# (geom_smooth() with its default settings).
filter(datos, embalse == "CHUZA") |>
  ggplot(mapping = aes(x = fecha, y = porc_volumen)) +
  geom_line() +
  geom_smooth()

Asociación de variables: y vs x

Code
# Reshape to wide format: one column per reservoir holding its porc_volumen.
# All remaining columns of `datos` act as row identifiers.
datos_ancho <- pivot_wider(
  datos,
  names_from = embalse,
  values_from = porc_volumen
)
Code
# Scatter plot of PLAYAS against PENOL.
ggplot(datos_ancho, mapping = aes(x = PENOL, y = PLAYAS)) +
  geom_point()

Code
# 2D density contours of PLAYAS against PENOL.
ggplot(datos_ancho, mapping = aes(x = PENOL, y = PLAYAS)) +
  geom_density_2d()

Code
# Scatter plot of PLAYAS against PENOL with a fitted linear-model line.
ggplot(datos_ancho, mapping = aes(x = PENOL, y = PLAYAS)) +
  geom_point() +
  geom_smooth(method = "lm")

  • ¿Cuál es la correlación entre los niveles de los embalses Peñol y Playas?
Code
# Correlation between PENOL and PLAYAS (cor() default method), using only
# the rows where both values are present.
with(
  datos_ancho,
  cor(x = PENOL, y = PLAYAS, use = "pairwise.complete.obs")
)
[1] 0.5148998
Code
# Correlation matrix across all reservoir columns (from AGREGADO BOGOTA to
# ALTOANCHICAYA), computed pairwise on complete observations.
cor(
  select(datos_ancho, `AGREGADO BOGOTA`:ALTOANCHICAYA),
  use = "pairwise.complete.obs"
)
                AGREGADO BOGOTA       AMANI      BETANIA     CALIMA1
AGREGADO BOGOTA      1.00000000 0.041133663  0.076019392  0.31109336
AMANI                0.04113366 1.000000000  0.008134462  0.71550669
BETANIA              0.07601939 0.008134462  1.000000000  0.09249142
CALIMA1              0.31109336 0.715506693  0.092491417  1.00000000
CHUZA               -0.09378850 0.144845967  0.045480168  0.08066138
EL QUIMBO            0.34166096 0.367395913  0.169216761  0.43674644
ESMERALDA            0.43158201 0.342507403  0.013191343  0.27638579
GUAVIO               0.36135592 0.376343164  0.086475629  0.38713535
ITUANGO              0.45917072 0.259572540  0.227205611  0.51346889
MIRAFLORES           0.45124060 0.646525254  0.017929900  0.65742622
MUNA                 0.23897660 0.129269439  0.286518642  0.26207255
PENOL                0.25654007 0.744149498  0.048064694  0.69571870
PLAYAS              -0.07100517 0.391403774 -0.041935047  0.40535008
PORCE II            -0.17462506 0.364008035  0.069178738  0.22465949
PORCE III           -0.08711553 0.267574223  0.106191804  0.16687024
PRADO               -0.09597060 0.527977934  0.226004372  0.61263430
PUNCHINA            -0.08086221 0.262976892  0.007265291  0.14299005
RIOGRANDE2           0.30743322 0.763724631  0.139950966  0.68796374
SALVAJINA            0.04944611 0.314089139  0.157748883  0.44696069
SAN LORENZO          0.20192065 0.722126953  0.061130940  0.54413438
TOPOCORO             0.25477294 0.843541787  0.097295144  0.75185196
TRONERAS            -0.14092429 0.343938410  0.118837843  0.25156984
URRA1                0.29840894 0.635264103  0.059628566  0.49049291
ALTOANCHICAYA       -0.09647304 0.111282329  0.098218364 -0.00827293
                      CHUZA    EL QUIMBO    ESMERALDA     GUAVIO    ITUANGO
AGREGADO BOGOTA -0.09378850  0.341660956  0.431582014 0.36135592 0.45917072
AMANI            0.14484597  0.367395913  0.342507403 0.37634316 0.25957254
BETANIA          0.04548017  0.169216761  0.013191343 0.08647563 0.22720561
CALIMA1          0.08066138  0.436746444  0.276385788 0.38713535 0.51346889
CHUZA            1.00000000  0.445154982  0.659896960 0.72445043 0.56577465
EL QUIMBO        0.44515498  1.000000000  0.544181737 0.73348033 0.68757960
ESMERALDA        0.65989696  0.544181737  1.000000000 0.84363618 0.17009125
GUAVIO           0.72445043  0.733480329  0.843636177 1.00000000 0.50627325
ITUANGO          0.56577465  0.687579596  0.170091247 0.50627325 1.00000000
MIRAFLORES       0.33880572  0.543409198  0.668571334 0.68103591 0.46074503
MUNA             0.12821849  0.280799979  0.207334275 0.25573427 0.48338606
PENOL            0.27547523  0.523172997  0.513989354 0.59224586 0.68269381
PLAYAS           0.18464299  0.171000286  0.252028204 0.24297821 0.27532916
PORCE II         0.13063686  0.070927248  0.053259671 0.16269518 0.12743912
PORCE III        0.12140032  0.095455945  0.050045996 0.12873142 0.13778237
PRADO           -0.16619709  0.238888032 -0.153570672 0.02280428 0.44713232
PUNCHINA         0.19965817  0.068111034  0.195927063 0.21516099 0.06390521
RIOGRANDE2       0.22580337  0.495249811  0.489057298 0.60251915 0.38858804
SALVAJINA       -0.09613891  0.330835612 -0.001611343 0.13117939 0.22164256
SAN LORENZO      0.45273044  0.507806358  0.623280879 0.69908647 0.37920110
TOPOCORO         0.24799015  0.537841028  0.469877688 0.57783627 0.59701078
TRONERAS         0.34274071  0.337142905  0.224847113 0.35282037 0.20053572
URRA1            0.36700664  0.483855012  0.618105789 0.72832928 0.28150188
ALTOANCHICAYA    0.14823418 -0.004743978  0.092367476 0.10844314 0.12371424
                MIRAFLORES       MUNA      PENOL      PLAYAS    PORCE II
AGREGADO BOGOTA 0.45124060 0.23897660 0.25654007 -0.07100517 -0.17462506
AMANI           0.64652525 0.12926944 0.74414950  0.39140377  0.36400803
BETANIA         0.01792990 0.28651864 0.04806469 -0.04193505  0.06917874
CALIMA1         0.65742622 0.26207255 0.69571870  0.40535008  0.22465949
CHUZA           0.33880572 0.12821849 0.27547523  0.18464299  0.13063686
EL QUIMBO       0.54340920 0.28079998 0.52317300  0.17100029  0.07092725
ESMERALDA       0.66857133 0.20733428 0.51398935  0.25202820  0.05325967
GUAVIO          0.68103591 0.25573427 0.59224586  0.24297821  0.16269518
ITUANGO         0.46074503 0.48338606 0.68269381  0.27532916  0.12743912
MIRAFLORES      1.00000000 0.18043561 0.78853017  0.40089595  0.24404649
MUNA            0.18043561 1.00000000 0.26740592  0.17386840  0.09357301
PENOL           0.78853017 0.26740592 1.00000000  0.51489977  0.39773943
PLAYAS          0.40089595 0.17386840 0.51489977  1.00000000  0.34423827
PORCE II        0.24404649 0.09357301 0.39773943  0.34423827  1.00000000
PORCE III       0.28046045 0.12083169 0.31465199  0.30557715  0.43406906
PRADO           0.25922613 0.22398684 0.38330871  0.27150639  0.13239018
PUNCHINA        0.20677836 0.15605460 0.28913664  0.29751925  0.31487575
RIOGRANDE2      0.80929197 0.32948512 0.84774045  0.40894148  0.39706313
SALVAJINA       0.11696417 0.29046675 0.25767252  0.06898259 -0.01169367
SAN LORENZO     0.72727031 0.21915286 0.71039654  0.37103617  0.38444930
TOPOCORO        0.71039530 0.35556809 0.83213542  0.45267555  0.37751046
TRONERAS        0.29397245 0.26497288 0.44639117  0.42705192  0.52981811
URRA1           0.74401843 0.08343807 0.67682057  0.29933848  0.33263293
ALTOANCHICAYA   0.05227836 0.18192698 0.07920718  0.14231576  0.12040414
                  PORCE III       PRADO     PUNCHINA RIOGRANDE2    SALVAJINA
AGREGADO BOGOTA -0.08711553 -0.09597060 -0.080862209  0.3074332  0.049446114
AMANI            0.26757422  0.52797793  0.262976892  0.7637246  0.314089139
BETANIA          0.10619180  0.22600437  0.007265291  0.1399510  0.157748883
CALIMA1          0.16687024  0.61263430  0.142990050  0.6879637  0.446960692
CHUZA            0.12140032 -0.16619709  0.199658167  0.2258034 -0.096138906
EL QUIMBO        0.09545595  0.23888803  0.068111034  0.4952498  0.330835612
ESMERALDA        0.05004600 -0.15357067  0.195927063  0.4890573 -0.001611343
GUAVIO           0.12873142  0.02280428  0.215160991  0.6025192  0.131179393
ITUANGO          0.13778237  0.44713232  0.063905210  0.3885880  0.221642559
MIRAFLORES       0.28046045  0.25922613  0.206778364  0.8092920  0.116964174
MUNA             0.12083169  0.22398684  0.156054596  0.3294851  0.290466751
PENOL            0.31465199  0.38330871  0.289136637  0.8477405  0.257672519
PLAYAS           0.30557715  0.27150639  0.297519255  0.4089415  0.068982592
PORCE II         0.43406906  0.13239018  0.314875748  0.3970631 -0.011693673
PORCE III        1.00000000  0.15511457  0.251104413  0.3504755 -0.026756487
PRADO            0.15511457  1.00000000  0.016050474  0.4525739  0.552719988
PUNCHINA         0.25110441  0.01605047  1.000000000  0.2299305  0.005916317
RIOGRANDE2       0.35047551  0.45257392  0.229930549  1.0000000  0.280157397
SALVAJINA       -0.02675649  0.55271999  0.005916317  0.2801574  1.000000000
SAN LORENZO      0.36949596  0.23100262  0.357200844  0.7974894  0.219918208
TOPOCORO         0.30325670  0.49908944  0.313488113  0.8549322  0.504323150
TRONERAS         0.45546836  0.14580332  0.371664810  0.4524165  0.078543981
URRA1            0.28283160  0.18889093  0.263439866  0.7276359  0.100604312
ALTOANCHICAYA    0.11739917  0.11675999  0.167198178  0.1168749  0.049517452
                SAN LORENZO   TOPOCORO    TRONERAS      URRA1 ALTOANCHICAYA
AGREGADO BOGOTA  0.20192065 0.25477294 -0.14092429 0.29840894  -0.096473043
AMANI            0.72212695 0.84354179  0.34393841 0.63526410   0.111282329
BETANIA          0.06113094 0.09729514  0.11883784 0.05962857   0.098218364
CALIMA1          0.54413438 0.75185196  0.25156984 0.49049291  -0.008272930
CHUZA            0.45273044 0.24799015  0.34274071 0.36700664   0.148234180
EL QUIMBO        0.50780636 0.53784103  0.33714291 0.48385501  -0.004743978
ESMERALDA        0.62328088 0.46987769  0.22484711 0.61810579   0.092367476
GUAVIO           0.69908647 0.57783627  0.35282037 0.72832928   0.108443136
ITUANGO          0.37920110 0.59701078  0.20053572 0.28150188   0.123714242
MIRAFLORES       0.72727031 0.71039530  0.29397245 0.74401843   0.052278356
MUNA             0.21915286 0.35556809  0.26497288 0.08343807   0.181926981
PENOL            0.71039654 0.83213542  0.44639117 0.67682057   0.079207180
PLAYAS           0.37103617 0.45267555  0.42705192 0.29933848   0.142315762
PORCE II         0.38444930 0.37751046  0.52981811 0.33263293   0.120404143
PORCE III        0.36949596 0.30325670  0.45546836 0.28283160   0.117399171
PRADO            0.23100262 0.49908944  0.14580332 0.18889093   0.116759994
PUNCHINA         0.35720084 0.31348811  0.37166481 0.26343987   0.167198178
RIOGRANDE2       0.79748940 0.85493221  0.45241645 0.72763589   0.116874921
SALVAJINA        0.21991821 0.50432315  0.07854398 0.10060431   0.049517452
SAN LORENZO      1.00000000 0.79710656  0.48266125 0.77545330   0.127249131
TOPOCORO         0.79710656 1.00000000  0.44415180 0.73611776   0.108537181
TRONERAS         0.48266125 0.44415180  1.00000000 0.34304283   0.183150446
URRA1            0.77545330 0.73611776  0.34304283 1.00000000   0.084690914
ALTOANCHICAYA    0.12724913 0.10853718  0.18315045 0.08469091   1.000000000
  • Opción por defecto:
Code
library(corrplot)

# Plot the pairwise correlation matrix of the reservoir columns with
# corrplot's default settings.
select(datos_ancho, `AGREGADO BOGOTA`:ALTOANCHICAYA) |>
  cor(use = "pairwise.complete.obs") |>
  corrplot()

  • Mejorando el gráfico:
Code
# Customised correlation plot: pie glyphs, lower triangle only, no diagonal,
# black labels rotated 35 degrees.
select(datos_ancho, `AGREGADO BOGOTA`:ALTOANCHICAYA) |>
  cor(use = "pairwise.complete.obs") |>
  corrplot(
    method = "pie",
    type = "lower",
    diag = FALSE,
    tl.col = "black",
    tl.srt = 35
  )

Code
library(corrr)

# Correlate the reservoir columns with corrr and draw the result as a
# network plot.
select(datos_ancho, `AGREGADO BOGOTA`:ALTOANCHICAYA) |>
  correlate() |>
  network_plot()