diff --git a/Advance analytics course/Appendix-1_1.qmd b/Advance analytics course/Appendix-1_1.qmd new file mode 100644 index 0000000..a4adbdb --- /dev/null +++ b/Advance analytics course/Appendix-1_1.qmd @@ -0,0 +1,1752 @@ +# Appendix 1_1 {.appendix, .unnumbered} + +### Nathaly Paez \[05\] + +#### Maestría en Matemáticas Aplicadas y Ciencias de la Computación + +#### April 2025 + +Advance analytics course + +## Augmented Data Analyst {.unnumbered} + +```{r} +#| include: false +source('setup.R') + +# Load the dataset +delitos_data <- st_read("data/spatial/crime_spatial_course.gpkg") +delitos_data <- delitos_data[delitos_data$dpto_ccdgo == '05', ] +``` + +> delitos_data \<- delitos_data\[delitos_data\$dpto_ccdgo == c ('05'), \] \> dim(delitos_data) \[1\] 49610 91 + +In this line the 05 code represents the Antioquia department from the Dane data base codes. The line dim(delitos_data) give us a result about how big is our data base, the distribution in rows (49610) and columns (91). + +The summary code show us the different kind of crimes, misdemeanors, offenses or infractions in a period of time. For example, for this exercises we choose 24HP that means "hurto a personas en 2024". + +```{r} +# interactive polygons location +leaflet(delitos_data) %>% + addTiles() %>% # Base map + addPolygons(color = "steelblue", weight = 1, fillOpacity = 0.5) + +# quantile +quantile(delitos_data$sum_24HP, probs = seq(0, 1, 0.1), na.rm = TRUE) + +#boxplot +boxplot(delitos_data$sum_24HP, main = "Boxplot of Robberies to people (HP) in Antioquia", horizontal = TRUE) + +# cuantile and decil dynamic map +delitos_data$sum_24HP_jittered <- delitos_data$sum_24HP + runif(nrow(delitos_data), -0.0001, 0.0001) + +delitos_data$decil <- cut(delitos_data$sum_24HP_jittered, + breaks = quantile(delitos_data$sum_24HP_jittered, probs = seq(0, 1, 0.1), na.rm = TRUE), + labels = paste0("D", 1:10), + include.lowest = TRUE) + +# Define Color Palette for Deciles +custom_palette <- c( + "#F0F9E8", # D1 - Lightest green + "#BAE4BC", # D2 + "#7BCCC4", # D3 + "#43A2CA", # D4 + "#0868AC", # D5 - Light Blue + "#FEE08B", # D6 - Yellow + "#FDAE61", # D7 - Orange + "#F46D43", # D8 - Reddish Orange + "#D73027", # D9 - Strong Red + "#A50026" # D10 - Darkest Red (Highest Decile) +) +palette_decil <- colorFactor(palette = custom_palette, domain = delitos_data$decil) + +# Create the Decile Map using Leaflet +leaflet(delitos_data[order(delitos_data$sum_24HP, decreasing = TRUE)[1:10], ]) %>% + addTiles() %>% + addPolygons( + fillColor = ~palette_decil(decil), + color = "black", weight = 1, opacity = 0.5, + fillOpacity = 0.7, + popup = ~paste("Crime Rate:", round(sum_24HP, 2), "
Decile:", decil) + ) %>% + addLegend(pal = palette_decil, values = delitos_data$decil, + title = "Decile Classification", position = "bottomright") + +# Drop delitos_data$sum_24HP_jittered +delitos_data$sum_24HP_jittered <- NULL + +``` + +In quantiles, we can observe that up to 80% of the records have a value of 0, meaning that no robberies were recorded. At the 90th percentile, the value of 1 indicates that only 10% of the observations have a value greater than 1. The maximum value, at the 100th percentile, is 699, which can be interpreted as at least one area having a significantly higher concentration of robberies compared to the majority of the data. + +In the boxplot, we can observe that the distribution of the variables is asymmetric, confirming that the vast majority of the data has values close to zero. Additionally, the presence of scattered outliers on the right is clearly visible. Lastly, the maps display the spatial distribution of robberies targeting individuals, making their territorial patterns evident. + +## Skewness + +```{r} +#| echo: true +#| message: false +#| warning: false + +# step by step +n <- length(delitos_data$sum_24HP) +mean_x <- mean(delitos_data$sum_24HP) +sd_x <- sd(delitos_data$sum_24HP) # Uses (n-1) denominator +z_scores <- (delitos_data$sum_24HP - mean_x) / sd_x +z_cubed <- z_scores^3 +sum_cubed <- sum(z_cubed) +skewness <- (n / ((n - 1) * (n - 2))) * sum_cubed +paste0('sum_24HP: ', skewness) + +# function +skewness(delitos_data$sum_24HP, na.rm = TRUE) + +# skewness +delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise(across(everything(), ~ skewness(.x, na.rm = TRUE))) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Skewness = V1) %>% + gt() +``` + +> paste0('sum_24HP: ', skewness) \[1\] "sum_24HP: 108.920265681" \> \> \# function \> skewness(delitos_data\$sum_24HP, na.rm = TRUE) \[1\] 108.917 + +**Antioquia:** A skewness of 0 or close to 0 indicates a symmetric distribution. If it is greater than 1, the skew is to the right, and if it is negative, the skew is to the left. Consequently, in our data, the skewness value of 108.92 represents extreme asymmetry, providing further evidence of the inequality in the distribution of our data. + +## Kurtosis + +```{r} +#| echo: true +#| message: false +#| warning: false + +# step by step +z_fourth <- z_scores^4 +sum_fourth <- sum(z_fourth) +kurtosis <- ((n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3))) * sum_fourth - (3 * (n - 1)^2) / ((n - 2) * (n - 3)) +print(kurtosis) + +# function +kurtosis(delitos_data$sum_24HP, na.rm = TRUE) + +# Kurtosis +delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise(across(everything(), ~ kurtosis(.x, na.rm = TRUE))) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Kurtosis = V1) %>% + gt() +``` + +------------------------------------------------------------------------ + +**Antioquia:** the kurtosis calculation yields a value of 17488.53, which is exceptionally high. This suggests a distribution with most values concentrated near the mean while also exhibiting a substantial number of extreme values (outliers). In other words, the distribution has heavy tails, making it highly leptokurtic. + +Image take from: + +function \> kurtosis(delitos_data\$sum_24HP, na.rm = TRUE) \[1\] 17488.53 + +## Coefficient of Variation + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute statistics +mean_val <- mean(delitos_data$sum_24HP, na.rm = TRUE) +print(mean_val) +std_dev <- sd(delitos_data$sum_24HP, na.rm = TRUE) +print(std_dev) + +# Compute the range for first standard deviation +lower_bound <- mean_val - std_dev +upper_bound <- mean_val + std_dev +paste0('lower_bound: ', round(lower_bound, 2), ' - upper_bound: ', round(upper_bound, 2)) + +# Count the number of points within 1 standard deviation +within_1sd <- sum(delitos_data$sum_24HP >= lower_bound & delitos_data$sum_24HP <= upper_bound, na.rm = TRUE) +percentage_1sd <- (within_1sd / nrow(delitos_data)) * 100 +paste0('within_1sd: ', round(within_1sd, 2), ' - percentage_1sd: ', round(percentage_1sd, 2)) + +# Create histogram +ggplot(delitos_data, aes(x = sum_24HP)) + + geom_histogram(binwidth = 5, fill = "blue", alpha = 0.5, color = "black") + + + # Add vertical lines for mean, median, and 1st SD + geom_vline(aes(xintercept = mean_val), color = "red", linetype = "dashed", size = 1.2) + + #geom_vline(aes(xintercept = median_val), color = "green", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = lower_bound), color = "purple", linetype = "dashed", size = 1) + + geom_vline(aes(xintercept = upper_bound), color = "purple", linetype = "dashed", size = 1) + + + # Labels and title + labs(title = "Histogram of AUTOMOTORES with Mean, and 1SD Range", + x = "AUTOMOTORES Values", y = "Frequency") + + + # Add annotation for 1SD range + annotate("text", x = mean_val, y = 10, + label = paste(round(percentage_1sd, 1), "1SD", sep = ""), + color = "black", size = 5, hjust = 0.5, vjust = -1) + + + theme_minimal() + +# cv +paste0('cv: ', round(std_dev / mean_val * 100), 2) + +# variation +delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise( + across( + everything(), + ~ ifelse(mean(.x, na.rm = TRUE) != 0, + sd(.x, na.rm = TRUE) / mean(.x, na.rm = TRUE), + NA), # Compute CV safely + .names = "{col}" + ) + ) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Variation = V1) %>% + gt() + +``` + +------------------------------------------------------------------------ + +**Antioquia:** since the coefficient of variation (CV) is used to compare the variability of datasets with different units or widely differing means, and a lower CV indicates less variability relative to the mean, we observe that a CV of 10.262% suggests high variability. + +Regarding the mean, which is a measure of central tendency (meaning its calculated value represents the center of the dataset) a result of 0.398 suggests that most areas have few or no recorded events. This implies that the mean may be influenced by a few high values, making it an unreliable representation of the central tendency. + +Additionally, the standard deviation, which measures the dispersion of the data relative to the mean, is 4.084 in this case. This indicates that values can vary by approximately ±4.084 from the mean of 0.398, further confirming the high variability in the dataset. + +Finally, the ranges within the standard deviation variation (1SD) are calculated using the lower limit as the mean minus the standard deviation and the upper limit as the mean plus the standard deviation. The ranges within the 1SD variation, with a lower limit of -3.69 and an upper limit of 4.48, indicate that the events are highly skewed. This is because most values fall between 0 and 4 (with negative numbers being adjusted to 0). In this dataset, negative values are not possible. If negative values appear, it suggests that the data distribution is highly asymmetric. + +**Prompt** : What is the meaning of having percentages of values within 1SD of "within_1sd: 48877 - percentage_1sd: 98.52"? Explain in a concise paragraph both numbers. + +The result **"within_1sd: 48,877 - percentage_1sd: 98.52"** indicates that **48,877 observations (98.52% of the dataset)** fall within one standard deviation (±1SD) of the mean. In a **normally distributed dataset**, approximately **68%** of values should fall within this range. However, in this case, nearly **99% of the values** are concentrated within this narrow range, suggesting that the dataset is **highly skewed** with most values being very close to the mean and a few extreme outliers inflating the standard deviation. This confirms that **the standard deviation is not a reliable measure of spread for these data**, and alternative measures such as percentiles or transformations should be considered. + +## Median Absolute Deviation MAD and MAD/median + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute statistics +median_val <- median(delitos_data$sum_24HP, na.rm = TRUE) +print(median_val) +mad_val <- mad(delitos_data$sum_24HP, na.rm = TRUE) # Compute MAD +print(mad_val) + +# Compute the range for first standard deviation +lower_bound <- median_val - mad_val +upper_bound <- median_val + mad_val +paste0('lower_bound: ', round(lower_bound, 2), ' - upper_bound: ', round(upper_bound, 2)) + +# Count the number of points within 1 MAD +within_1mad <- sum(delitos_data$sum_24HP >= lower_bound & delitos_data$sum_24HP <= upper_bound, na.rm = TRUE) +percentage_1mad <- (within_1mad / nrow(delitos_data)) * 100 +paste0('within_1mad: ', round(within_1mad, 2), ' - percentage_1mad: ', round(percentage_1mad, 2)) + +# Create histogram +ggplot(delitos_data, aes(x = sum_24HP)) + + geom_histogram(binwidth = 5, fill = "blue", alpha = 0.5, color = "black") + + + # Add vertical lines for mean, median, and 1st SD + #geom_vline(aes(xintercept = mean_val), color = "red", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = median_val), color = "green", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = lower_bound), color = "purple", linetype = "dashed", size = 1) + + geom_vline(aes(xintercept = upper_bound), color = "purple", linetype = "dashed", size = 1) + + + # Labels and title + labs(title = "Histogram of AUTOMOTORES with Median, and 1MAD Range", + x = "AUTOMOTORES Values", y = "Frequency") + + + # Add annotation for 1SD range + annotate("text", x = median_val, y = 10, + label = paste(within_1mad, "points (", round(percentage_1mad, 1), "1MAD", sep = ""), + color = "black", size = 5, hjust = 0.5, vjust = -1) + + + theme_minimal() + +# MAD/Median +paste0('MAD/Median: ', round(mad_val / median_val * 100), 2) + +``` + +------------------------------------------------------------------------ + +**Antioquia:** + +In the case of **Antioquia** and the sum_24HP variable, when calculating the median and the Median Absolute Deviation (MAD), the results show that the dataset has an extremely high concentration of zero values, with little overall variability and only a few extreme values. The calculated median was 0, meaning that at least 50% of the observations reported no personal thefts during the analyzed year. Additionally, the MAD was also 0, confirming that the majority of values are exactly at the median, that is, zero. This finding aligns with the calculation of the ±1 MAD range, which only covers the value 0 and shows that 85.44% of the observations fall within this range. Thus, only 14.56% of the dataset corresponds to areas where personal thefts were actually recorded. When calculating the MAD/Median ratio, the result was undefined (NaN), because the median value is 0, highlighting that traditional dispersion metrics are ineffective in this context. In conclusion, in 85.44% of Antioquia, no personal thefts were recorded, indicating that thefts are concentrated in specific areas of the department. This highlights the need for intervention strategies and measures not only from law enforcement but also through active citizen participation. The state should implement prevention and control activities in collaboration with the community. + +**Prompt** : What is the interpretation of these results? It is correct the interpretation that i have. + +## Covariance Matrix + +```{r} +#| echo: true +#| message: false +#| warning: false +delitos_data %>% + st_drop_geometry() %>% + select(contains("24")) %>% + cov() %>% + round(2) %>% + knitr::kable(digits = 2, caption = "Covariance Matrix") +``` + +**Antioquia:** + +In the case of "Hurto a Personas" (sum_24HP), the variance is 16.68, indicating a high dispersion in its values. This can be confirmed by the fact that the mean is 0.3982, while the maximum value reaches 699, evidencing the presence of extreme or outlier values. + +Regarding covariance, the associations obtained are moderately positive, such as with sum_24VI (3.07), sum_24DS (1.50), sum_24LP (1.04), and sum_24HM (0.70), indicating that as sum_24HP increases, these variables tend to increase as well. It can also be observed that most of the other variables show weak or null relationships. + +Therefore, since sum_24HP and the aforementioned variables increase simultaneously, there is evidence of a positive covariance. There is no evidence that when sum_24HP increases, another variable decreases systematically. Thus, there is no negative covariance, even when some variables have values equal to zero, as there is no negativity or inversely proportional relationship observed. + +------------------------------------------------------------------------ + +## Covariance Matrix of Log-Transformed Data + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Define the dataset +x <- delitos_data$sum_24HP + +# 1. Compute Raw Arithmetic Mean +arithmetic_mean <- mean(x) +print(arithmetic_mean) + +# 2. Compute Log-Mean (Multiplicative Center) +log_x <- log(x + 1) # Take logarithm of values +head(log_x) +log_mean <- mean(log_x) # Compute mean in log-space +print(log_mean) +log_mean_exp <- exp(log_mean) # Convert back to original scale +print(log_mean_exp) + +# Create the comparison table +comparison_table <- data.frame( + Index = seq_along(x), # Just an index for x-axis + Original_Value = x, + Log_Value = log_x +) + +p1 <- ggplot(comparison_table, aes(x = Original_Value, y = Log_Value)) + + geom_line(color = "gray70", size = 0.7, alpha = 0.5) + # Thin line connecting points + geom_point(alpha = 0.7, color = "blue") + # Scatter points with transparency + labs( + title = "Scatter Plot: Original vs. Log-Transformed Values", + x = "Original Values", + y = "Log-Transformed Values" + ) + + theme_minimal() + +# Add marginal histogram +ggMarginal( + p1, + type = "histogram", # Add marginal histograms + bins = 40, # Number of bins for the histogram + margins = "both", # Add histogram to both x and y margins + size = 5, # Size of the histograms relative to the scatter plot + fill = "gray", # Fill color for the histogram + color = "black", # Outline color for the histogram + alpha = 0.5 # Transparency +) + +``` + +**Antioquia:** + +``` +> # 1. Compute Raw Arithmetic Mean > arithmetic_mean <- mean(x) > print(arithmetic_mean) [1] 0.3981858 > +> # 2. Compute Log-Mean (Multiplicative Center) +> log_x <- log(x + 1) # Take logarithm of values +> head(log_x) +[1] 0.0000000 2.1972246 0.0000000 0.0000000 0.0000000 0.6931472 +> log_mean <- mean(log_x) # Compute mean in log-space +> print(log_mean) +[1] 0.148689 +> log_mean_exp <- exp(log_mean) # Convert back to original scale +> print(log_mean_exp) +[1] 1.160312 +``` + +When a logarithmic transformation is applied to the data, the goal is to help reduce the asymmetry of the data, especially when dealing with very large values and a skewed distribution. Additionally, it converts multiplicative relationships into additive ones, which facilitates analysis in statistical models. In our case, it also helps to more easily manage and interpret the data due to variability across several orders of magnitude. Thus, by applying this logarithmic transformation, variance is stabilized, allowing patterns to be observed more clearly (Atkinson, 2020). First, we observe the original mean of 0.3981, indicating low values in the records of our target variable. In the logarithmic transformation, 1 is added to each value to avoid issues with zeros (log(x + 1)), resulting in a transformed mean of 0.1487. Subsequently, this mean is translated back to the original scale using an exponential function, since the transformed values are no longer in their natural scale and no longer represent the actual number of events or quantities, but rather their order of magnitude. Therefore, the inverse operation of the logarithm, the exponential function, must be applied, yielding 1.1603. This measure of central tendency is more robust as it is not influenced by extreme values. + +In addition, the scatter plot between the original values and their corresponding log-transformed values illustrates how the logarithmic scale compresses extreme values—particularly those significantly above the mean—making nonlinear relationships more manageable. This property is especially important in contexts where the data span several orders of magnitude or contain outliers, as previously mentioned. + +The plot also includes marginal histograms that allow for a visual comparison of the data distribution before and after the transformation. While the original variable exhibits a strong concentration of low values and a long right tail, the histogram of the transformed values suggests a more symmetrical distribution. + +**Reference:** + +Atkinson, A. (2020). The Box-Cox Transformation: Review and Extensions. The London School of Economics. + +```{r} +#| include: false +# Store values for inline Quarto text +log_values <- paste(round(head(comparison_table$Log_Value), 2), collapse = ", ") +original_values <- paste(head(comparison_table$Original_Value), collapse = ", ") +``` + +```{r} +#| echo: true +#| message: false +#| warning: false + +#log transformed data +# Compute statistics for raw and log-transformed data +mean_raw <- mean(delitos_data$sum_24HP, na.rm = TRUE) +sd_raw <- sd(delitos_data$sum_24HP, na.rm = TRUE) +mad_raw <- mad(delitos_data$sum_24HP, na.rm = TRUE) + +delitos_data_log <- delitos_data %>% + #mutate(LOG_AUTOMOTORES = log(AUTOMOTORES + 1)) + mutate(LOG_AUTOMOTORES = log1p(sum_24HP)) # log1p(x) = log(1 + x) to handle zeros + +mean_log <- mean(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) +sd_log <- sd(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) +mad_log <- mad(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) + +# Compute statistics for raw and log-transformed data +data.frame( + Measure = c("Mean", "Median", "Standard Deviation", "MAD"), + Raw_Data = c(mean(delitos_data$sum_24HP, na.rm = TRUE), + median(delitos_data$sum_24HP, na.rm = TRUE), + sd(delitos_data$sum_24HP, na.rm = TRUE), + mad(delitos_data$sum_24HP, na.rm = TRUE)), + Log_Transformed_Data = c(mean(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + median(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + sd(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + mad(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE))) + +# Transform the data to a long format for ggplot +delitos_long <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + pivot_longer(cols = everything(), names_to = "Crime Type", values_to = "Value") + +# Create faceted histograms +ggplot(delitos_long, aes(x = Value)) + + geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) + + facet_wrap(~ `Crime Type`, scales = "free") + # Facet by crime type + theme_minimal() + + labs( + title = "Distributions of Crime Data", + x = "Value", + y = "Frequency" + ) + + theme( + axis.text.x = element_text(size = 5) # Reduce the font size of X-axis text + ) + +# Transform the data to long format and apply log transformation +delitos_long_log <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + mutate(across(everything(), ~ log(.x), .names = "{col}")) %>% # Log transform (log(x + 1) to avoid log(0)) + pivot_longer(cols = everything(), names_to = "Crime Type", values_to = "Log Value") + +# Create faceted histograms for log-transformed values +ggplot(delitos_long_log, aes(x = `Log Value`)) + + geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) + + facet_wrap(~ `Crime Type`, scales = "free") + # Facet by crime type + theme_minimal() + + labs( + title = "Log-Transformed Distributions of Crime Data", + x = "Log Value", + y = "Frequency" + ) + + theme( + axis.text.x = element_text(size = 3) # Reduce the font size of X-axis text + ) + +# Covariance Matrix (Log-Transformed) +delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + mutate(across(everything(), ~ log(.x+1))) %>% # Log-transform (+1 to handle zeros) + cov() %>% + round(2) %>% + kable(digits = 2, caption = "Covariance Matrix (Log-Transformed)") + +``` + +The results of the first table summarize key statistical measures before and after the application of the logarithmic transformation, including the mean, median, standard deviation, and MAD. A noticeable decrease in both the mean and standard deviation is observed on the logarithmic scale. This reflects the effect of the transformation, which reduces dispersion and compresses extreme values. No changes are seen in the median and MAD, as these zero values indicate the strong asymmetry present in the original data, where many records report zero crimes. + +Moving to the histograms of the raw values, we observe a high concentration of frequencies at zero with long right tails. After applying the logarithmic transformation, the resulting histograms show significantly compressed distributions. Although the data remain skewed, the right tails are much less pronounced, and the values span a smaller range. + +Finally, the covariance matrix of the log-transformed variables provides a much more interpretable structure, as high-frequency crimes no longer dominate the total variance. This makes the dataset more suitable for subsequent multivariate analyses, such as Principal Component Analysis or clustering. + +## Redundant Variables + +Redundant variables provide little additional information due to high correlation with others, leading to multicollinearity in models. + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Define the matrix A +matrix_a <- matrix(c(4, 2, + 2, 3), nrow = 2, byrow = TRUE) +print("Matrix A:") +print(matrix_a) + +# Compute the eigen decomposition using R's built-in eigen() function +eigen_builtin <- eigen(matrix_a) +print("Built-in eigen() values:") +print(eigen_builtin$values) +print("Built-in eigen() vectors:") +print(eigen_builtin$vectors) + +# Multiply A by the matrix of eigenvectors: +# Each column of eigen_builtin$vectors is an eigenvector. +res <- matrix_a %*% eigen_builtin$vectors +print("A * eigenvectors:") +print(res) + +# Multiply the eigenvector matrix by the diagonal matrix of eigenvalues. +res2 <- eigen_builtin$vectors %*% diag(eigen_builtin$values) +print("eigenvectors * eigenvalues:") +print(res2) + +# Check if these two matrices are equal (they should be equal within numerical precision) +are_equal <- all.equal(res, res2) +print("Are A * eigenvectors and eigenvectors * eigenvalues equal?") +print(are_equal) +``` + +This R script performs the eigen decomposition of the matrix 𝐴 = (4 2 ; 2 3) and verifies a fundamental property of this operation: that multiplying the original matrix by its eigenvectors yields the same result as multiplying those vectors by their corresponding eigenvalues. By obtaining the eigenvalues λ₁ = 5.56 and λ₂ = 1.44, the code confirms that both eigenvectors satisfy the relation A⋅v = λ⋅v. + +In the context of redundant variable analysis, a low eigenvalue (such as 1.44 compared to 5.56) may indicate that a linear combination of the variables contributes little to the total variability, suggesting redundancy or dependence among them. Additionally, this relationship, expressed as A⋅v = λ⋅v, confirms the correctness of the decomposition and illustrates how the matrix’s internal structure is captured by its eigen components. Such a property is crucial in various applications, including Principal Component Analysis (PCA)(IBM, 2023), dimensionality reduction, and the detection of variable redundancy in multivariate datasets. + +**Reference:** + +IBM. (2023, 8 de diciembre). *¿Qué es el análisis de componentes principales (PCA)?* IBM Think. + +**Prompt** : What improvements in writing could you recommend? + +------------------------------------------------------------------------ + +### Redundant Variables Detection + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Covariance matrix +cm_delitos_data <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) %>% + cov() + +# Compute eigenvalues and eigenvectors +eigen_results <- cm_delitos_data %>% eigen() + +# Extract eigenvalues and eigenvectors +eigenvalues <- eigen_results$values +eigenvectors <- eigen_results$vectors + +# Display eigenvalues and eigenvectors +print(eigenvalues) +head(eigenvectors) + +# The Smallest Eigenvalues +sort(eigenvalues, decreasing = FALSE) + +# The smallest eigenvalue is approximately zero +smallest_eigenvalue <- min(eigenvalues) +print(smallest_eigenvalue) + +# Corresponding eigenvector +smallest_eigenvector <- eigenvectors[, which.min(eigenvalues)] +print(smallest_eigenvector) + +# Normalize the eigenvector by dividing by the largest absolute value +normalized_eigenvector <- smallest_eigenvector / max(abs(smallest_eigenvector)) +print(normalized_eigenvector) + +# Sorted normalize the eigenvector +sort(abs(normalized_eigenvector), decreasing = T) + +# Get numeric variable names (order matches eigenvector indices) +variable_names <- colnames(cm_delitos_data) + +# Sort normalized eigenvector by absolute contribution (descending order) +sorted_contributions <- sort(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the indices of the top contributions +top_indices <- order(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the names of the top variables +top_variable_names <- variable_names[top_indices] + +# Print the top variable names +print(top_variable_names) + +# Fit a regression model to confirm the relationship +model <- lm(sum_24HP ~ sum_24HA + sum_24EX + sum_24HM + sum_24LP + sum_24HR + sum_24VI + sum_24HC + sum_24HOM + sum_24DS, + data = data.frame(delitos_data)) + +## This has to be done for all the variables, not just those with the lowest eigenvalues. We just make one example in the activity. + +# model <- lm(sum_24SE ~ sum_24DS + sum_24EX + sum_24SS + sum_24LP + +# sum_24HOM + sum_24HR + sum_24VI + sum_24HM + +# sum_24HA + sum_24HP + sum_24HC, +# data = data.frame(delitos_data)) +# +# model <- lm(sum_24SS ~ sum_24EX + sum_24HC + sum_24HR + +# sum_24HA + sum_24DS + sum_24HM + +# sum_24HOM + sum_24HP + sum_24VI + sum_24LP, +# data = data.frame(delitos_data)) +# +# model <- lm(sum_24HOM ~ sum_24HA + sum_24EX + sum_24HM + +# sum_24LP + sum_24HR + sum_24VI + +# sum_24HC + sum_24HP + sum_24DS, +# data = data.frame(delitos_data)) + +summary(model) + +# Variance Inflation Factors +vif(model) + + +``` + +**Antioquia** + +``` +> # Display eigenvalues and eigenvectors +> print(eigenvalues) + [1] 17.55691297 1.07591451 0.60209249 0.26881275 0.19908439 0.10952497 0.06262054 0.03352665 0.02274910 0.01283346 +> head(eigenvectors) + [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] +[1,] -0.0004231918 -0.003829432 0.005457511 0.0023751067 -0.006451881 0.005964127 -0.002573178 0.012437880 -0.014402920 +[2,] -0.0652493825 -0.382707853 0.851186294 0.3480378541 -0.002056865 0.042491533 0.029848617 -0.028814357 -0.007083756 +[3,] -0.1910203832 -0.798277248 -0.469181406 0.2077242218 -0.142848122 0.204009599 -0.018339401 -0.024190171 0.005368092 +[4,] -0.0916212757 -0.251803410 -0.033747533 -0.1222204689 0.132916257 -0.919619096 0.209431256 -0.066616391 -0.020276954 +[5,] -0.9731955314 0.222505190 0.023949267 0.0003688328 0.035100240 0.037187906 -0.013526268 -0.001955515 0.002515685 +[6,] -0.0175002118 -0.130458872 0.078607849 -0.2429392945 0.070744873 -0.136310264 -0.945114615 -0.021209897 -0.014020648 + [,10] +[1,] 0.9997518527 +[2,] -0.0069003915 +[3,] -0.0028786842 +[4,] 0.0068908447 +[5,] 0.0003391443 +[6,] -0.0014600195 +> +> # The Smallest Eigenvalues +> sort(eigenvalues, decreasing = FALSE) + [1] 0.01283346 0.02274910 0.03352665 0.06262054 0.10952497 0.19908439 0.26881275 0.60209249 1.07591451 17.55691297 +> +> # The smallest eigenvalue is approximately zero +> smallest_eigenvalue <- min(eigenvalues) +> print(smallest_eigenvalue) +[1] 0.01283346 +> +> # Corresponding eigenvector +> smallest_eigenvector <- eigenvectors[, which.min(eigenvalues)] +> print(smallest_eigenvector) + [1] 0.9997518527 -0.0069003915 -0.0028786842 0.0068908447 0.0003391443 -0.0014600195 -0.0048317931 -0.0167378062 0.0011175220 +[10] -0.0092655878 +> +> # Normalize the eigenvector by dividing by the largest absolute value +> normalized_eigenvector <- smallest_eigenvector / max(abs(smallest_eigenvector)) +> print(normalized_eigenvector) + [1] 1.0000000000 -0.0069021043 -0.0028793987 0.0068925550 0.0003392285 -0.0014603819 -0.0048329924 -0.0167419607 0.0011177994 +[10] -0.0092678876 +> +> # Sorted normalize the eigenvector +> sort(abs(normalized_eigenvector), decreasing = T) + [1] 1.0000000000 0.0167419607 0.0092678876 0.0069021043 0.0068925550 0.0048329924 0.0028793987 0.0014603819 0.0011177994 0.0003392285 +> +> # Get numeric variable names (order matches eigenvector indices) +> variable_names <- colnames(cm_delitos_data) +> +> # Sort normalized eigenvector by absolute contribution (descending order) +> sorted_contributions <- sort(abs(normalized_eigenvector), decreasing = TRUE) +> +> # Get the indices of the top contributions +> top_indices <- order(abs(normalized_eigenvector), decreasing = TRUE) +> +> # Get the names of the top variables +> top_variable_names <- variable_names[top_indices] +> +> # Print the top variable names +> print(top_variable_names) + [1] "sum_24HOM" "sum_24HA" "sum_24EX" "sum_24LP" "sum_24DS" "sum_24HC" "sum_24VI" "sum_24HR" "sum_24HM" "sum_24HP" +> +> # Fit a regression model to confirm the relationship +> model <- lm(sum_24HP ~ sum_24HA + sum_24EX + sum_24HM + sum_24LP + sum_24HR + sum_24VI + sum_24HC + sum_24HOM + sum_24DS, ++ data = data.frame(delitos_data)) +> +> ## This has to be done for all the variables, not just those with the lowest eigenvalues. We just make one example in the activity. +> +> # model <- lm(sum_24SE ~ sum_24DS + sum_24EX + sum_24SS + sum_24LP + +> # sum_24HOM + sum_24HR + sum_24VI + sum_24HM + +> # sum_24HA + sum_24HP + sum_24HC, +> # data = data.frame(delitos_data)) +> # +> # model <- lm(sum_24SS ~ sum_24EX + sum_24HC + sum_24HR + +> # sum_24HA + sum_24DS + sum_24HM + +> # sum_24HOM + sum_24HP + sum_24VI + sum_24LP, +> # data = data.frame(delitos_data)) +> # +> # model <- lm(sum_24HOM ~ sum_24HA + sum_24EX + sum_24HM + +> # sum_24LP + sum_24HR + sum_24VI + +> # sum_24HC + sum_24HP + sum_24DS, +> # data = data.frame(delitos_data)) +> +> summary(model) + +Call: +lm(formula = sum_24HP ~ sum_24HA + sum_24EX + sum_24HM + sum_24LP + + sum_24HR + sum_24VI + sum_24HC + sum_24HOM + sum_24DS, data = data.frame(delitos_data)) + +Residuals: + Min 1Q Median 3Q Max +-133.92 0.07 0.07 0.07 341.65 + +Coefficients: + Estimate Std. Error t value Pr(>|t|) +(Intercept) -0.06823 0.01341 -5.088 3.64e-07 *** +sum_24HA 1.00846 0.08414 11.986 < 2e-16 *** +sum_24EX 0.52432 0.07031 7.457 9.01e-14 *** +sum_24HM -0.34409 0.02777 -12.392 < 2e-16 *** +sum_24LP 0.26706 0.01738 15.369 < 2e-16 *** +sum_24HR -1.19276 0.04883 -24.427 < 2e-16 *** +sum_24VI 0.99880 0.01570 63.606 < 2e-16 *** +sum_24HC 1.56277 0.02753 56.763 < 2e-16 *** +sum_24HOM -0.20314 0.11297 -1.798 0.0722 . +sum_24DS 3.09406 0.03530 87.660 < 2e-16 *** +--- +Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + +Residual standard error: 2.851 on 49600 degrees of freedom +Multiple R-squared: 0.5129, Adjusted R-squared: 0.5128 +F-statistic: 5802 on 9 and 49600 DF, p-value: < 2.2e-16 + +> +> # Variance Inflation Factors +> vif(model) + sum_24HA sum_24EX sum_24HM sum_24LP sum_24HR sum_24VI sum_24HC sum_24HOM sum_24DS + 1.057830 1.205454 1.655481 1.292659 1.488089 2.225936 1.066578 1.003870 2.428226 +``` + +When detecting redundant variables, it is essential to clearly define our objective. For example, if our goal is to determine whether the variable sum_24HP is redundant with respect to the others, we only need to run a regression model using sum_24HP as the dependent variable and the remaining variables as predictors. However, if our objective is to identify all redundant variables in the dataset, then a separate model must be run for each variable, treating each one as the dependent variable against the rest. + +To detect these redundant variables, we start by calculating the covariance matrix, which provides the foundation for evaluating linear relationships among variables. Then, a spectral decomposition is performed to obtain the eigenvalues and eigenvectors. In the case of Antioquia, the smallest eigenvalue is 0.0128, which is very low compared to the largest eigenvalue (17.5569). This suggests that there exists a linear combination of variables with very low variance—indicating possible redundancy. Moreover, the eigenvector associated with the eigenvalue 0.0128 defines a linear combination where the variance is nearly zero, further supporting this idea. Within this eigenvector, the variable sum_24HOM has the highest relative weight (1.00 after normalization), followed by sum_24HA (0.0167) and sum_24HP (0.0093), which shows that these variables are highly involved in the direction of minimal variance and may be considered potentially redundant with one another. + +By normalizing the eigenvector (dividing each component by the largest absolute value), we obtain a relative scale where sum_24HOM retains an absolute value of 1.00, followed by sum_24HA (0.0167), sum_24HP (0.0093), and sum_24EX (0.0069). This reinforces the idea that these variables dominate the direction of lowest variance and are, therefore, primary candidates for redundancy in our dataset. + +When fitting the linear regression model using sum_24HP as the dependent variable, we observe an adjusted R² of 0.5128, indicating that nearly 51% of the variance in sum_24HP can be explained by a linear combination of the other variables. All coefficients were statistically significant (p \< 0.001), except for sum_24HOM, which had a p-value of 0.072, suggesting a marginal contribution. This implies that sum_24HP is not strictly independent but instead shares information with the rest of the dataset—particularly with sum_24DS, sum_24HC, and sum_24VI, which had notably large coefficients. + +The collinearity analysis using VIF (Variance Inflation Factor) showed that none of the predictors exhibit severe multicollinearity (all VIFs \< 5). However, sum_24DS (2.43) and sum_24VI (2.22) had the highest VIFs in the model, suggesting moderate redundancy between them. This supports what was previously identified in the spectral analysis, where these variables also emerged as relevant in the direction of lowest variance. + +All of this leads to the conclusion that there are linear combinations of variables in the dataset that exhibit low variance, clearly evidencing statistical redundancy among some of them. The variables sum_24HOM, sum_24HA, sum_24HP, sum_24EX, and sum_24DS consistently appear as major contributors to this redundancy, both in the eigenvector analysis and in the regression model with VIFs. While multicollinearity is not extreme, these findings support the possibility of dimension reduction or model simplification, especially if it is confirmed that some variables do not add substantial new information. + +**Prompt:** I would like to automatically generate this analysis for each variable in the dataset. We could use a for-loop to build models, calculate VIFs, and summarize redundancies. + +```{r} +# Proposal to use a for-loop to build models, calculate VIFs, and summarize redundancies +library(dplyr) +library(car) # vif() funtion + +# Varriables to exclude manually +excluded_vars <- c("sum_24TR", "sum_24SE", "sum_24SS") + +# Select only the numeric sum_24 variables, excluding the ones marked for removal +vars <- delitos_data %>% + st_drop_geometry() %>% + select(contains("24")) %>% + select(-all_of(excluded_vars)) %>% + colnames() + +# Create a list to store the results +vif_results <- list() + +# Loop over each variable as the dependent variable +for (target_var in vars) { + + # Predictor variables (all except the current dependent variable) + predictors <- setdiff(vars, target_var) + + # Create formula: target ~ predictors + formula_str <- paste(target_var, "~", paste(predictors, collapse = " + ")) + formula_obj <- as.formula(formula_str) + + # Fit model + model <- lm(formula_obj, data = delitos_data) + + # Calculate VIFs + vif_vals <- vif(model) + + # Save results + vif_results[[target_var]] <- list( + r_squared_adj = summary(model)$adj.r.squared, + vif = vif_vals + ) +} + +# Display summary of the models with the highest multicollinearity +vif_summary <- lapply(vif_results, function(res) { + max_vif <- max(res$vif) + data.frame( + Adj_R2 = res$r_squared_adj, + Max_VIF = max_vif + ) +}) %>% bind_rows(.id = "Dependent_Var") %>% + arrange(desc(Max_VIF)) + +# Show table +print(vif_summary) + +``` + +------------------------------------------------------------------------ + +## Global Variability Metric + +```{r} +#| echo: true +#| message: false +#| warning: false + +cov_matrix <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) %>% + cov() + +# Effective Variance +det(cov_matrix)^(1/ncol(cov_matrix)) + +# Log-Transformed Effective Variance +det(log(cov_matrix + 1))^(1/ncol(cov_matrix)) + +# Effective Standard Deviation +det(cov_matrix)^(1/(ncol(cov_matrix) * 2)) + +# Log-Transformed Effective Standard Deviation +det(log(cov_matrix + 1))^(1/(ncol(cov_matrix) * 2)) +``` + +**Antioquia:** + +The Global Variability Metric measures the overall variability of a dataset through the calculation of the effective variance and the effective standard deviation. In this department, an effective variance of 0.1824 and an effective standard deviation of 0.4271 were obtained. These values are relatively low, indicating reduced overall dispersion among the analyzed variables and suggesting low general variability within the proposed multidimensional space. It is important to note that the logarithmic metrics yielded a NaN result. This is likely due to zero values in the covariance matrix or the fact that applying a logarithm directly to the matrix (element-wise) can result in an invalid matrix for determinant computation. To properly use a log-transformed variability metric, it is recommended to apply the logarithmic transformation to the raw data prior to calculating the covariance matrix. + +------------------------------------------------------------------------ + +## Linear Dependency and Precision Matrix + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute precision matrix +S_inv <- solve(cov_matrix) + +# Display precision matrix (should match example values) +cat("Precision Matrix (S⁻¹):\n") +print(S_inv, digits = 2) + +# Extract correct row components of the selected crime +dependent_variable_index <- 5 + +first_row <- S_inv[dependent_variable_index, ] +print(first_row, digits = 2) + +diag_element <- S_inv[dependent_variable_index, dependent_variable_index] +print(diag_element, digits = 2) + +# Compute regression coefficients +beta_coefficients <- -first_row[-dependent_variable_index] / diag_element +print(beta_coefficients, digits = 2) + +# Compute residual variance +residual_variance <- 1 / diag_element +residual_sd <- sqrt(residual_variance) # Residual standard error + +# Print residual standard error +print(residual_sd, digits = 2) + +# Compute R^2 +r_squared <- 1 - (residual_variance / cov_matrix[dependent_variable_index, dependent_variable_index]) +print(r_squared, digits = 2) + +# Verify with lm() regression +delitos <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) + +# Fit model +model <- lm(sum_24HP ~ ., data = data.frame(delitos)) +summary(model) +``` + +**Antioquia:** + +Linear dependency within the data is first suggested by the covariance matrix and its eigenvalues, where a value close to zero implies that one or more variables can be expressed as a linear combination of others, signaling redundancy. To explore these dependencies further, the precision matrix (PM)—which is the inverse of the covariance matrix—is computed. This matrix captures conditional dependencies between variables: if an off-diagonal element is non-zero, it indicates a conditional association between two variables, given all others. + +In this analysis, the precision matrix for the Antioquia dataset was calculated, and the fifth variable (sum_24HP) was selected as the dependent variable. The corresponding row from the PM provides the coefficients of the regression of sum_24HP on the other variables, scaled by its own variance. After adjusting and standardizing these values, the derived estimate regression coefficients were: + +sum_24HOM: -0.20, sum_24LP: 0.27, sum_24VI: 1.00, sum_24DS: 3.09, sum_24HR: -1.19, sum_24HC: 1.56, sum_24HA: 1.01, sum_24HM: -0.34, sum_24EX: 0.52 + +These match closely with the coefficients obtained using the standard lm() , confirming the consistency between theoretical derivation via the precision matrix and the empirical model fit. + +Furthermore, the residual standard deviation was computed as √(1 / PM\[5,5\]) = 2.9, and the R² was calculated as: + +R\^2 = 1 - \\frac{\\text{residual variance}}{\\text{variance of sum_24HP}} = 0.51 + +This indicates that approximately 51% of the variance in sum_24HP is explained by the linear combination of the remaining variables—consistent with the output from the linear model. + +It is also important to clarify that the variable sum_24HP_jittered (used earlier in visualizations) is a noisy replica of sum_24HP, introduced to avoid technical issues such as overplotting or artifacts in quantile-based methods, where many identical values can distort boundaries. Since it carries no new information, it is excluded from the regression model to avoid multicollinearity with the dependent variable. + +In conclusion, the precision matrix provides a powerful framework for understanding conditional relationships and detecting redundancy, and in this case, it confirms the significant linear dependency between sum_24HP and variables such as sum_24DS, sum_24HC, and sum_24VI. These findings align with both the spectral analysis and regression output, reinforcing the robustness of the conclusions drawn about multivariate relationships in the dataset. + +**Prompt** : What improvements in writing could you recommend? + +------------------------------------------------------------------------ + +# Key Players and Topics + +# Non-Parametric Correlation {.unnumbered} + +Correlation measures the strength and direction of association between two variables. While Pearson's correlation requires a linear relationship and normally distributed data, \emph{Spearman's rank correlation} and \emph{Kendall's tau} are \emph{non-parametric} measures, making them ideal for analyzing data that may not be linear or normally distributed. + +## Spearman's Rank Correlation + +Spearman's correlation coefficient $\rho$ is based on \emph{ranked} data. For two variables $X$ and $Y$, we replace each observation by its rank. + +```{r} +#| echo: true +#| message: false +#| warning: false + +delitos_data <- delitos_data %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) + +delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + cor(., method = "spearman", use = "complete.obs") %>% + round(., 3) %>% + print(.) %>% + corrplot(., method = "color", title = "Spearman Correlation", mar=c(0,0,1,0)) +``` + +**Antioquia:** + +Spearman’s coefficient helps us evaluate monotonic relationships between variables, that is, to identify consistent increasing or decreasing patterns without assuming linearity or normality in the data. In our case, since the data are social and criminal in nature, this is valuable because such distributions are often skewed, and outliers can strongly influence traditional parametric methods (Hauke & Kossowski, 2011). + +In general, in the department of Antioquia, most of the coefficients have low ranges close to zero. This means there is a general absence of strong monotonic associations between the variables. However, in the specific case of sum_24HP, we found interesting patterns, such as its correlations with sum_24HM (0.269), sum_24HC (0.255), and sum_24LP (0.249). Based on this information, we can deduce that although theft from persons, theft of cars or motorcycles, and personal injuries are different types of crimes, they might share some similar dynamics, albeit with a weak correlation. + +Additionally, since these correlations are positive, the relationship is direct; that is, both variables increase together. It is important to mention that Spearman’s coefficient does not measure causality, only association, which is key when interpreting our results. + +**Prompt** : I am studying the methodology of Spearman and Pearson for data analysis, and I need two references to help me understand it better. + +**References:**  + +Gauthier, J. (2001). Detecting trends using Spearman’s rank correlation coefficient. Environmental Forensics, 2(4), 359–362. Hauke, J., & Kossowski, T. (2011). Comparison of values of Pearson’s and Spearman’s correlation coefficients on the same sets of data. Quaestiones Geographicae, 30(2), 87–93. + +# Spatial Neighborhood Matrices {.unnumbered} + +## Neighbors Based on Contiguity + +```{r} +# Create a spatial neighbors list using Queen contiguity +# (i.e., polygons are considered neighbors if they share any point: edge or vertex) +nb <- spdep::poly2nb(delitos_data, queen = TRUE) +head(nb) + +# Replace invalid neighbor entries (i.e., [1] 0) with empty integer vectors +# This ensures compatibility with functions that expect valid neighbor lists only +nb_0 <- lapply(nb, function(x) if(length(x)==1 && x==0) integer(0) else x) + +# Polygons with neighbors +table(sapply(nb_0, length)) + +# Neighbors of Order k Based on Contiguity +# Neighbors of second order +nblags <- spdep::nblag(neighbours = nb, maxlag = 2) + +# Combine neighbors of all orders up to the specified lag (in this case, up to order 2) +# This creates a cumulative neighbor list including first- and second-order neighbors +nblagsc <- spdep::nblag_cumul(nblags) +table(sapply(nblagsc, length)) +``` + +**Antioquia:** + +In Antioquia dataset, a spatial neighbors matrix was constructed based on Queen contiguity, which considers polygons as neighbors if they share at least one vertex or edge. The poly2nb() function was used for this purpose, generating warnings about the presence of polygons without neighbors and fragmentation into multiple subgraphs (44,608), possibly due to geometric precision issues. Some polygons yielded a result of \[1\] 0, indicating a lack of neighbors; these were replaced with empty vectors to ensure compatibility with subsequent functions. The frequency histogram showed that 42,277 polygons had no neighbors under this definition, while the remainder had between 1 and up to 20 neighbors, reflecting high spatial heterogeneity. Subsequently, second-order neighbors (i.e., neighbors of neighbors) were computed using the nblag() function, and first- and second-order neighbors were consolidated. The results showed that most polygons now had between 1 and 4 cumulative neighbors, although some still had up to 26 connections, further highlighting the dispersed and complex spatial structure of our dataset. This procedure is essential for characterizing broader spatial relationships and modeling spatial dependence effects. + +Although the data used present limitations for contiguity-based spatial analysis—as evidenced by the high proportion of polygons without neighbors and a structure fragmented into thousands of subgraphs—this does not imply that they are entirely unsuitable. These conditions suggest issues with the topological definition of geometries or with the scale of spatial interaction. Therefore, it is necessary to verify the validity of the geometries, adjust neighbor detection parameters (such as the snap threshold), or employ alternative methods such as distance-based neighborhood structures, which provide a more flexible and suitable representation of spatial relationships in dispersed contexts or where geometries are imprecise. + +------------------------------------------------------------------------ + +## Neighbors Based on k Nearest Neighbors + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute centroids of the polygons +coo <- st_centroid(delitos_data) + +# Create a neighbor list where each polygon (based on its centroid `coo`) is connected +# to its 3 nearest neighbors using k-nearest neighbors (k = 3) +nb <- knn2nb(knearneigh(coo, k = 3)) # k number nearest neighbors + +# Polygons with neighbors +table(sapply(nb, length)) + +# Subset data to the first 10 polygons +delitos_data_10 <- delitos_data[1:1000, ] + +# Recompute neighbor list for these 10 polygons to avoid index mismatches +nb_10 <- knn2nb(knearneigh(st_centroid(delitos_data_10), k = 3)) + +# Compute centroids for the 10 polygons +coords_10 <- st_coordinates(st_centroid(delitos_data_10)) + +# Plot the first 10 polygons and overlay neighbor connections in red +plot(st_geometry(delitos_data_10), border = "lightgray", main = "First Polygons with 3 Nearest Neighbors") +plot.nb(nb_10, coords_10, add = TRUE, col = "red", lwd = 2) +``` + +**Antioquia:** + +In the case of Antioquia, there is a total number of 49,610 polygons, and we performed the KNN calculation with 3 neighbors. In the original code, I made a modification to the line \# Subset data to the first 10 polygons delitos_data_10 \<- delitos_data\[1:1000, \]. I adjusted it to 1000 for better observations. + +We can clearly observe distant, non-adjacent areas; however, thanks to KNN, connections are still established. A denser area is evident in the lower part of the graph, allowing us to infer that it may be an urban center or a high-density polygon area, as we see shorter and more intertwined red lines. + +------------------------------------------------------------------------ + +## Neighbors Based on Distance + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Create a neighbor list using distance-based contiguity: +# Polygons are considered neighbors if their centroids are within 0.4 units (e.g., degrees) apart +nb <- dnearneigh(x = st_centroid(delitos_data), d1 = 0, d2 = 0.4) + +# Polygons with neighbors +hist(sapply(nb, length)) + +# Subset data to the first 10 polygons +delitos_data_10 <- delitos_data[1:1000, ] + +# Recompute neighbor list for these 10 polygons to avoid index mismatches +nb_10 <- dnearneigh(x = st_centroid(delitos_data_10), d1 = 0, d2 = 0.4) + +# Compute centroids for the 10 polygons +coords_10 <- st_coordinates(st_centroid(delitos_data_10)) + +# Plot the first 10 polygons and overlay neighbor connections in red +plot(st_geometry(delitos_data_10), border = "lightgray", main = "First Polygons with 3 Nearest Neighbors") +plot.nb(nb_10, coords_10, add = TRUE, col = "red", lwd = 2) +``` + +The histogram represents the distribution of the number of neighbors per polygon. An asymmetric distribution with positive skewness can be observed: most polygons have around 60 neighbors, while very few exceed 160. This indicates high variability in the spatial density of the polygons—in other words, there is a heterogeneous spatial configuration in which densely clustered areas coexist with more isolated zones. This pattern is reflected in the map, where denser groupings are concentrated in specific regions of the territory. When comparing this with the political map of Antioquia, we can confirm that these areas correspond to urban locations, such as the city of Medellín. + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute k-nearest neighbors: for each polygon centroid, find its 1 nearest neighbor (k = 1) +nb1 <- knn2nb(knearneigh(coo, k = 1)) + +# Calculate the Euclidean distances between each polygon and its nearest neighbor +dist1 <- nbdists(nb1, coo) + +# Summarize all distances to understand the minimum, maximum, and quartiles +summary(unlist(dist1)) + +# Create a distance-based neighbor list: polygons whose centroids are within [0, 1.2] units are considered neighbors +nb <- dnearneigh(x = st_centroid(delitos_data), d1 = 0, d2 = 1.2) + +# Polygons with neighbors +hist(sapply(nb, length)) +``` + +Thanks to the determination of an appropriate distance threshold for defining neighborhood relationships between polygons, it is possible to more accurately address spatial contexts characterized by heterogeneous structures. In this analysis, the Euclidean distance between the centroids of each polygon and its nearest neighbor (k = 1) was calculated as the basis for establishing an objective proximity criterion. + +Euclidean distance, a fundamental measure in multivariate analysis, quantifies the geometric separation between two points in a *k*-dimensional space. It is calculated as the square root of the sum of the squared differences between each pair of corresponding coordinates. Formally, for two points i and j, it is defined as: + +$$ +[Σkh = 1 (xih ‒ xjh)2] ½ = E, Euclidean +$$ + +This metric generates a symmetric distance matrix that represents the degree of dissimilarity between entities based on standardized attributes. For more than six decades, it has been a key tool in classification methods and phenetic analysis, due to its mathematical robustness and its geometric interpretation as a sphere of radius *r*, within which the nearest neighbors are naturally located (Temple, 2023). + +In the present case, the results show a mean distance of 0.059 units and a median of 0.048, suggesting a high degree of spatial proximity across much of the analyzed territory. The minimum value (0.0014) indicates cases where polygons are nearly adjacent, while the maximum value (6.8689) points to the existence of much more isolated geographic areas, likely rural or peripheral. This variability is further supported by an interquartile range of 0.034, indicating a non-homogeneous spatial configuration. From a geometric perspective, these results allow each centroid to be visualized as the center of a sphere of influence, where the minimum radius needed to reach the first neighbor varies considerably depending on the territorial location (Temple, 2023). + +Thus, the distance matrix generated under this criterion not only quantifies the geometric dissimilarity between spatial units but also provides a fundamental empirical input for establishing more precise neighborhood thresholds (Temple, 2023). This step is essential to ensure connectivity in subsequent spatial models, such as the construction of spatial weights matrices or the estimation of spatial regression models, in which each unit must maintain a valid structural relationship with its immediate surroundings. + +**References:** + +Temple, J. T. (2023). *Characteristics of distance matrices based on Euclidean, Manhattan and Hausdorff coefficients*. *Journal of Classification, 40*, 214–232. + +## Neighborhood Matrices + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Spatial weights matrix using Queen contiguity (binary weights) +# 'queen = TRUE' considers shared edges OR vertices as neighbors +nb <- poly2nb(delitos_data, queen = TRUE) + +# Convert the neighbor list to a spatial weights list object +# 'style = "W"' row-standardizes the weights (sums to 1) +# 'zero.policy = TRUE' avoids errors when some polygons have no neighbors +nbw <- spdep::nb2listw(nb, style = "W", zero.policy = TRUE) + +# Display the first 10 rows of spatial weights (for the first 10 polygons) +nbw$weights[1:10] + +# Spatial weights matrix based on inverse distance values +# Compute centroids of polygons +coo <- st_centroid(delitos_data) + +# Use Queen contiguity again to define neighbors +nb <- poly2nb(delitos_data, queen = TRUE) + +# Compute distances between neighbors based on their centroids +dists <- nbdists(nb, coo) + +# Create inverse distance weights (1/distance) for each pair of neighbors +ids <- lapply(dists, function(x){1/x}) + +# Create a listw object using binary style ("B" = no standardization) +nbw <- nb2listw(nb, glist = ids, style = "B", zero.policy = TRUE) + +# Display the first 10 inverse-distance-based weights +nbw$weights[1:10] +``` + +The spatial weights matrix represents the relationship between spatial units—polygons, in our case—indicating whether they are neighbors and, in some cases, whether they are strongly connected based on distance. In the code, once the spatial neighbors are defined using Queen contiguity (two polygons are considered neighbors if they share at least an edge or a vertex), the neighbor list is converted into a listw object with row-standardized binary weights, such that the weights sum to 1 per row. Additionally, the parameter zero.policy = TRUE is included to avoid errors in cases where a polygon has no neighbors. The resulting output is: + +``` +[[1]] NULL [[2]] [1] 1 [[3]] [1] 1 [[4]] [1] 1 [[5]] NULL [[6]] NULL [[7]] NULL [[8]] [1] 1 [[9]] NULL [[10]] NULL +``` + +``` +[[1]] NULL [[2]] [1] 0.9643715 [[3]] [1] 0.3376093 [[4]] [1] 0.3157392 [[5]] NULL [[6]] NULL [[7]] NULL [[8]] [1] 0.03847188 [[9]] NULL [[10]] NULL +``` + +This means that the polygons where NULL appears (\[1\], \[5\], \[6\], \[7\], \[9\], \[10\]) do not have contiguous neighbors, while the others have only one neighbor and thus a weight of 1 (1 neighbor = 1/1). Subsequently, an adjustment is made to move from binary weights to continuous weights based on the inverse distance between centroids. This results in weights proportional to 1/distance, so the closer the neighbor, the greater the weight. For example, polygon 2 has a very close neighbor with a weight of 0.9643715, while polygon 8 has a more distant neighbor with a weight of 0.03847188. These results, along with the earlier observations, support the hypothesis that many polygons lacking neighbors under Queen contiguity are likely to be isolated or very small. This can affect models that require a dense network of spatial connections. + +This pattern has direct implications for the calculation of Moran’s I index, which assesses spatial autocorrelation by considering both the variable of interest and the defined spatial structure. When numerous polygons have no neighbors or only minimal connections, as is the case here, the index’s ability to detect meaningful spatial patterns weakens, since the weights matrix becomes sparse and lacks continuity (Chen, 2023). As a result, the interpretation of Moran’s I may be biased or limited, potentially underestimating the true spatial autocorrelation in poorly connected areas. Therefore, it is essential to critically evaluate the chosen neighborhood structure and, if necessary, adopt alternative approaches such as distance-based or k-nearest neighbors, which provide a more uniform and suitable connection network for the phenomenon under study. + +The study by Ejigu and Wencheko (2020) proposes a methodological innovation by introducing covariate-dependent spatial weight matrices, which are capable of simultaneously incorporating both geographic distance and similarity in relevant variables (e.g., altitude, temperature, distance to water bodies, among others). The proposed matrix is defined through an exponential decay function: + +$$ +\Large\ wij=exp(−(αue+(1−α)us)) +$$ + +Where: + +- wij is the weight assigned between spatial units i and j. +- ue=∣ei−ej∣ is the absolute difference in the environmental covariate. +- us=∥si−sj∥ is the Euclidean distance between locations. +- α∈\[0,1\] regulates the relative influence of the covariate compared to geographic distance. + +When α=0, the matrix reduces to a traditional distance-based function; when α=1, it is constructed solely based on covariate similarity. This flexible formulation allows the spatial dependency structure to be adapted to the actual nature of the phenomenon, especially in the presence of nonstationarity. + +The authors demonstrate, using data on heavy metal concentrations in soils along the Meuse River, that this approach significantly improves the fit of spatial autoregressive (SAR) models by reducing the root mean square error (RMSE), the AIC, and increasing the adjusted R². Furthermore, they propose a new measure of environmental spatiotemporal autocorrelation (statistic B) as an alternative to Moran’s I, which incorporates the extended weight matrix. This approach represents a major advance in the spatial modeling of environmentally mediated phenomena, with direct applications in epidemiology, ecology, public health, and social geography. + +**References:** + +Chen, Y. (2023). Spatial autocorrelation equation based on Moran’s index. Scientific Reports, 13, 19296. + +Ejigu, B.A., & Wencheko, E. (2020). Introducing covariate dependent weighting matrices in fitting autoregressive models and measuring spatio-environmental autocorrelation. *Spatial Statistics, 38*, 100454.\ + + +# Spatial autocorrelation {.unnumbered} + +## Global Moran’s + +```{r} +# Compute centroids of the polygons +coo <- st_centroid(delitos_data) + +# Create a neighbor list where each polygon (based on its centroid `coo`) is connected +# to its 3 nearest neighbors using k-nearest neighbors (k = 3) +nb <- knn2nb(knearneigh(coo, k = 3)) # k number nearest neighbors + +# Global Moran's I +# Convert the neighbor list to a listw object +lw <- nb2listw(nb, style = "W") # Use nb2listw + +# Now you can use 'lw' in moran.test +gmoran <- moran.test(delitos_data$sum_24HP, lw, alternative = "greater") + +gmoran + +gmoran[["estimate"]][["Moran I statistic"]] # Moran's I + +gmoran[["statistic"]] # z-score + +gmoran[["p.value"]] # p-value +``` + +The Global Moran's I index measures overall spatial autocorrelation, that is, whether the values of a variable tend to cluster spatially (similar values located near each other) or are randomly distributed. The index is positive if similar values cluster in space, negative if dissimilar values cluster, and close to zero if the spatial pattern is random (Chen, 2023).  + +The Moran's I index calculated for the variable sum_24HP yielded a value of 0.099, indicating positive spatial autocorrelation. This result was statistically significant (z = 35.92, p \< 2.2×10⁻¹⁶), allowing the rejection of the null hypothesis of spatial randomness. Consequently, it is concluded that similar values of sum_24HP tend to cluster spatially, reflecting a pattern of territorial concentration. + +**References:** + +Chen, Y. (2023). Spatial autocorrelation equation based on Moran’s index. Scientific Reports, 13, 19296. + +**Moran's I Monte Carlo Test** + +```{r} +#| echo: true +#| message: false +#| warning: false +gmoranMC <- moran.mc(delitos_data$sum_24HP, lw, nsim = 99) +gmoranMC + +hist(gmoranMC$res) +abline(v = gmoranMC$statistic, col = "red") + +``` + +A Monte Carlo simulation (n = 99 permutations) was applied to assess the statistical significance of the global spatial autocorrelation index, Moran’s I, for the variable sum_24HP, using a spatial weights matrix (lw). This approach avoids relying on the theoretical distribution of the statistic, instead generating an empirical distribution through random permutations of the spatial values. + +The observed value of Moran’s I was 0.099, suggesting positive spatial autocorrelation: spatial units with similar values of sum_24HP tend to be located near one another. + +The resulting p-value was 0.01, indicating that under the null hypothesis of spatial randomness, only 1 out of the 100 permutations (including the observed value) produced a Moran’s I equal to or greater than the observed statistic. This allows us to reject the null hypothesis at the 1% significance level, concluding that the observed spatial pattern is not random, but rather exhibits statistically significant spatial clustering. + +The presented histogram illustrates the empirical distribution of Moran's I values generated from 99 random permutations, thereby representing the expected behavior of the statistic under the null hypothesis of spatial randomness. In this context, the red vertical line marks the observed Moran’s I value (0.099045), which is located at the far right end of the distribution. This position indicates that the observed value is unusually high compared to the simulated values, visually reinforcing its exceptionality and supporting the statistical evidence against the null hypothesis, suggesting the presence of significant spatial autocorrelation in the analyzed data. + +**Reference:** + +Gimond, M. (s.f.). Spatial Autocorrelation. Intro to GIS and Spatial Analysis. https://mgimond.github.io/Spatial/spatial-autocorrelation.html + +```{r} +#| echo: true +#| message: false +#| warning: false + +moran.plot(delitos_data$sum_24HP, lw) +``` + +This graph reveals local patterns of spatial autocorrelation, with a concentration of points in the lower-left and, to a lesser extent, the upper-right quadrant. This clustering supports the presence of positive spatial dependence. The outliers at the extremes, such as units 7017, 30771, and 16235, suggest the existence of areas that behave differently from their surrounding context and may be relevant for further investigation due to their anomalous nature. The fitted regression line reinforces the positive slope associated with autocorrelation, visually complementing and validating the results obtained from the global Moran's I index. + +## Local Moran’s I + +```{r} +#| echo: true +#| message: false +#| warning: false + +lmoran <- localmoran(delitos_data$sum_24HP, lw, alternative = "greater") +head(lmoran) +``` + +The Local Moran's I is a spatial autocorrelation statistic that allows the identification of spatial clusters and localized outliers. It differs from the Global Moran's I in that the local version evaluates each spatial unit individually in relation to its neighbors, revealing patterns of clustering (high or low value clusters) or dissimilarity (spatial outliers). In geospatial studies, it is essential for detecting areas where values are significantly similar to or different from those of their neighbors, enabling more informed decision-making (Anselin, 1995). + +For our data, when calculating the Local Moran’s I for the variable of interest *sum_24HP*, the results for the first spatial units show statistic values close to zero (0.0095), as well as low Z-scores and high p-values (greater than 0.43), which reflects a lack of significant spatial autocorrelation in these cases. A partial exception is observed in the second unit, with a negative *Ii* value (−0.1814), which could suggest dissimilarity with its surroundings, although it is not statistically significant (p = 0.5670). Based on these results, we cannot clearly identify relevant spatial clusters or significant outliers in the units analyzed. + +**Reference:** + +Anselin, L. (1995). Local Indicators of Spatial Association—LISA. Geographical Analysis, 27(2), 93–115. + +```{r} +#| eval: false +#| message: false +#| warning: false + +tmap_mode("view") + +delitos_data$lmI <- lmoran[, "Ii"] # local Moran's I +delitos_data$lmZ <- lmoran[, "Z.Ii"] # z-scores +# p-values corresponding to alternative greater +delitos_data$lmp <- lmoran[, "Pr(z > E(Ii))"] + +p1 <- tm_shape(delitos_data) + + tm_polygons(col = "sum_24HP", title = "vble", style = "quantile") + + tm_layout(legend.outside = TRUE) + +p2 <- tm_shape(delitos_data) + + tm_polygons(col = "lmI", title = "Local Moran's I", + style = "quantile") + + tm_layout(legend.outside = TRUE) + +p3 <- tm_shape(delitos_data) + + tm_polygons(col = "lmZ", title = "Z-score", + breaks = c(-Inf, 1.65, Inf)) + + tm_layout(legend.outside = TRUE) + +p4 <- tm_shape(delitos_data) + + tm_polygons(col = "lmp", title = "p-value", + breaks = c(-Inf, 0.05, Inf)) + + tm_layout(legend.outside = TRUE) + +tmap_arrange(p1, p2, p3, p4) +``` + +This code generates this error: + +Error en paste(buffer\[seq_len(position)\], collapse = ""):  + +  result would exceed 2\^31-1 bytes + +```{r} +# Activate static mode (non-interactive) +tmap_mode("plot") + +# Optional: simplify geometries if there are too many polygons +library(sf) +delitos_data <- st_simplify(delitos_data, dTolerance = 100) + +# Assign Local Moran's I results +delitos_data$lmI <- lmoran[, "Ii"] # Local Moran's I +delitos_data$lmZ <- lmoran[, "Z.Ii"] # Z-scores +delitos_data$lmp <- lmoran[, "Pr(z > E(Ii))"] # p-values (one-sided) + +# Create individual maps +p1 <- tm_shape(delitos_data) + + tm_polygons(col = "sum_24HP", title = "Variable", style = "quantile") + + tm_layout(legend.outside = TRUE) + +p2 <- tm_shape(delitos_data) + + tm_polygons(col = "lmI", title = "Local Moran's I", style = "quantile") + + tm_layout(legend.outside = TRUE) + +p3 <- tm_shape(delitos_data) + + tm_polygons(col = "lmZ", title = "Z-score", breaks = c(-Inf, 1.65, Inf)) + + tm_layout(legend.outside = TRUE) + +p4 <- tm_shape(delitos_data) + + tm_polygons(col = "lmp", title = "p-value", breaks = c(-Inf, 0.05, Inf)) + + tm_layout(legend.outside = TRUE) + +# Display maps individually to avoid memory overflow +p1 +p2 +p3 +p4 +``` + +```{r} +tmap_arrange(p1, p2, p3, p4) +``` + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Plot Local Moran's I using tmap +tm_shape(delitos_data) + + tm_polygons( + col = "lmZ", + title = "Local Moran's I", + style = "fixed", + breaks = c(-Inf, -1.96, 1.96, Inf), + labels = c("Negative SAC", "No SAC", "Positive SAC"), + palette = c("blue", "white", "red") + ) + + tm_layout(legend.outside = TRUE) +``` + +The cartographic representation of the Local Moran’s I through LISA cluster maps and thematic visualizations highlights the absence of significant spatial autocorrelation for the variable *sum_24HP*. Most of the territory exhibits Local Moran’s I values close to zero and p-values above 0.05, which is reflected in the predominance of areas classified as “No SAC” (no spatial autocorrelation). Although a few units show slightly positive or negative I values, they do not reach statistical significance based on their Z-scores and p-values. As a result, no meaningful clusters of high or low values, nor statistically significant spatial outliers, are identified in the study area. This suggests a spatial distribution that is more random or weakly structured. + +**Prompt**:  Why is it necessary or useful to create a cartographic representation of the Local Moran's I using LISA-type maps? + +## Clusters + +```{r} +#| echo: true +#| message: false +#| warning: false + +mp <- moran.plot(as.vector(scale(delitos_data$sum_24HP)), lw) +delitos_data$quadrant <- NA +# high-high +delitos_data[(mp$x >= 0 & mp$wx >= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 1 +# low-low +delitos_data[(mp$x <= 0 & mp$wx <= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 2 +# high-low +delitos_data[(mp$x >= 0 & mp$wx <= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 3 +# low-high +delitos_data[(mp$x <= 0 & mp$wx >= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 4 +# non-significant +delitos_data[(delitos_data$lmp > 0.05), "quadrant"] <- 5 +``` + +```{r} +#| echo: true +#| message: false +#| warning: false + +tm_shape(delitos_data) + tm_fill(col = "quadrant", title = "", +breaks = c(1, 2, 3, 4, 5, 6), +palette = c("red", "blue", "lightpink", "skyblue2", "white"), +labels = c("High-High", "Low-Low", "High-Low", + "Low-High", "Non-significant")) + +tm_legend(text.size = 1) + tm_borders(alpha = 0.5) + +tm_layout(frame = FALSE, title = "Clusters") + +tm_layout(legend.outside = TRUE) +``` + +The Moran scatterplot and the LISA cluster map allow the identification of spatial clusters and outliers based on the local spatial autocorrelation structure. In our dataset, spatial units were categorized into five quadrants depending on the direction of the standardized value and the average value of their neighbors, considering only statistically significant relationships (*p* ≤ 0.05). The map reveals isolated areas of High-High clusters (in red), which indicate zones of high values surrounded by other high-value areas, typically urban hotspots. Low-Low clusters (in blue) are associated with rural or peripheral regions with low activity. The High-Low and Low-High categories (pink and light blue, respectively) represent spatial outliers, where a unit significantly differs from its surroundings. However, most of the territory is classified as non-significant, suggesting the absence of strong local spatial patterns in most of the area. + +**Reference:** + +Anselin, L. (1995). Local Indicators of Spatial Association—LISA. Geographical Analysis, 27(2), 93–115. + +# Correspondance Analysis {.unnumbered} + +```{r} +#| eval: false +#| message: false +#| warning: false +#| include: false + +# Paso 1: Crear la matriz de frecuencias de delitos en polígonos +Freq <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) + +rownames(Freq) <- c("Zona Norte", "Zona Sur", "Zona Este", "Zona Oeste") +colnames(Freq) <- c("Homicidio", "Robo", "Hurto", "Extorsión") + +print("Matriz de frecuencias:") +print(F) + +# Paso 2: Calcular la tabla de frecuencias relativas +total_F <- sum(F) +F_rel <- F / total_F + +print("Matriz de frecuencias relativas:") +print(F_rel) + +# Paso 3: Cálculo de los totales por fila y columna +fi <- rowSums(F_rel) # Suma por fila +fj <- colSums(F_rel) # Suma por columna + +D_f <- diag(fi) # Matriz diagonal de totales de fila +D_c <- diag(fj) # Matriz diagonal de totales de columna + +# Paso 4: Construcción de la matriz de correspondencias Z +D_f_sqrt_inv <- diag(1 / sqrt(fi)) # Raíz cuadrada inversa de los totales de fila +D_c_sqrt_inv <- diag(1 / sqrt(fj)) # Raíz cuadrada inversa de los totales de columna + +Z <- D_f_sqrt_inv %*% (F_rel - fi %o% fj) %*% D_c_sqrt_inv + +print("Matriz de correspondencias Z:") +print(Z) + +# Paso 5: Descomposición en Valores Singulares (SVD) +svd_result <- svd(Z) # Descomposición de Z + +U <- svd_result$u # Vectores propios de ZZ' (Filas) +D <- diag(svd_result$d) # Matriz diagonal de valores singulares +V <- svd_result$v # Vectores propios de Z'Z (Columnas) + +print("Matriz U (Vectores propios de ZZ'):") +print(U) + +print("Matriz D (Valores singulares):") +print(D) + +print("Matriz V (Vectores propios de Z'Z):") +print(V) + +# Paso 6: Proyección de filas y columnas en el espacio reducido +dim_reducida <- 2 +C_f <- D_f_sqrt_inv %*% U[, 1:dim_reducida] %*% D[1:dim_reducida, 1:dim_reducida] # Coordenadas filas +C_c <- D_c_sqrt_inv %*% V[, 1:dim_reducida] %*% D[1:dim_reducida, 1:dim_reducida] # Coordenadas columnas + +print("Coordenadas filas en el espacio reducido:") +print(C_f) + +print("Coordenadas columnas en el espacio reducido:") +print(C_c) + +# Librerías +library(MASS) # Para manipulación matricial +library(ca) # Para el análisis de correspondencias + +# Matriz de frecuencias (ejemplo: delitos en polígonos) +F <- matrix(c( + 12, 5, 8, 2, + 7, 3, 10, 1, + 4, 2, 6, 1, + 15, 8, 14, 5 +), nrow = 4, byrow = TRUE) + +rownames(F) <- c("Zona Norte", "Zona Sur", "Zona Este", "Zona Oeste") +colnames(F) <- c("Homicidio", "Robo", "Hurto", "Extorsión") + +# Total de la tabla +n_total <- sum(F) + +# Matriz de frecuencias relativas +F_rel <- F / n_total + +# Totales por fila y columna +f_i <- rowSums(F_rel) +f_j <- colSums(F_rel) + +# Matriz esperada bajo independencia +F_exp <- outer(f_i, f_j) + +# Cálculo de ji-cuadrado +chi2 <- sum((F_rel - F_exp)^2 / F_exp) * n_total +print(paste("Estadístico Ji-cuadrado:", round(chi2, 4))) + +# Distancia ji-cuadrado entre filas +D_c_inv <- diag(1 / f_j) # Inversa de los totales de columnas +dist_ji <- as.matrix((F_rel - F_exp) %*% D_c_inv %*% t(F_rel - F_exp)) +print("Matriz de distancias ji-cuadrado entre filas:") +print(dist_ji) +``` + +When + +Error en \`.rowNamesDF\<-\`(x, value = value):  + +  longitud de 'row.names' inválida + +```{r} +#| eval: false +#| message: false +#| warning: false +#| include: false + +# Paso 1: Crear la matriz de frecuencias de delitos en polígonos +Freq <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) + +rownames(Freq) <- c("Zona Norte", "Zona Sur", "Zona Este", "Zona Oeste") +colnames(Freq) <- c("Homicidio", "Robo", "Hurto", "Extorsión") +``` + +In the second part we had: \[1\] "Matriz de frecuencias relativas:" \[1\] NaN and for the third part we had again a error. + +# Imputing and Linking Data + +```{r} +#| include: false +source('setup.R') + +# Load the dataset +delitos_data <- st_read("data/spatial/crime_spatial_course.gpkg") +delitos_data <- delitos_data[delitos_data$dpto_ccdgo == '05', ] +``` + +## Download open spatial data + +```{r} +#| echo: true +#| message: true +#| warning: true + +d <- worldclim_country(country = "Colombia", var = "tmin", + path = tempdir()) +terra::plot(mean(d), plg = list(title = "Min. temperature (C)")) +``` + +This map represents monthly minimum temperature data across Colombian territory using the `worldclim_country()` function from the geodata package. It generates a raster with 12 layers (one for each month) of climatic data, which are then averaged to obtain the annual minimum temperature. On the map, lower temperatures can be observed in the Andes mountain range, while higher temperatures appear in the coastal and lowland regions. Consequently, based on these data and information from IDEAM, this representation is consistent with the typical climatic patterns of Colombia. + +**References**: + +Fick, S. E., & Hijmans, R. J. (2017). WorldClim 2: new 1-km spatial resolution climate surfaces for global land areas. International Journal of Climatology, 37(12), 4302–4315. https://doi.org/10.1002/joc.5086 + +IDEAM. (2015). Estudio Nacional del Agua 2014. Bogotá: Instituto de Hidrología, Meteorología y Estudios Ambientales. + +## Cropping, masking, and aggregating raster data + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Cropping +sextent <- terra::ext(delitos_data) +d <- terra::crop(d, sextent) +plot(d) +``` + +The climatic raster is cropped to the geographic area corresponding to the vector data using terra::ext as our boundary. The terra::crop function allows us to adjust the spatial domain to a raster of interest, avoiding computations over regions that are not part of the study area (Pebesma, 2018). In our results, we observe the spatially reduced image representing our area of interest (Antioquia), while preserving the resolution and structure of the climatic data. + +**Reference**: + +Pebesma, E. (2018). Simple Features for R: Standardized Support for Spatial Vector Data. The R Journal, 10(1), 439–446. https://journal.r-project.org/archive/2018/RJ-2018-009/RJ-2018-009.pdf + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Masking +d <- terra::mask(d, vect(delitos_data)) +plot(d) +``` + +Once the raster has been cropped, a masking process is applied using the geometry of our dataset through the terra::mask function. This process removes raster values outside the polygons of interest, allowing for analyses such as average value extraction or geospatial regression. According to Higmans (2023), spatial masking is a fundamental operation in geoprocessing workflows because it enables precise integration of vector and raster data, especially when working with multiple resolutions or formats. In our code, the resulting image displays minimum temperature exclusively within the area covered by the polygons, revealing the thermal structure in a more focused and accurate manner. + +**Reference:** + +Hijmans, R. J. (2023). Terra: Spatial Data Analysis. R package version 1.7-46. https://cran.r-project.org/web/packages/terra/terra.pdf + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Aggregating +d <- terra::aggregate(d, fact = 20, fun = "mean", na.rm = TRUE) +plot(d) +``` + +Finally, a spatial aggregation of the raster was performed using the terra::aggregate() function, applying a resolution reduction factor of 20. This operation merges blocks of 20×20 cells into a single cell by computing the mean value of the original ones. The purpose of this step is to decrease the spatial resolution of the raster, which can be useful for reducing computational load or adapting the data to broader scales of analysis (Bivand et al., 2013). The result is a smoothed layer that preserves the general structure of the thermal gradients, although with a loss of fine detail. + +**Referencia**:\ +Bivand, R. S., Pebesma, E., & Gómez-Rubio, V. (2013). Applied Spatial Data Analysis with R (2nd ed.). Springer. https://doi.org/10.1007/978-1-4614-7618-4 + +## Extracting raster values at points + +```{r} +#| echo: true +#| message: true +#| warning: true + +points <- st_coordinates(delitos_data) + +# Convert SpatRaster to data frame for ggplot2 +raster_df <- as.data.frame(d, xy = TRUE) +colnames(raster_df)[3] <- "tmin" # Rename the temperature column + +# Get the centroid coordinates +points_df <- st_coordinates(st_geometry(delitos_data)) +points_df <- as.data.frame(points_df) +colnames(points_df) <- c("x", "y") + +# Create the plots with ggplot2 +ggplot() + + geom_raster(data = raster_df, aes(x = x, y = y, fill = tmin)) + + scale_fill_viridis_c(name = "Min Temp (C)") + + geom_sf(data = delitos_data, color = "black", fill = "transparent") + # Plot the polygon borders + geom_point(data = points_df, aes(x = x, y = y), color = "red", size = 0.01, shape = 1, alpha = 0.5) + # Plot the centroids as transparent circles + labs(title = "Aggregated Minimum Temperature with Polygons and Centroids") + + theme_minimal() +``` + +In the combined visualization, spatial components were integrated, including aggregated minimum temperature values, the outlines of spatial units, and the centroids of each polygon represented by points. To understand each of the elements used, parts of the code were analyzed. The use of ggplot2 is justified by its highly flexible graphic system that allows overlaying multiple spatial layers through geom_raster() (for the raster), geom_sf() (for the vector polygons), and geom_point() (for the centroids calculated from the geometries). This representation enables a visual assessment of how minimum temperature values are distributed in relation to the spatial arrangement of the analysis units. The prior aggregation of the raster facilitates interpretation of the thermal pattern at a broader scale, while the precise placement of the centroids allows for their later use in value extraction or spatial attribute assignment (Hijmans, 2023; Bivand et al., 2013). + +In the plot, we can observe that the areas with lower temperatures (bluish and purple tones) are concentrated in the central region, which coincides with higher elevation zones (mountain ranges). The red points (centroids) are evenly distributed over the polygons, visually confirming the correct alignment between the spatial unit geometries and the climatic layer, thus contributing to advanced spatial analysis (Pebesma, 2018). + +**References**: + +Hijmans, R. J. (2023). terra: Spatial Data Analysis. R package version 1.7-46. + +[\ +](https://cran.r-project.org/web/packages/terra/terra.pdf)Pebesma, E. (2018). Simple Features for R: Standardized Support for Spatial Vector Data. The R Journal, 10(1), 439–446. + +Bivand, R. S., Pebesma, E., & Gómez-Rubio, V. (2013). Applied Spatial Data Analysis with R (2nd ed.). Springer. + +**Prompt:** Explain which elements, libraries, and functions I need to consider in order to merge the information from my base code with the political map of a department. + +```{r} +# Step 1: Load Colombia's political boundaries at level 1 (departments) +colombia_departments <- gadm("COL", level = 1, path = tempdir()) + +# Step 2: Filter the department of Antioquia +antioquia <- colombia_departments[colombia_departments$NAME_1 == "Antioquia", ] + +# Step 3: Crop and mask the processed raster (d) to the Antioquia boundaries +d_antioquia <- crop(d, antioquia) +d_antioquia <- mask(d_antioquia, antioquia) + +# Step 4: Download municipal boundaries (level = 2) for detailed analysis in Antioquia +colombia_municipalities <- gadm("COL", level = 2, path = tempdir()) + +# Step 5: Filter the municipalities within Antioquia +antioquia_municipalities <- colombia_municipalities[colombia_municipalities$NAME_1 == "Antioquia", ] + +# Step 6: Extract summary statistics by municipality (mean of aggregated minimum temperature) +summary_temp <- extract(d_antioquia, antioquia_municipalities, fun = mean, na.rm = TRUE) + +# Step 7: Join the summary data to the municipality shapefile +antioquia_municipalities$temp_min_prom <- summary_temp[, 2] # Second column holds the mean value + +# Step 8: Create a thematic map with tmap +tmap_mode("plot") +tm_shape(antioquia_municipalities) + + tm_polygons("temp_min_prom", title = "Min. Temp (°C)", palette = "Blues", style = "quantile") + + tm_borders(lwd = 0.5, col = "gray30") + + tm_layout(main.title = "Average Minimum Temperature\nAggregated by Municipality - Antioquia", + legend.outside = TRUE) +``` + +**Prompt**: Help me merge the code to have all the features in one place. + +```{r} +# Step 1: Load Colombia's political boundaries at level 1 (departments) +colombia_departments <- gadm("COL", level = 1, path = tempdir()) +antioquia <- colombia_departments[colombia_departments$NAME_1 == "Antioquia", ] + +# Step 2: Crop and mask the already processed raster (d) to Antioquia +d_antioquia <- crop(d, antioquia) +d_antioquia <- mask(d_antioquia, antioquia) + +# Step 3: Download and filter municipalities of Antioquia (level 2) +colombia_municipalities <- gadm("COL", level = 2, path = tempdir()) +antioquia_municipalities <- colombia_municipalities[colombia_municipalities$NAME_1 == "Antioquia", ] + +# Step 4: Extract summary per municipality (mean of minimum temperature) +summary_temp <- extract(d_antioquia, antioquia_municipalities, fun = mean, na.rm = TRUE) +antioquia_municipalities$temp_min_prom <- summary_temp[, 2] + +# Step 5: Prepare raster data for ggplot2 +raster_df <- as.data.frame(d_antioquia, xy = TRUE) +colnames(raster_df)[3] <- "tmin" # Rename temperature column + +# Step 6: Prepare centroids coordinates and geometry from delitos_data +points_df <- as.data.frame(st_coordinates(st_geometry(delitos_data))) +colnames(points_df) <- c("x", "y") + +# Step 7: Create combined plot with ggplot2 +ggplot() + + geom_raster(data = raster_df, aes(x = x, y = y, fill = tmin)) + + scale_fill_viridis_c(name = "Min Temp (°C)") + + geom_sf(data = delitos_data, color = "black", fill = NA, linewidth = 0.2) + + geom_point(data = points_df, aes(x = x, y = y), color = "red", size = 0.01, shape = 1, alpha = 0.5) + + geom_sf(data = st_as_sf(antioquia_municipalities), fill = NA, color = "gray30", linewidth = 0.5) + + labs(title = "Aggregated Minimum Temperature and Polygon Distribution in Antioquia") + + theme_minimal() +```