diff --git a/Advance analytics course/Appendix-1_3.qmd b/Advance analytics course/Appendix-1_3.qmd new file mode 100644 index 0000000..4c421ce --- /dev/null +++ b/Advance analytics course/Appendix-1_3.qmd @@ -0,0 +1,1446 @@ +# Appendix 1.3. Atlántico + +### Laura Duque + +#### Maestría en Matemáticas Aplicadas y Ciencias de la Computación +#### Abril 2025 + +##Data Analysis + +For this assignment, we selected three departments—Antioquia, Valle del Cauca, and Atlántico—and conducted a statistical analysis for each one to compare results and develop strategies aimed at reducing crime incidence. We focused on the problem of robbery against persons (HP). In this report we present the findings for the Atlántico department. + + +```{r} +#| echo: true +#| message: false +#| warning: false +# working directory +#setwd(dirname(rstudioapi::getSourceEditorContext()$path)) + +# packages +list_packages = c('readxl', 'dplyr', 'moments', 'tidyr', 'tibble', 'gt', 'ggplot2', + 'fmsb', 'car', 'reshape2', 'knitr', 'gridExtra', 'ggExtra', 'sf', + 'leaflet', 'igraph', 'ggraph', 'tidygraph', 'spdep', 'classInt', + 'corrplot', 'spData', 'Matrix', "tmap") +new.packages = list_packages[!(list_packages %in% installed.packages()[,"Package"])] +if (length(new.packages)) { + install.packages(new.packages) +} +for (package in list_packages){ + library(package, character.only = T) +} + +# Load the dataset +delitos_data <- st_read("data/spatial/crime_spatial_course.gpkg") +delitos_data_Atl <- delitos_data[delitos_data$dpto_ccdgo == ('08'),] # Dataset with Atlántico Data +delitos_data <- delitos_data[delitos_data$dpto_ccdgo == c('08',"05","76" ), ] # dataset with data from analysed departments: 08 Atlántico 76 Valle del Cauca 05 Antioquia + +dim(delitos_data) +summary(delitos_data) + +dim(delitos_data_Atl) +summary(delitos_data_Atl) + +# interactive polygons location +leaflet(delitos_data) %>% + addTiles() %>% # Base map + addPolygons(color = "steelblue", weight = 1, fillOpacity = 0.5) + +# quantile +quantile(delitos_data_Atl$sum_24HP, probs = seq(0, 1, 0.1), na.rm = TRUE) + +#boxplot +boxplot(delitos_data_Atl$sum_24HP, main = "Boxplot of HP in Atlántico", horizontal = TRUE) + + +``` + +From both the quantile summary and the boxplot, it is clear that robbery incidents in Atlántico are extremely zero-inflated and right-skewed—over 75 % of census-block polygons record zero robberies, the 90th percentile jumps to one incident, and the maximum observed is 21—indicating that risk is highly concentrated in a small fraction of “hotspot” areas; by focusing preventive measures on the top 10 % of high-incidence polygons, authorities could achieve substantial reductions in total robberies. + + +## Skewness + +```{r} +#| echo: true +#| message: false +#| warning: false + +# step by step +n <- length(delitos_data_Atl$sum_24HP) +mean_x <- mean(delitos_data_Atl$sum_24HP) +sd_x <- sd(delitos_data_Atl$sum_24HP) # Uses (n-1) denominator +z_scores <- (delitos_data_Atl$sum_24HP - mean_x) / sd_x +z_cubed <- z_scores^3 +sum_cubed <- sum(z_cubed) +skewness <- (n / ((n - 1) * (n - 2))) * sum_cubed +paste0('sum_24HP: ', skewness) + +# function +skewness(delitos_data_Atl$sum_24HP, na.rm = TRUE) + +# skewness +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise(across(everything(), ~ skewness(.x, na.rm = TRUE))) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Skewness = V1) %>% + gt() + +``` + +The skewness of 10.0324 for Atlántico confirms a pronounced right‐tail, reinforcing our earlier observation of a highly asymmetric distribution around its mean. This extreme positive skew underscores that a small number of high‐incident areas drive most of the variation in reported robberies. + + +## Kurtosis + +```{r} +#| echo: true +#| message: false +#| warning: false +# step by step +z_fourth <- z_scores^4 +sum_fourth <- sum(z_fourth) +kurtosis <- ((n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3))) * sum_fourth - (3 * (n - 1)^2) / ((n - 2) * (n - 3)) +print(kurtosis) + +# function +kurtosis(delitos_data_Atl$sum_24HP, na.rm = TRUE) + +# Kurtosis +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise(across(everything(), ~ kurtosis(.x, na.rm = TRUE))) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Kurtosis = V1) %>% + gt() + +``` + +The kurtosis of 206.24 for Atlántico’s robbery data indicates a strongly leptokurtic distribution with pronounced right-tail heaviness, reflecting that extreme robbery counts occur far more frequently than would be expected under a normal distribution. + +## Coefficient of Variation + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute statistics +mean_val <- mean(delitos_data_Atl$sum_24HP, na.rm = TRUE) +print(mean_val) +std_dev <- sd(delitos_data_Atl$sum_24HP, na.rm = TRUE) +print(std_dev) + +# Compute the range for first standard deviation +lower_bound <- mean_val - std_dev +upper_bound <- mean_val + std_dev +paste0('lower_bound: ', round(lower_bound, 2), ' - upper_bound: ', round(upper_bound, 2)) + +# Count the number of points within 1 standard deviation +within_1sd <- sum(delitos_data_Atl$sum_24HP >= lower_bound & delitos_data_Atl$sum_24HP <= upper_bound, na.rm = TRUE) +percentage_1sd <- (within_1sd / nrow(delitos_data_Atl)) * 100 +paste0('within_1sd: ', round(within_1sd, 2), ' - percentage_1sd: ', round(percentage_1sd, 2)) + +# Create histogram +ggplot(delitos_data_Atl, aes(x = sum_24HP)) + + geom_histogram(binwidth = 5, fill = "blue", alpha = 0.5, color = "black") + + + # Add vertical lines for mean, median, and 1st SD + geom_vline(aes(xintercept = mean_val), color = "red", linetype = "dashed", size = 1.2) + + #geom_vline(aes(xintercept = median_val), color = "green", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = lower_bound), color = "purple", linetype = "dashed", size = 1) + + geom_vline(aes(xintercept = upper_bound), color = "purple", linetype = "dashed", size = 1) + + + # Labels and title + labs(title = "Histogram of AUTOMOTORES with Mean, and 1SD Range", + x = "AUTOMOTORES Values", y = "Frequency") + + + # Add annotation for 1SD range + annotate("text", x = mean_val, y = 10, + label = paste(round(percentage_1sd, 1), "1SD", sep = ""), + color = "black", size = 5, hjust = 0.5, vjust = -1) + + + theme_minimal() + +# cv +paste0('cv: ', round(std_dev / mean_val * 100), 2) + +# variation +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + summarise( + across( + everything(), + ~ ifelse(mean(.x, na.rm = TRUE) != 0, + sd(.x, na.rm = TRUE) / mean(.x, na.rm = TRUE), + NA), # Compute CV safely + .names = "{col}" + ) + ) %>% + t() %>% + as.data.frame() %>% + tibble::rownames_to_column(var = "Crime Type") %>% + mutate(V1 = round(V1, 2)) %>% + rename(Variation = V1) %>% + gt() + +``` + +With a coefficient of variation of 3712 (variance = 3.71), robbery counts in Atlántico exhibit extreme relative dispersion—its standard deviation is over 37 times the mean—so the arithmetic average is a very poor descriptor. Yet, compared to other crime types—which show even higher CVs—robbery is relatively less erratic. This tells us two things: first, that we should summarize these data with robust measures (median, IQR) or fit models tailored for over-dispersed, zero-inflated counts rather than rely on simple means; and second, that although robbery hotspots are sharply concentrated, their spatial spread is marginally more uniform than that of most other offenses. + +Examining CVs across crime types confirms this ranking of relative variability—HM (13.23), HA (12.64), HOM (9.17), HR (8.17), DS (7.55), HC (7.46), LP (6.27), VI (4.27), and HP (3.71)—and highlights that hyper-targeted interventions are most critical for the most skewed categories, while robbery prevention can be applied more broadly. Moreover, it is highly likely that the polygons driving high robbery rates also concentrate other crime types; thus, a joint correlation analysis and mapping of spatial variability will be essential to uncover these multi-offense hotspots and optimize resource allocation. + +## Median Absolute Deviation MAD and MAD/median + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute statistics +median_val <- median(delitos_data_Atl$sum_24HP, na.rm = TRUE) +# Es normal que de cero porque es una medida de posición +print(median_val) +mad_val <- mad(delitos_data_Atl$sum_24HP, na.rm = TRUE) # Compute MAD +print(mad_val) + +# Compute the range for first standard deviation +lower_bound <- median_val - mad_val +upper_bound <- median_val + mad_val +paste0('lower_bound: ', round(lower_bound, 2), ' - upper_bound: ', round(upper_bound, 2)) + +# Count the number of points within 1 MAD +within_1mad <- sum(delitos_data_Atl$sum_24HP >= lower_bound & delitos_data_Atl$sum_24HP <= upper_bound, na.rm = TRUE) +percentage_1mad <- (within_1mad / nrow(delitos_data_Atl)) * 100 +paste0('within_1mad: ', round(within_1mad, 2), ' - percentage_1mad: ', round(percentage_1mad, 2)) + +# Create histogram +ggplot(delitos_data_Atl, aes(x = sum_24HP)) + + geom_histogram(binwidth = 5, fill = "blue", alpha = 0.5, color = "black") + + + # Add vertical lines for mean, median, and 1st SD + #geom_vline(aes(xintercept = mean_val), color = "red", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = median_val), color = "green", linetype = "dashed", size = 1.2) + + geom_vline(aes(xintercept = lower_bound), color = "purple", linetype = "dashed", size = 1) + + geom_vline(aes(xintercept = upper_bound), color = "purple", linetype = "dashed", size = 1) + + + # Labels and title + labs(title = "Histogram of HP with Median, and 1MAD Range", + x = "HP Values", y = "Frequency") + + + # Add annotation for 1SD range + annotate("text", x = median_val, y = 10, + label = paste(within_1mad, "points (", round(percentage_1mad, 1), "1MAD", sep = ""), + color = "black", size = 5, hjust = 0.5, vjust = -1) + + + theme_minimal() + +# MAD/Median +paste0('MAD/Median: ', round(mad_val / median_val * 100), 2) + +``` + +With a Median Absolute Deviation (MAD) of 88.41, the robbery counts in Atlántico exhibit very high relative dispersion around the median—underscoring what our skewness (10.03) and kurtosis (206.24) already told us about the data’s heavy right tail and extreme outliers. Unlike the standard deviation, the MAD downweights those few high-incident polygons and still ends up large, which confirms that even “typical” variations (ignoring extreme hotspots) are substantial. When taken together with the coefficient of variation (3712) and the zero-inflation we observed (over 75 % zeros), this robust measure reinforces our conclusion that simple averages are misleading here and that any analysis or intervention must account for both the pervasive zeros and the handful of highly concentrated hotspots. + +## Covariance Matrix + +```{r} +#| echo: true +#| message: false +#| warning: false +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains("24")) %>% + cov() %>% + round(2) %>% + knitr::kable(digits = 2, caption = "Covariance Matrix") + + +``` + +The covariance matrix reveals a clear “core” of interrelated crimes alongside many that fluctuate independently. Robbery against persons (sum_24HP) has the highest variance (0.37), underscoring its pronounced spatial fluctuation, and shows moderate positive covariances with several other offences—personal injury (sum_24LP, 0.05), domestic violence (sum_24VI, 0.05), motorbike theft (sum_24HM, 0.06) and extortion (sum_24EX, 0.04)—suggesting these crimes tend to rise and fall together across polygons. Personal injury and domestic violence themselves covary at 0.04, further indicating their co-occurrence. By contrast, most off-diagonal covariances hover near zero, indicating weak or no relationship among those crime types. In particular, kidnapping (sum_24SS, sum_24SE) and terrorism (sum_24TR) all have covariances effectively at zero with every other offence—unsurprising given how rarely they occur. + +## Covariance Matrix of Log-Transformed Data + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Define the dataset +x <- delitos_data_Atl$sum_24HP + +# 1. Compute Raw Arithmetic Mean +arithmetic_mean <- mean(x) +print(arithmetic_mean) + +# 2. Compute Log-Mean (Multiplicative Center) +log_x <- log(x + 1) # Take logarithm of values +head(log_x) +log_mean <- mean(log_x) # Compute mean in log-space +print(log_mean) +log_mean_exp <- exp(log_mean) # Convert back to original scale +print(log_mean_exp) + +# Create the comparison table +comparison_table <- data.frame( + Index = seq_along(x), # Just an index for x-axis + Original_Value = x, + Log_Value = log_x +) + +p1 <- ggplot(comparison_table, aes(x = Original_Value, y = Log_Value)) + + geom_line(color = "gray70", size = 0.7, alpha = 0.5) + # Thin line connecting points + geom_point(alpha = 0.7, color = "blue") + # Scatter points with transparency + labs( + title = "Scatter Plot: Original vs. Log-Transformed Values", + x = "Original Values", + y = "Log-Transformed Values" + ) + + theme_minimal() + +# Add marginal histogram +ggMarginal( + p1, + type = "histogram", # Add marginal histograms + bins = 40, # Number of bins for the histogram + margins = "both", # Add histogram to both x and y margins + size = 5, # Size of the histograms relative to the scatter plot + fill = "gray", # Fill color for the histogram + color = "black", # Outline color for the histogram + alpha = 0.5 # Transparency +) + +``` + +The log transformation compresses the pronounced right tail and expands the mid-range of the distribution, ensuring that subsequent analyses are not driven by a handful of extreme polygons. It also stabilizes the variance across both low- and high-incident areas, improving comparability. + +```{r} +#| include: false + +# Store values for inline Quarto text +log_values <- paste(round(head(comparison_table$Log_Value), 2), collapse = ", ") +original_values <- paste(head(comparison_table$Original_Value), collapse = ", ") + +#| echo: true +#| message: false +#| warning: false + +#log transformed data +# Compute statistics for raw and log-transformed data +mean_raw <- mean(delitos_data_Atl$sum_24HP, na.rm = TRUE) +sd_raw <- sd(delitos_data_Atl$sum_24HP, na.rm = TRUE) +mad_raw <- mad(delitos_data_Atl$sum_24HP, na.rm = TRUE) + +delitos_data_log <- delitos_data_Atl %>% + #mutate(LOG_AUTOMOTORES = log(AUTOMOTORES + 1)) + mutate(LOG_AUTOMOTORES = log1p(sum_24HP)) # log1p(x) = log(1 + x) to handle zeros + +mean_log <- mean(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) +sd_log <- sd(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) +mad_log <- mad(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE) + +# Compute statistics for raw and log-transformed data +data.frame( + Measure = c("Mean", "Median", "Standard Deviation", "MAD"), + Raw_Data = c(mean(delitos_data_Atl$sum_24HP, na.rm = TRUE), + median(delitos_data_Atl$sum_24HP, na.rm = TRUE), + sd(delitos_data_Atl$sum_24HP, na.rm = TRUE), + mad(delitos_data_Atl$sum_24HP, na.rm = TRUE)), + Log_Transformed_Data = c(mean(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + median(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + sd(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE), + mad(delitos_data_log$LOG_AUTOMOTORES, na.rm = TRUE))) + +# Transform the data to a long format for ggplot +delitos_long <- delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + pivot_longer(cols = everything(), names_to = "Crime Type", values_to = "Value") + +# Create faceted histograms +ggplot(delitos_long, aes(x = Value)) + + geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) + + facet_wrap(~ `Crime Type`, scales = "free") + # Facet by crime type + theme_minimal() + + labs( + title = "Distributions of Crime Data", + x = "Value", + y = "Frequency" + ) + + theme( + axis.text.x = element_text(size = 5) # Reduce the font size of X-axis text + ) + +# Transform the data to long format and apply log transformation +delitos_long_log <- delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + mutate(across(everything(), ~ log(.x), .names = "{col}")) %>% # Log transform (log(x + 1) to avoid log(0)) + pivot_longer(cols = everything(), names_to = "Crime Type", values_to = "Log Value") + +# Create faceted histograms for log-transformed values +ggplot(delitos_long_log, aes(x = `Log Value`)) + + geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) + + facet_wrap(~ `Crime Type`, scales = "free") + # Facet by crime type + theme_minimal() + + labs( + title = "Log-Transformed Distributions of Crime Data", + x = "Log Value", + y = "Frequency" + ) + + theme( + axis.text.x = element_text(size = 3) # Reduce the font size of X-axis text + ) + +# Covariance Matrix (Log-Transformed) +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + mutate(across(everything(), ~ log(.x+1))) %>% # Log-transform (+1 to handle zeros) + cov() %>% + round(2) %>% + kable(digits = 2, caption = "Covariance Matrix (Log-Transformed)") + +``` + +After logging, the strongest variance still belongs to robbery against persons (sum_24HP) at 0.08—confirming it remains the most unevenly distributed crime even on a multiplicative scale. Its covariances with personal injury (sum_24LP), family violence (sum_24VI), shoplifting (sum_24HC) and motorbike theft (sum_24HM) all settle at 0.01, indicating these offences continue to rise and fall together in proportional terms across polygons. All other off-diagonal covariances fall to zero (including extortion), showing that once scale effects are removed, there is essentially no residual linear dependence among those crime pairs. Kidnapping and terrorism remain completely uncorrelated. In short, the log transform reveals that only a small “core” of related crime types truly co-vary, while most apparent associations in the raw data were artifacts of differing magnitude rather than genuine proportional relationships. + +## Redundant Variables and Variance Inflation Factors (VIF) + +Redundant variables provide little additional information due to high correlation with others, leading to multicollinearity in models. + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Covariance matrix +cm_delitos_data <- delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) %>% + cov() + +# Compute eigenvalues and eigenvectors +eigen_results <- cm_delitos_data %>% eigen() + +# Extract eigenvalues and eigenvectors +eigenvalues <- eigen_results$values +eigenvectors <- eigen_results$vectors + +# Display eigenvalues and eigenvectors +print(eigenvalues) +head(eigenvectors) + +# The Smallest Eigenvalues +sort(eigenvalues, decreasing = FALSE) + +# The smallest eigenvalue is approximately zero +smallest_eigenvalue <- min(eigenvalues) +print(smallest_eigenvalue) + +# Corresponding eigenvector +smallest_eigenvector <- eigenvectors[, which.min(eigenvalues)] +print(smallest_eigenvector) + +# Normalize the eigenvector by dividing by the largest absolute value +normalized_eigenvector <- smallest_eigenvector / max(abs(smallest_eigenvector)) +print(normalized_eigenvector) + +# Sorted normalize the eigenvector +sort(abs(normalized_eigenvector), decreasing = T) + +# Get numeric variable names (order matches eigenvector indices) +variable_names <- colnames(cm_delitos_data) + +# Sort normalized eigenvector by absolute contribution (descending order) +sorted_contributions <- sort(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the indices of the top contributions +top_indices <- order(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the names of the top variables +top_variable_names <- variable_names[top_indices] + +# Print the top variable names +print(top_variable_names) + +# Fit a regression model to confirm the relationship +model <- lm(sum_24HA ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HM + #+sum_24SS + +sum_24SE+sum_24EX + #+ sum_24TR + , + data = data.frame(delitos_data_Atl)) +summary(model) # HA is related to DS HP HR HC HM EX +vif(model) + +model <- lm(sum_24HM ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # HM is related to HOM LP VI HP HA EX +vif(model) + +model <- lm(sum_24HR ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # HR relates to LP VI DS HP HA EX +vif(model) + +model <- lm(sum_24EX ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE, + data = data.frame(delitos_data_Atl)) +summary(model) # Is highly correlated with all of them -- we should take this one out +vif(model) + +model <- lm(sum_24HC ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # Related to LP VIDS HP HA +vif(model) + +model <- lm(sum_24DS ~ sum_24HOM+sum_24LP+sum_24VI+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # LP VI HP HR HC +vif(model) + +model <- lm(sum_24LP ~ sum_24HOM+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # VI HOM DS HP HR HC EX +vif(model) + +model <- lm(sum_24HOM ~ sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # LP HP HM EX +vif(model) + +model <- lm(sum_24HP ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HR+sum_24HC+sum_24HA+sum_24HM + #+sum_24SS + #+sum_24SE + +sum_24EX + #+sum_24TR, + , data = data.frame(delitos_data_Atl)) +summary(model) # Todas menos SS y SE +vif(model) +alias(model) + +model <- lm(sum_24VI ~ sum_24HOM+sum_24LP+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # LP DS HP HR HC HM EX +vif(model) + +model <- lm(sum_24SE ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # P value showed that test wqas not statistically significant +vif(model) + +model <- lm(sum_24SS ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # There is no data in this variable +vif(model) + +model <- lm(sum_24TR ~ sum_24HOM+sum_24LP+sum_24VI+sum_24DS+sum_24HP+sum_24HR+sum_24HC+sum_24HA+sum_24HM+sum_24SE+sum_24EX, + data = data.frame(delitos_data_Atl)) +summary(model) # There is no data in this variable +# Variance Inflation Factors +vif(model) + + +``` +An eigen‐decomposition of the covariance matrix produced eigenvalues +0.449, 0.242, 0.175, 0.094, 0.066, 0.028, 0.024, 0.020, 0.018 and 0.012. Even the smallest (≈0.012) is well above zero, indicating no near‐perfect linear dependencies meaning that each crime category contributes unique variance. + +We regressed HP on the nine remaining crimes (having removed SS, SE and TR, which never varied) and achieved an R² of 0.134 (F₉,₂₇₃₈₅ = 472.2, p < 2 × 10⁻¹⁶). Five predictors proved highly significant (p < 0.001): Car Robbery (HA), Extorsión (EX), Morcycle robbery (HM), personal injuries (LP) and domestic violence (VI).Residual diagnostics showed no systematic outliers or patterns, confirming the model isn’t driven by a few data points. Variance inflation factors for all retained predictors were below 1.5, so multicollinearity is negligible. + +Simple kidnapping (SS), kidnapping for ransom (SE) and terrorism (TR) never occurred in our 24-hour windows and added no predictive power, so they were excluded. + +Together, the eigenvalue spectrum, regression fit, residual checks and low VIFs confirm that our crime variables are independently informative, that HP is best predicted by HA, EX, HM, LP and VI, and that no redundant or collinear predictors remain. + + +## Global Variability Metric + +```{r} +#| echo: true +#| message: false +#| warning: false + +cov_matrix <- delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) %>% + cov() + +# Effective Variance +det(cov_matrix)^(1/ncol(cov_matrix)) + +# Log-Transformed Effective Variance +det(log(cov_matrix + 1))^(1/ncol(cov_matrix)) + +# Effective Standard Deviation +det(cov_matrix)^(1/(ncol(cov_matrix) * 2)) + +# Log-Transformed Effective Standard Deviation +det(log(cov_matrix + 1))^(1/(ncol(cov_matrix) * 2)) +``` +Building on our earlier examination of the raw and log-transformed covariance matrices—which revealed no vanishing eigenvalues and only moderate pairwise covariances—we can further summarize the multivariate spread of our crime-count variables by computing the effective variance (the geometric mean of the covariance-matrix eigenvalues) and its square root, the effective standard deviation.On the original scale, the effective variance is about 0.0568 (effective SD ≈ 0.2383), whereas after applying a log(x + 1) transform it falls to 0.0533 (effective SD ≈ 0.2309). The roughly 6 % reduction in variance and 3 % reduction in SD confirm that the heavy right-hand tails of the distributions remain substantially influential even after log-transformation. In other words, while logging does compress extreme values and stabilizes variance to some extent, it does not fully eliminate their impact on the overall covariance structure. + +## Linear Dependency and Precision Matrix + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute precision matrix +S_inv <- solve(cov_matrix) + +# Display precision matrix (should match example values) +cat("Precision Matrix (S⁻¹):\n") +print(S_inv, digits = 2) + +# Extract correct row components of the selected crime +dependent_variable_index <- 5 + +first_row <- S_inv[dependent_variable_index, ] +print(first_row, digits = 2) + +diag_element <- S_inv[dependent_variable_index, dependent_variable_index] +print(diag_element, digits = 2) + +# Compute regression coefficients +beta_coefficients <- -first_row[-dependent_variable_index] / diag_element +print(beta_coefficients, digits = 2) + +# Compute residual variance +residual_variance <- 1 / diag_element +residual_sd <- sqrt(residual_variance) # Residual standard error + +# Print residual standard error +print(residual_sd, digits = 2) + +# Compute R^2 +r_squared <- 1 - (residual_variance / cov_matrix[dependent_variable_index, dependent_variable_index]) +print(r_squared, digits = 2) + +# Verify with lm() regression +delitos <- delitos_data %>% + st_drop_geometry() %>% + select(contains('24')) %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) + +# Fit model +model <- lm(sum_24HP ~ ., data = data.frame(delitos)) +summary(model) +``` +Building on our precision‐matrix findings, we first saw that “robbery against people” (HP) has nonzero partial links to every other crime type once all others are held constant, yet each individual conditional association is weak—no single offense explains more than about 13 % of HP’s variance, and the residual SD remains at 0.57 (compared with an unconditional SD of ≈0.61). However, when we fit an ordinary least‐squares model including all crime counts simultaneously, the model’s R² jumps to approximately 0.61. In other words, while no lone crime category serves as a strong partial predictor of people‐targeted robberies, the combined information from every other offense captures the majority of HP’s predictable variation. + + +# Key Players and Topics + +# Non-Parametric Correlation {.unnumbered} + +## Spearman's Rank Correlation + +```{r} +#| echo: true +#| message: false +#| warning: false + +delitos_data_Atl <- delitos_data %>% + select(-sum_24TR, -sum_24SE, -sum_24SS) + +delitos_data_Atl %>% + st_drop_geometry() %>% + select(contains('24')) %>% + cor(., method = "spearman", use = "complete.obs") %>% + round(., 3) %>% + print(.) %>% + corrplot(., method = "color", title = "Spearman Correlation", mar=c(0,0,1,0)) +``` + +Both Spearman's $\rho$ captures the \emph{monotonic} relationship between two variables. They are more robust to outliers and non-linear relationships than Pearson's correlation. In the context of areal data (e.g., crime rates, population density across polygons), these measures can reveal how variables co-vary without assuming linearity or normality. + +# Spatial Neighborhood Matrices {.unnumbered} + +## Neighbors Based on Contiguity + + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Create a spatial neighbors list using Queen contiguity +# (i.e., polygons are considered neighbors if they share any point: edge or vertex) +nb <- spdep::poly2nb(delitos_data_Atl, queen = TRUE) +head(nb) + +# Replace invalid neighbor entries (i.e., [1] 0) with empty integer vectors +# This ensures compatibility with functions that expect valid neighbor lists only +nb_0 <- lapply(nb, function(x) if(length(x)==1 && x==0) integer(0) else x) + +# Polygons with neighbors +table(sapply(nb_0, length)) + +# Neighbors of Order k Based on Contiguity +# Neighbors of second order +nblags <- spdep::nblag(neighbours = nb, maxlag = 2) + +# Combine neighbors of all orders up to the specified lag (in this case, up to order 2) +# This creates a cumulative neighbor list including first- and second-order neighbors +nblagsc <- spdep::nblag_cumul(nblags) +table(sapply(nblagsc, length)) +``` + +Under the Queen contiguity (in which two blocks are “neighbors” if they share any boundary point) the vast majority of blocks (38663) have no touching neighbors, 1839 share exactly one edge or corner, 233 share two, and only a handful share three to eight (and a single block even shares thirteen—an unusual case around irregular parcel layouts). + +We then looked at second‐order contiguity (neighbors of neighbors) by computing and cumulatively merging lag‐1 and lag‐2 lists. Even with this broader definition, 40009 blocks remain isolated (no direct or indirect adjacency), 437 blocks have exactly two first- or second-order neighbors, 183 have three, and so on up to 14 in the rarest configurations. + +Together, these results confirm that DANE’s city‐block polygons are almost always separated by streets and very few blocks ever touch, even when we allow “two‐step” contiguity. + +## Neighbors Based on k Nearest Neighbors + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute centroids of the polygons +coo <- st_centroid(delitos_data_Atl) + +# Create a neighbor list where each polygon (based on its centroid `coo`) is connected +# to its 3 nearest neighbors using k-nearest neighbors (k = 3) +nb <- knn2nb(knearneigh(coo, k = 3)) # k number nearest neighbors + +# Polygons with neighbors +table(sapply(nb, length)) + +# Subset data to the first 10 polygons +delitos_data_10 <- delitos_data_Atl[1:100, ] + +# Recompute neighbor list for these 10 polygons to avoid index mismatches +nb_10 <- knn2nb(knearneigh(st_centroid(delitos_data_10), k = 3)) + +# Compute centroids for the 10 polygons +coords_10 <- st_coordinates(st_centroid(delitos_data_10)) + +# Plot the first 10 polygons and overlay neighbor connections in red +plot(st_geometry(delitos_data_10), border = "lightgray", main = "First Polygons with 3 Nearest Neighbors") +plot.nb(nb_10, coords_10, add = TRUE, col = "red", lwd = 2) +``` + +## Neighbors Based on Distance + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Create a neighbor list using distance-based contiguity: +# Polygons are considered neighbors if their centroids are within 0.4 units (e.g., degrees) apart +nb <- dnearneigh(x = st_centroid(delitos_data_Atl), d1 = 0, d2 = 0.4) + +# Polygons with neighbors +hist(sapply(nb, length)) + +# Subset data to the first 10 polygons +delitos_data_10 <- delitos_data[1:100, ] + +# Recompute neighbor list for these 10 polygons to avoid index mismatches +nb_10 <- dnearneigh(x = st_centroid(delitos_data_10), d1 = 0, d2 = 0.4) + +# Compute centroids for the 10 polygons +coords_10 <- st_coordinates(st_centroid(delitos_data_10)) + +# Plot the first 10 polygons and overlay neighbor connections in red +plot(st_geometry(delitos_data_10), border = "lightgray", main = "First Polygons with 3 Nearest Neighbors") +plot.nb(nb_10, coords_10, add = TRUE, col = "red", lwd = 2) +``` +When using distance‐based contiguity with a 0–0.4° threshold, the histogram shows that most city‐block polygons have between 10 and 30 “neighbors.” A small number lie at the tails: a handful have very few neighbors (0–5), reflecting isolated or edge blocks, while a long right‐hand tail extends out to about 80–120 neighbors—indicative of very dense, tightly packed urban areas where dozens of block centroids fall within the 0.4° radius. In contrast, our Queen‐contiguity test found almost no direct neighbors (since blocks are separated by streets), demonstrating that a fixed‐distance rule can reveal much broader spatial connections in compact parts of the city than simple edge‐or‐vertex sharing. + + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Compute k-nearest neighbors: for each polygon centroid, find its 1 nearest neighbor (k = 1) +nb1 <- knn2nb(knearneigh(coo, k = 1)) + +# Calculate the Euclidean distances between each polygon and its nearest neighbor +dist1 <- nbdists(nb1, coo) + +# Summarize all distances to understand the minimum, maximum, and quartiles +summary(unlist(dist1)) + +# Create a distance-based neighbor list: polygons whose centroids are within [0, 1.2] units are considered neighbors +nb <- dnearneigh(x = st_centroid(delitos_data_Atl), d1 = 0, d2 = 1.2) + +# Polygons with neighbors +hist(sapply(nb, length)) +``` +In Atlántico, city‐block centroids sit close together: half of all blocks find their nearest neighbor within just 0.07° (≈ 7–8 km) and 75 % within 0.10° (≈ 10–11 km), with only a handful of extreme isolates. Yet when we widen our lens to a 1.2° radius (≈ 130 km), “neighborhood” explodes—most blocks connect to 20–50 others, but some densely parcelled urban areas link to 400–500 blocks. This stark contrast between tight, local spacing and sprawling, regional connectivity underscores how the choice of spatial threshold dramatically reshapes the apparent structure of crime‐count patterns—and must therefore be tailored carefully to the scale of any analysis or intervention. + +Given that our aim is to model crime diffusion at the neighborhood level, however, a 130 km radius clearly overshoots the mark. A much smaller buffer—on the order of 0.06°–0.07° (≈ 6–7 km)— could better reflect the distances people can realistically cover on foot and ensures that “neighbors” in the analysis truly capture local spatial interactions. + +## Neighborhood Matrices + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Spatial weights matrix using Queen contiguity (binary weights) +# 'queen = TRUE' considers shared edges OR vertices as neighbors +nb <- poly2nb(delitos_data_Atl, queen = TRUE) + +# Convert the neighbor list to a spatial weights list object +# 'style = "W"' row-standardizes the weights (sums to 1) +# 'zero.policy = TRUE' avoids errors when some polygons have no neighbors +nbw <- spdep::nb2listw(nb, style = "W", zero.policy = TRUE) + +# Display the first 10 rows of spatial weights (for the first 10 polygons) +nbw$weights[1:10] + +# Spatial weights matrix based on inverse distance values +# Compute centroids of polygons +coo <- st_centroid(delitos_data_Atl) + +# Use Queen contiguity again to define neighbors +nb <- poly2nb(delitos_data, queen = TRUE) + +# Compute distances between neighbors based on their centroids +dists <- nbdists(nb, coo) + +# Create inverse distance weights (1/distance) for each pair of neighbors +ids <- lapply(dists, function(x){1/x}) + +# Create a listw object using binary style ("B" = no standardization) +nbw <- nb2listw(nb, glist = ids, style = "B", zero.policy = TRUE) + +# Display the first 10 inverse-distance-based weights +nbw$weights[1:10] +``` +The spatial weights matrix computed under Queen contiguity (where weights are row‐standardized so that each polygon’s neighbor weights sum to one) turns out to be entirely empty for almost every block in Atlántico. When we convert our poly2nb neighbor list into a “W” style nb2listw object, the first ten (and virtually all) entries are NULL, indicating no direct contiguity links. Even when we switch to inverse‐distance weights (style = "B" with glist = 1/distance), the same pattern emerges: city blocks remain isolated because they do not share edges or vertices, as they are separated by streets and avenues. This confirms that a pure contiguity-based weighting scheme fails to capture any meaningful spatial structure in our block-level geometry. Instead, distance-based or k-nearest-neighbor approaches are essential for defining neighbors in an urban setting where physical adjacency is broken by thoroughfares. + + +# Spatial autocorrelation {.unnumbered} + +## Global Moran’s + +Spatial Weights Matrix (W) + +The \textit{nb2listw(nb, style = "W")} function calculates the spatial weights matrix, often denoted as $W$. This matrix defines the spatial relationships between the polygons. The \textit{style = "W"} argument specifies row-standardization. + +```{r} +# Compute centroids of the polygons +coo <- st_centroid(delitos_data_Atl) + +# Create a neighbor list where each polygon (based on its centroid `coo`) is connected +# to its 3 nearest neighbors using k-nearest neighbors (k = 3) +nb <- knn2nb(knearneigh(coo, k = 3)) # k number nearest neighbors + +# Global Moran's I +# Convert the neighbor list to a listw object +lw <- nb2listw(nb, style = "W") # Use nb2listw + +# Now you can use 'lw' in moran.test +gmoran <- moran.test(delitos_data_Atl$sum_24HP, lw, alternative = "greater") + +gmoran + +gmoran[["estimate"]][["Moran I statistic"]] # Moran's I + +gmoran[["statistic"]] # z-score + +gmoran[["p.value"]] # p-value +``` +The global Moran’s I for sum_24HP is 0.00485, with a Z-score of 2.2318 and a p-value of 0.01282. Because the p-value is below 0.05 and the Z-score is positive, we reject spatial randomness and conclude there is significant positive spatial autocorrelation: blocks with high (or low) rates of robbery against people tend to cluster more than would occur by chance. Although the raw Moran’s I is small, its statistically significant departure from zero indicates a genuine, if modest, geographic clustering of these offenses—suggesting that neighborhood-level factors help drive where robberies against people concentrate. + +```{r} +#| echo: true +#| message: false +#| warning: false +gmoranMC <- moran.mc(delitos_data_Atl$sum_24HP, lw, nsim = 99) +gmoranMC + +hist(gmoranMC$res) +abline(v = gmoranMC$statistic, col = "red") +``` +The Monte-Carlo simulation of Moran’s I (99 random permutations plus the observed data) yielded an observed value of 0.14199, which ranked 100th out of 100 total runs, giving a one-sided p-value of 0.01. In other words, none of the randomized replicates produced as strong a clustering signal as the real data. This provides very strong evidence of positive spatial autocorrelation in sum_24HP: blocks with similarly high (or low) rates of “robbery against people” are geographically clustered far more than would be expected under spatial randomness. + +```{r} +#| echo: true +#| message: false +#| warning: false + +moran.plot(delitos_data_Atl$sum_24HP, lw) +``` +The Moran scatterplot for “robbery against people” (sum_24HP) vividly illustrates the modest but significant positive spatial autocorrelation we detected. The regression line’s upward slope (≈0.14) is exactly the global Moran’s I, showing that blocks with above-average HP counts tend to be neighbored by other high-HP blocks, while those with below-average counts cluster together as well. Most observations fall in the lower-left quadrant (low-HP surrounded by low-HP), with a smaller but clear cluster in the upper-right (high-HP surrounded by high-HP)—our primary cold spots and hot spots. A handful of points in the upper-left and lower-right quadrants represent spatial outliers (blocks whose HP rate contrasts sharply with their neighbors). Finally, extreme values (HP ≥ 10 or spatial lag ≥ 5) highlight the most pronounced hot-spot clusters. Together, this plot confirms that although “robbery against people” is moderately clustered, there remain both localized pockets of elevated crime and occasional outliers that warrant closer investigation. + +## Local Moran’s I + +```{r} +#| echo: true +#| message: false +#| warning: false + +lmoran <- localmoran(delitos_data_Atl$sum_24HP, lw, alternative = "greater") +head(lmoran) +``` +When we move from the global to the local level, Local Moran’s I (Ii) lets us see exactly which blocks contribute to clustering or stand out as outliers. For each block: + +- Ii measures how similar its HP count is to its immediate neighbors. Large positive Ii values flag local “hot spots” (high‐HP blocks surrounded by high‐HP neighbors) or “cold spots” (low‐HP blocks surrounded by low‐HP neighbors). Large negative Ii values identify spatial outliers (e.g. a high‐HP block amid low‐HP neighbors). Values near zero indicate no clear local pattern. + +- E.Ii is the expected value under random spatial arrangement—essentially zero—so any noticeable departure of Ii from zero hints at a genuine local pattern. + +- Var.Ii quantifies the natural variability of Ii under the null; it underpins the next two columns but isn’t interpreted on its own. + +- Z.Ii = (Ii − E.Ii)/√Var.Ii tells us how extreme each Ii is. Only blocks with |Z.Ii| > 1.96 exhibit statistically significant local clustering or dispersion at the 5 % level. + +- Pr(z > E.Ii) (the local p‐value) directly tests significance: only p < 0.05 indicates a truly non‐random pattern. + +In the first six sample blocks, Ii hovered around ±0.07 with Z‐scores between –0.48 and +0.47 and p‐values well above 0.05, meaning none of these locations show significant local clustering or outlier behavior. To pinpoint actual hot or cold spots, we must map all Ii scores and their p‐values across the study area and focus only on those blocks with p < 0.05. + +```{r} +#| eval: false +#| message: false +#| warning: false + +install.packages("tmap", type = "binary", dependencies = TRUE) +library(tmap) +tmap_mode("view") + +delitos_data_Atl$lmI <- lmoran[, "Ii"] # local Moran's I +delitos_data_Atl$lmZ <- lmoran[, "Z.Ii"] # z-scores +# p-values corresponding to alternative greater +delitos_data_Atl$lmp <- lmoran[, "Pr(z > E(Ii))"] + +p1 <- tm_shape(delitos_data_Atl) + + tm_polygons(col = "sum_24HP", title = "vble", style = "quantile") + + tm_layout(legend.outside = TRUE) + +p2 <- tm_shape(delitos_data_Atl) + + tm_polygons(col = "lmI", title = "Local Moran's I", + style = "quantile") + + tm_layout(legend.outside = TRUE) + +p3 <- tm_shape(delitos_data_Atl) + + tm_polygons(col = "lmZ", title = "Z-score", + breaks = c(-Inf, 1.65, Inf)) + + tm_layout(legend.outside = TRUE) + +p4 <- tm_shape(delitos_data_Atl) + + tm_polygons(col = "lmp", title = "p-value", + breaks = c(-Inf, 0.05, Inf)) + + tm_layout(legend.outside = TRUE) + +tmap_arrange(p1, p2, p3, p4) +``` + +p1: sum_24HP +This map shows the distribution of the variable of interest (sum_24HP). The style = "quantile" argument means the colors represent quantiles, so each color shows roughly an equal number of polygons. This map gives you a baseline for seeing how your variable is distributed spatially. + +p2: lmI (Local Moran's I) +This map displays the Local Moran's I values. +Positive values (often shown in warmer colors) indicate that a polygon has neighbors with similar values. This suggests spatial clusters of high values or low values. +Negative values (often shown in cooler colors) indicate that a polygon has neighbors with dissimilar values. This suggests spatial outliers. + +p3: lmZ (Z-scores) +This map shows the z-scores associated with the Local Moran's I values. Z-scores help you assess the statistical significance of the local clustering. +You've set breaks = c(-Inf, 1.65, Inf). Assuming this is a one-tailed test: +Values greater than 1.65 indicate statistically significant (at approximately the 0.05 level) clustering of high values (a "hot spot"). +Values less than -1.65 would indicate statistically significant clustering of low values (a "cold spot"). +Values between -1.65 and 1.65 suggest the spatial pattern is not statistically significant. + +p4: lmp (P-values) +This map displays the p-values associated with the Local Moran's I values. P-values provide another way to assess statistical significance. +Polygons with p-values less than 0.05 (shown in one color) indicate statistically significant spatial clustering. +Polygons with p-values greater than 0.05 (shown in the other color) do not show statistically significant clustering. + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Plot Local Moran's I using tmap +tm_shape(delitos_data_Atl) + + tm_polygons( + col = "lmZ", + title = "Local Moran's I", + style = "fixed", + breaks = c(-Inf, -1.96, 1.96, Inf), + labels = c("Negative SAC", "No SAC", "Positive SAC"), + palette = c("blue", "white", "red") + ) + + tm_layout(legend.outside = TRUE) +``` + +## Clusters + +```{r} +#| echo: true +#| message: false +#| warning: false + +mp <- moran.plot(as.vector(scale(delitos_data_Atl$sum_24HP)), lw) +delitos_data$quadrant <- NA +# high-high +delitos_data_Atl[(mp$x >= 0 & mp$wx >= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 1 +# low-low +delitos_data_Atl[(mp$x <= 0 & mp$wx <= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 2 +# high-low +delitos_data_Atl[(mp$x >= 0 & mp$wx <= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 3 +# low-high +delitos_data_Atl[(mp$x <= 0 & mp$wx >= 0) & (delitos_data$lmp <= 0.05), "quadrant"]<- 4 +# non-significant +delitos_data_Atl[(delitos_data_Atl$lmp > 0.05), "quadrant"] <- 5 +``` +The Moran scatterplot for standardized robbery‐against‐people rates (sum_24HP) shows a modest but positive global spatial autocorrelation: the fitted regression line (Moran’s I) slopes upward, indicating that blocks with above‐average HP tend to be near other such blocks, and likewise for below‐average values. By splitting the plot into four quadrants (and retaining only those observations with a local‐Moran p ≤ 0.05), we identify: + +- High–High (hotspots): blocks where HP and its neighbors are both above average. +- Low–Low (coldspots): blocks where HP and its neighbors are both below average. +- High–Low (spatial outliers): blocks with high HP surrounded by low HP. +-Low–High (reverse outliers): blocks with low HP surrounded by high HP. + +In the data, most points lie outside the significance cutoff, so only a handful of true hotspots, coldspots, and outliers emerge. This tells us that while robberies do cluster spatially in some areas, the bulk of variation in HP remains idiosyncratic—knowledge that can guide you to target just those specific blocks where crime rates truly diverge from their immediate surroundings. + +```{r} +#| echo: true +#| message: false +#| warning: false + +tm_shape(delitos_data_Atl) + tm_fill(col = "quadrant", title = "", +breaks = c(1, 2, 3, 4, 5, 6), +palette = c("red", "blue", "lightpink", "skyblue2", "white"), +labels = c("High-High", "Low-Low", "High-Low", + "Low-High", "Non-significant")) + +tm_legend(text.size = 1) + tm_borders(alpha = 0.5) + +tm_layout(frame = FALSE, title = "Clusters") + +tm_layout(legend.outside = TRUE) + +``` + +# Correspondance Analysis {.unnumbered} + +```{r} +#| eval: false +#| message: false +#| warning: false +#| include: false + + +# Paso 1: Calcular los centroides de los polígonos +delitos_data_Atl <- delitos_data_Atl %>% + mutate(centroid = st_centroid(st_geometry(.))) + +# Paso 2: Extraer coordenadas de los centroides +coords <- st_coordinates(delitos_data_Atl$centroid) + +# Paso 3: Calcular la mediana de las coordenadas +median_x <- median(coords[, 1]) +median_y <- median(coords[, 2]) + +# Paso 4: Asignar una zona a cada polígono según su posición relativa a las medianas +delitos_data_Atl <- delitos_data_Atl %>% + mutate( + zona = case_when( + coords[, 1] >= median_x & coords[, 2] >= median_y ~ "Zona Norte", + coords[, 1] < median_x & coords[, 2] >= median_y ~ "Zona Oeste", + coords[, 1] < median_x & coords[, 2] < median_y ~ "Zona Sur", + coords[, 1] >= median_x & coords[, 2] < median_y ~ "Zona Este" + ) + ) + +# Paso 5: Crear la matriz de frecuencias de delitos por zona +Freq <- delitos_data_Atl %>% + st_drop_geometry() %>% + group_by(zona) %>% + summarise(across(contains("24"), sum, na.rm = TRUE)) %>% + column_to_rownames("zona") +Freq <- as.matrix(Freq) # No sé por qué no estaba definida como matriz así que la forcé para que no generar errores +Freq <- Freq[, colSums(Freq) > 0] + +print("Matriz de frecuencias:") +print(Freq) + +# Paso 2: Calcular la tabla de frecuencias relativas + +F_rel <- Freq / sum(Freq) +fi <- rowSums(F_rel) +fj <- colSums(F_rel) + +print("Matriz de frecuencias relativas:") +print(F_rel) + +# Paso 3: Cálculo de los totales por fila y columna +fi <- rowSums(F_rel) # Suma por fila +fj <- colSums(F_rel) # Suma por columna + +D_f <- diag(fi) # Matriz diagonal de totales de fila +D_c <- diag(fj) # Matriz diagonal de totales de columna + +# Paso 4: Construcción de la matriz de correspondencias Z +D_f_sqrt_inv <- diag(1 / sqrt(fi)) # Raíz cuadrada inversa de los totales de fila +D_c_sqrt_inv <- diag(1 / sqrt(fj)) # Raíz cuadrada inversa de los totales de columna +is.matrix(Freq) +Z <- D_f_sqrt_inv %*% (F_rel - fi %o% fj) %*% D_c_sqrt_inv + +print("Matriz de correspondencias Z:") +print(Z) + +# Paso 5: Descomposición en Valores Singulares (SVD) +svd_result <- svd(Z) # Descomposición de Z + +U <- svd_result$u # Vectores propios de ZZ' (Filas) +D <- diag(svd_result$d) # Matriz diagonal de valores singulares +V <- svd_result$v # Vectores propios de Z'Z (Columnas) + +print("Matriz U (Vectores propios de ZZ'):") +print(U) + +print("Matriz D (Valores singulares):") +print(D) + +print("Matriz V (Vectores propios de Z'Z):") +print(V) + +# Paso 6: Proyección de filas y columnas en el espacio reducido +dim_reducida <- 2 +C_f <- D_f_sqrt_inv %*% U[, 1:dim_reducida] %*% D[1:dim_reducida, 1:dim_reducida] # Coordenadas filas +C_c <- D_c_sqrt_inv %*% V[, 1:dim_reducida] %*% D[1:dim_reducida, 1:dim_reducida] # Coordenadas columnas + +print("Coordenadas filas en el espacio reducido:") +print(C_f) + +print("Coordenadas columnas en el espacio reducido:") +print(C_c) + +``` + +From the zones crime frequency table we can conclude that the spatial distribution of crime types across Atlántico reveals notable geographic disparities. As expected, robbery against people (HP) emerges as the most prevalent offense in all four zones, but its frequency is strikingly higher in the North, which alone accounts for over 18% of all observed crimes—more than quadruple the rate in the South. The North also leads in nearly every other crime category, including LP, VI, and HC, suggesting it may host higher commercial activity and population density. In contrast, the South shows the lowest overall crime levels, with consistently low relative frequencies across all variables. Meanwhile, the East and West show intermediate patterns: the East stands out for its elevated shares of commerce-related crimes and extortion, while the West shows a balanced mix, with violence and burglary playing a more prominent role. These patterns indicate that interventions may need to be geographically targeted, with the North being a priority area for multifaceted crime prevention strategies, and the South potentially requiring maintenance of low-risk conditions rather than intensive policing. + +The standardized residual matrix (Z) from the correspondence analysis highlights specific deviations from the expected distributions of crime counts across the four zones in Atlántico. In Zona Norte, the strongest positive residual appears for HP, indicating that robbery against people is significantly overrepresented in this zone compared to what would be expected under independence. Other variables such as EX and SE also show positive deviations, suggesting these types of crimes occur more frequently than anticipated in the northern area. On the other hand, LP, VI, and DS have negative residuals, implying these crimes are less prevalent than expected in this zone. +Zona Sur displays the opposite trend, with large positive values for LP and VI, and a strong negative residual for HP. Zona Este shows a balanced pattern but still presents positive residuals for HM and EX, while Zona Oeste maintains moderate positive associations with VI and DS, and negative values for HP, indicating a relatively less intense concentration of robbery against people. +These zone-specific patterns reveal that different areas within Atlántico are characterized by distinct crime profiles. Such insights can guide localized crime prevention policies that account for the particular spatial dynamics of each type of offense. + +# Imputing and Linking Data + + + +```{r} +#| echo: true +#| message: true +#| warning: true + +d <- worldclim_country(country = "Colombia", var = "tmin", + path = tempdir()) +terra::plot(mean(d), plg = list(title = "Min. temperature (C)")) +``` + +## Cropping, masking, and aggregating raster data + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Cropping +sextent <- terra::ext(delitos_data_Atl) +d <- terra::crop(d, sextent) +plot(d) +``` +The resolution of this map is so low that Atlántico is in only one pixel. + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Masking +d <- terra::mask(d, vect(delitos_data_Atl)) +plot(d) +``` + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Aggregating +d <- terra::aggregate(d, fact = 20, fun = "mean", na.rm = TRUE) +plot(d) +``` + +## Extracting raster values at points + +```{r} +#| echo: true +#| message: true +#| warning: true + +points <- st_coordinates(delitos_data_Atl) + +# Convert SpatRaster to data frame for ggplot2 +raster_df <- as.data.frame(d, xy = TRUE) +colnames(raster_df)[3] <- "tmin" # Rename the temperature column + +# Get the centroid coordinates +points_df <- st_coordinates(st_geometry(delitos_data_Atl)) +points_df <- as.data.frame(points_df) +colnames(points_df) <- c("x", "y") + +# Create the plots with ggplot2 +ggplot() + + geom_raster(data = raster_df, aes(x = x, y = y, fill = tmin)) + + scale_fill_viridis_c(name = "Min Temp (C)") + + geom_sf(data = delitos_data_Atl, color = "black", fill = "transparent") + # Plot the polygon borders + geom_point(data = points_df, aes(x = x, y = y), color = "red", size = 0.01, shape = 1, alpha = 0.5) + # Plot the centroids as transparent circles + labs(title = "Aggregated Minimum Temperature with Polygons and Centroids") + + theme_minimal() +``` + +# How do robberies to people and socioeconomic strata correlate? + +```{r} +#| echo: true +#| message: true +#| warning: true + +# Variables del Dane Censo 2018 +manzanas <- st_read("C:/Users/laura/Downloads/SHP_MGN2018_INTGRD_MANZ/MGN_ANM_MANZANA.shp", quiet = TRUE) + +# 2) Echar un vistazo a los nombres de columna +names(manzanas) + +# 1) Extrae los códigos únicos de cada fuente +cod_delitos <- delitos_data %>% st_drop_geometry() %>% pull(manz_ccnct) %>% unique() +cod_manzanas <- manzanas %>% st_drop_geometry() %>% pull(COD_DANE_A) %>% unique() + +# 2) Comprueba el tipo de datos (carácter vs. numérico) y unifícalos si es necesario +class(cod_delitos) +class(cod_manzanas) + +# 3) Intersección y diferencias +comunes <- intersect(cod_delitos, cod_manzanas) +solo_del <- setdiff(cod_delitos, cod_manzanas) # códigos en delitos que NO están en manzanas +solo_manz <- setdiff(cod_manzanas, cod_delitos) # códigos en manzanas que NO están en delitos + +length(comunes) # cuántos códigos sí hacen match +length(solo_del) # cuántos quedan sin correspondencia en delitos +length(solo_manz) # cuántos quedan sin correspondencia en manzanas + +# Proporción de códigos de delitos que efectivamente existen en manzanas +length(comunes) / length(cod_delitos) + +# 4) (Opcional) Visualizar unos ejemplos de no-coincidentes +head(solo_del) +head(solo_manz) + +delitos_data$dpto_ccdgo %>% unique() # Making sure delitos data has information of my three departments + +# 2) Selecciona sólo la clave y las variables de interes +estratos_dane <- manzanas %>% + # si no quieres arrastrar la geometría a tu tabla final, + # puedes st_drop_geometry() aquí primero + dplyr::select( + COD_DANE_A, + TP9_3_1_NO, # Conteo de unidades no residénciales con uso Industria + TP9_3_2_NO, # Conteo de unidades no residénciales con uso Comercio + TP9_1_USO, # Conteo de unidades con uso vivienda + TP19_EE_E1, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 1 + TP19_EE_E2, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 2 + TP19_EE_E3, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 3 + TP19_EE_E4, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 4 + TP19_EE_E5, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 5 + TP19_EE_E6, # Conteo de viviendas que reportan recibir facturación de energía eléctrica en Estrato 6 + TP27_PERSO, # Número de personas + TP32_1_SEX, # Conteo de hombres + TP32_2_SEX, # Conteo de mujeres + TP51PRIMAR, # Conteo de personas donde el nivel educativo del último año alcanzado es Preescolar - Pre jardín Y Básica primaria + TP51SECUND, # Conteo de personas donde el nivel educativo del último año alcanzado es Básica secundaria 3...6 + TP51SUPERI, # Conteo de personas donde el nivel educativo del último año alcanzado es Técnica profesional 1 año 7...9 + TP51POSTGR, # Conteo de personas donde el nivel educativo del último año alcanzado es Especialización 1 año…maestría doctorado + TP51_13_ED # Conteo de personas donde el nivel educativo del último año alcanzado es Ninguno + ) + +# 4) Haz el join espacial (o no espacial, si delitos_data ya es sf con manz_ccnct) +# Usamos left_join para conservar todas las filas de delitos_data +delitos_data_estrato <- delitos_data %>% + left_join( + st_drop_geometry(manzanas), + by = c("manz_ccnct" = "COD_DANE_A") + ) + +# Hacer modelo de regresión incluyendo estratos + +model <- lm(sum_24HP + ~ TP9_3_1_NO + + TP9_3_2_NO + + TP9_1_USO + + TP19_EE_E1 + + TP19_EE_E2 + + TP19_EE_E3 + + TP19_EE_E4 + + TP19_EE_E5 + + TP19_EE_E6 + # + TP27_PERSO - Se quita por análisis VIF + # + TP32_1_SEX - Se quita por análisis VIF + # + TP32_2_SEX - Se quita por análisis VIF + # + TP51PRIMAR - Se quita por análisis VIF + # + TP51SECUND - Se quita por análisis VIF + # + TP51SUPERI - Se quita por análisis VIF + # + TP51POSTGR - Se quita por análisis VIF + # + TP51_13_ED - Se quita por análisis VIF + , data = data.frame(delitos_data_estrato)) +summary(model) + +vif(model) +alias(model) + +delitos_data_estrato <- delitos_data_estrato %>% + select(sum_24HP, + TP9_3_1_NO, + TP9_3_2_NO, + TP9_1_USO, + TP19_EE_E4, + TP19_EE_E5, + TP19_EE_E6) + +delitos_data_estrato %>% + st_drop_geometry() %>% + cor(., method = "spearman", use = "complete.obs") %>% + round(., 3) %>% + print(.) %>% + corrplot(., method = "color", title = "Spearman Correlation", mar=c(0,0,1,0)) + +``` +For the linear regression model aimed at explaining the variability in robberies against people (sum_24HP), a range of variables related to land use, socioeconomic strata, and demographic and educational characteristics were initially included. However, some variables were excluded from the final model due to multicollinearity issues or because they were not statistically significant, as indicated by the Variance Inflation Factor (VIF) analysis and the estimated coefficients. + +Specifically, demographic and educational variables such as the total number of people (TP27_PERSO), the number of men (TP32_1_SEX), women (TP32_2_SEX), and the different levels of education (TP51PRIMAR, TP51SECUND, TP51SUPERI, TP51POSTGR, TP51_13_ED) were removed. These variables either contributed to instability in the model or did not provide significant explanatory power. Their exclusion improves model parsimony, enhances interpretability, and avoids redundancy among highly correlated predictors. + +```{r} +#| echo: true +#| message: false +#| warning: false + +# Covariance matrix +cm_delitos_data <- delitos_data_estrato %>% + st_drop_geometry() %>% + na.omit() %>% # elimina filas con NA + cov() + +# Compute eigenvalues and eigenvectors +eigen_results <- cm_delitos_data %>% eigen() + +# Extract eigenvalues and eigenvectors +eigenvalues <- eigen_results$values +eigenvectors <- eigen_results$vectors + +# Display eigenvalues and eigenvectors +print(eigenvalues) +head(eigenvectors) + +# The Smallest Eigenvalues +sort(eigenvalues, decreasing = FALSE) + +# The smallest eigenvalue is approximately zero +smallest_eigenvalue <- min(eigenvalues) +print(smallest_eigenvalue) + +# Corresponding eigenvector +smallest_eigenvector <- eigenvectors[, which.min(eigenvalues)] +print(smallest_eigenvector) + +# Normalize the eigenvector by dividing by the largest absolute value +normalized_eigenvector <- smallest_eigenvector / max(abs(smallest_eigenvector)) +print(normalized_eigenvector) + +# Sorted normalize the eigenvector +sort(abs(normalized_eigenvector), decreasing = T) + +# Get numeric variable names (order matches eigenvector indices) +variable_names <- colnames(cm_delitos_data) + +# Sort normalized eigenvector by absolute contribution (descending order) +sorted_contributions <- sort(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the indices of the top contributions +top_indices <- order(abs(normalized_eigenvector), decreasing = TRUE) + +# Get the names of the top variables +top_variable_names <- variable_names[top_indices] + +# Print the top variable names +print(top_variable_names) + +``` +The Spearman correlation analysis between reported robberies against individuals (sum_24HP) and various socioeconomic variables reveals meaningful patterns in the Atlántico dataset. Notably, sum_24HP shows a moderate positive correlation with the number of plots used for industrial purposes (TP9_3_2_NO, ρ = 0.206), the number of registered households (TP9_1_USO, ρ = 0.193), and plots used for commercial purposes (TP9_3_1_NO, ρ = 0.132). These correlations suggest that areas with higher densities of industrial and commercial activity, as well as more households, tend to report more robberies against people. These land-use patterns may indicate busier areas with more targets and opportunities for such crimes. + +The association strengthens when examining neighborhoods by socioeconomic strata. Higher correlations are observed with households in strata 4 (TP19_EE_E4, ρ = 0.221), strata 5 (TP19_EE_E5, ρ = 0.201), and to a lesser extent strata 6 (TP19_EE_E6, ρ = 0.139). This trend suggests that robberies are more frequent in middle- and upper-middle-income neighborhoods, potentially due to the increased presence of valuable goods or the movement of people in and out of those zones. + +A principal component analysis (PCA) on the covariance matrix of the same variables (after omitting missing values) provides further insight. The smallest eigenvalue is approximately 1.15, indicating no critical issues with multicollinearity or redundancy among variables—each dimension contributes some unique variance. The eigenvector associated with this smallest eigenvalue is dominated by the TP9_3_1_NO (commercial use) variable, followed by TP9_3_2_NO (industrial use) and sum_24HP itself. This confirms that the commercial and industrial land use categories capture unique variance that is not fully explained by the other variables in the model. + +Together, these findings suggest that robbery against individuals in Atlántico is partially structured by the type of land use and the socioeconomic profile of each block. While multivariable models capture more variance overall, commercial and industrial presence, along with middle-income strata, stand out as the most influential contributors when interpreting patterns of this specific crime. + +# Prompts + + +- How would you complete the analysis given these results? +- Thanks for the analysis, could you please include into the text something like: this means that we should focus the preventive campaign on the few polygons we have crime concentrated. +- Prompt: can you help me refining this one? Robberies in Atlántico data have a skewness of 10.0324, this reinforces what we could see from previous analysis: the distribution has a long tail to the right side. In other words, the distribution is very asymmetric around the mean. +- thanks! What do you conclude over this: Covariance Matrix (showed it to ChatGPT) +- could you help me polish this text based on the analysis you just did? (not copying the text here because would be repetitive, but I gave it the analysis I did. Then checked its analysis and changed a few things) +- what would ypu add to this text based on previous analysis? "The Median Absolute Deviation (MAD) is a robust measure of variability that quantifies the dispersion of a dataset, in this case, as it is 88,41, indicates that relative variability of robbery data is very high". +- thanks for the advise, I will take it into account. What else would you say from theses statistic measures: Data coefficient of variation of 3712 and variation of 3.71 indicate, as we have been confirming through the full analysis, that robbery data varies a lot relatively from its mean. But not as much as other crime variables do.?? +- look at the data for other variables Crime Type Variation (showed the data to ChatGPT) +- Me gustaría añadir que es altamente probables que los polígonos que concentren hurto a personas también concentren las demás variables. Por eso es importante hacer un análisis de correlación y además de variabilidad espacial. ¿me ayudas incluyéndolo? +- thanks! What do you conclude over this: Covariance Matrix (showed it to ChatGPT) +- could you help me polish this text based on the analysis you just did? (not copying the text here because would be repetitive, but I gave it the analysis I did. Then checked its analysis and changed a few things) +- Can you please help me making a list of all prompts I've gave you for the development of my crime report? +-"Spatial weights matrix using Queen contiguity..." +-"Display the first 10 rows of spatial weights..." +-"Convert the neighbor list to a spatial weights list object..." +-"What do you interpret about Queen contiguity producing mostly NULL weights?" +-"Help me write a paragraph analyzing this..." +-"Which file should I use from DANE to match my polygons?" +-"I want to use variables like TP19_EE_E1 to TP19_EE_E6 (estratos)..." +-"How can I check if manz_ccnct and COD_DANE_A correspond?" +-"Can I join them given that 97% match?" +-"Help me join the DANE variables to delitos_data..." +-"I get an error in select(): arguments not used..." +-"How can I fix this select error?" +-"Assign quadrant zones using median coordinates..." +-"Calculate centroids of polygons..." +-"Create zone labels like Zona Norte, Zona Este..." +-"I get an error in st_centroid: no applicable method for 'function'..." +-"Fix error in assigning zones..." +-"Create matrix of crime frequencies by zone..." +-"Why does rownames assignment fail?..." +-"What is the corrected version of my code for creating Freq?" +-"Print relative frequency matrix and perform Chi-squared decomposition..." +-"What can we interpret from the frequency matrix?" +-"Which crimes dominate in each zone?" +-"Write a paragraph based on the correspondence matrix..." +-"Interpret linear regression results predicting HP from DANE variables..." +-"Why am I getting error in eigen() calculation?" +-"Spearman correlation matrix analysis with variables TP9_3_1_NO, etc." +-"What does the eigenvector of smallest eigenvalue tell me?" +-"I made a mistake in variable labels (TP9_3_2_NO is industrial usage)..." +-"Please repeat the analysis with the correct variable names and in paragraphs..." +-"What is the importance of adding external data to crime analysis?" +-"Help me write a paragraph interpreting the correspondence matrix..." +-"Help me write a paragraph explaining the importance of socio-economic data..." +-"Summarize conclusions based on eigenvalue decomposition..." +-"Help me debug why I'm getting this error in assigning rownames..." +-"Is my print(Freq) command not working?..." +-"Is my frequency matrix not a matrix?" +-"Why does my code for Z matrix multiplication fail?" + +