From ba19188b6a6277f154915aae4bcc84add38b93a0 Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 25 Jul 2021 18:19:12 -0400 Subject: [PATCH 01/12] refactored components --- R/modeling_configs.R | 17 ++++++ R/sim_configs.R | 107 ++++++++++++++++++++++++++++++++++++++ R/test/test_sim_configs.R | 18 +++++++ 3 files changed, 142 insertions(+) create mode 100644 R/modeling_configs.R create mode 100644 R/sim_configs.R create mode 100644 R/test/test_sim_configs.R diff --git a/R/modeling_configs.R b/R/modeling_configs.R new file mode 100644 index 0000000..8557ec7 --- /dev/null +++ b/R/modeling_configs.R @@ -0,0 +1,17 @@ +#' Returns SIRD configurations for model 'twitter_sird' that use tweets and +#' not. ODE solver not specified +#' @param run_df The dataframe that is being copied and built up. +model_stan_SIRDs <- function(run_df) { + + model_no_tweets_df <- copy(run_df) + model_no_tweets_df$apply_twitter_data <- 0 + model_no_tweets_df$model_to_run <- 'twitter_sird' + model_no_tweets_df$description <- paste0(model_no_tweets_df$description, + "no tweets") + model_tweets_df <- copy(run_df) + model_tweets_df$apply_twitter_data <- 1 + model_tweets_df$model_to_run <- 'twitter_sird' + model_tweets_df$description <- paste0(model_tweets_df$description, + "use tweets") + return(rbind(model_no_tweets_df, model_tweets_df)) +} \ No newline at end of file diff --git a/R/sim_configs.R b/R/sim_configs.R new file mode 100644 index 0000000..6694563 --- /dev/null +++ b/R/sim_configs.R @@ -0,0 +1,107 @@ +source(here::here("R","util.R")) +source(here::here("R","SIRTDsim.R")) + +# Each run is a row in runDf, the rows contain all information necessary to run +# an experiment. The data in the columns can vary based on whether a sim is being +# run, real data etc. +# Brazil data +# Simulated data +# +# section 1 +#Globals + +#' Returns a value +- the variation param around a mean value. +#' @param mean +#' @param variation +#' @return +draw_unif_prob <- function(mean, variation) { + if (mean > 1 || mean < 0) { + stop(sprintf("mean out of range for probability in draw_unif_prob %d", + mean)) + } + return (round(runif(n = 1, + min = max(mean - variation, 0), + max = min(mean + variation, 1)),3)) +} + +#' Takes source_df with values for simulation parameters and draws n_sims +#' times from uniform values +- .2 for Beta, Lambda, .05 for death_prob, +#' and so on, look to source for details in 'R/data_config.R' +#' @param n_sims number of draws for params +#' @param source_df run_df with point estimates that will center draws +#' @return run_df with n_sims number of draws, source_df not included +sim_draw_params <- function(n_sims, source_df) { + last_sim_id <- source_df$sim_run_id + return_df <- data.frame() + for (j in 1:n_sims) { + drawn_df <- copy(source_df) + last_sim_id <- last_sim_id + 1 + drawn_df$sim_run_id <- last_sim_id + drawn_df$beta_mean <- draw_unif_prob(mean = source_df$beta_mean, + variation = 0.2) + drawn_df$beta_daily_rate <- list(rep(drawn_df$beta_mean, + drawn_df$n_days)) + drawn_df$gamma <- draw_unif_prob(mean = source_df$gamma, + variation = 0.2) + drawn_df$tweet_rate <- draw_unif_prob(mean = source_df$tweet_rate, + variation = .25) + drawn_df$death_prob <- draw_unif_prob(mean = source_df$death_prob, + variation = .02) + sim_df <- sirtd_vary_beta_exact(seed = drawn_df$seed, + n_pop = drawn_df$n_pop, + n_days = drawn_df$n_days, + print = FALSE, + beta_daily_inf_rates = + unlist(drawn_df$beta_daily_rate), + gamma_res_per_day_rate = drawn_df$gamma, + tweet_rate_infected = drawn_df$tweet_rate, + mean_days_to_death_from_t = drawn_df$days2death, + n_patient_zero = drawn_df$n_patient_zero, + death_prob = drawn_df$death_prob) + drawn_df$s <- list(sim_df$s) + drawn_df$i <- list(sim_df$i) + drawn_df$r <- list(sim_df$r) + drawn_df$t <- list(sim_df$t) + drawn_df$d <- list(sim_df$d) + drawn_df$tweets <- list(sim_df$tweets) + return_df <- rbind(return_df, drawn_df) + } + return(return_df) +} + +#' Mirrors Brazil's observerd deaths/tweets roughly with simulation and given +#' params, no model configuration/reporting configured +#' @return dataframe for use in runEval.R +sim_brazil_1 <- function (source_df) { + run_df <- copy(source_df) + run_df$beta_mean <- .30 + run_df$beta_daily_rate <- list(rep(run_df$beta_mean, + run_df$n_days)) + run_df$gamma <- .255 + run_df$death_prob <- .01 + run_df$tweet_rate <- .25 + run_df$days2death <- 10 + run_df$n_patient_zero <- 10 + run_df$description <- "Brazil 1 approximation" + run_df$sim_run_id <- 1 + sim_df <- sirtd_vary_beta_exact(seed = run_df$seed, + n_pop = run_df$n_pop, + n_days = run_df$n_days, + print = FALSE, + beta_daily_inf_rates = + unlist(run_df$beta_daily_rate), + gamma_res_per_day_rate = run_df$gamma, + tweet_rate_infected = run_df$tweet_rate, + mean_days_to_death_from_t = run_df$days2death, + n_patient_zero = run_df$n_patient_zero, + death_prob = run_df$death_prob) + run_df$s <- list(sim_df$s) + run_df$i <- list(sim_df$i) + run_df$r <- list(sim_df$r) + run_df$t <- list(sim_df$t) + run_df$d <- list(sim_df$d) + run_df$tweets <- list(sim_df$tweets) + return(run_df) +} + +#sim_brazil_1() diff --git a/R/test/test_sim_configs.R b/R/test/test_sim_configs.R new file mode 100644 index 0000000..e94e862 --- /dev/null +++ b/R/test/test_sim_configs.R @@ -0,0 +1,18 @@ +library(testthat) + +source(here::here('R','sim_configs.R')) +source(here::here('R','util.R')) + + +test_that("sim_draw_params test", { + run_df <- setup_run_df(seed = 123, n_pop = 100000, n_days = 40) + brazil_df <- sim_brazil_1(run_df) + draws_run_df <- sim_draw_params(n_sim = 2, source_df = brazil_df) + expect_equal(draws_run_df[1,]$gamma != draws_run_df[2,]$gamma, TRUE) +}) + +test_that("Brazil 1 test", { + run_df <- setup_run_df(seed = 123, n_pop = 100000, n_days = 40) + brazil_df <- sim_brazil_1(run_df) + expect_equal(brazil_df$seed, 123) +}) \ No newline at end of file From 2c25a6ea3b8bf540eefdb0295e2ce96cfa26e3e7 Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 25 Jul 2021 18:20:16 -0400 Subject: [PATCH 02/12] updated reorg o fsim code/site --- R/runEval.R | 243 ++---- R/util.R | 52 +- runEval.Rmd | 223 +++-- runEval.html | 1990 ++++++++++++++++++++++++++++++++++-------- stan/tweet_sird.stan | 4 +- 5 files changed, 1845 insertions(+), 667 deletions(-) diff --git a/R/runEval.R b/R/runEval.R index 48073ec..07e421f 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -1,162 +1,54 @@ +# dependencies library(tidyverse) library(cmdstanr) library(data.table) library(kableExtra) -rm(list = ls()) -loadBrazil = FALSE -loadSim = TRUE + source(here::here("R","util.R")) source(here::here("R","SIRTDsim.R")) - -# Each run is a row in runDf, the rows contain all information necessary to run -# an experiment. The data in the columns can vary based on whether a sim is being -# run, real data etc. -# Brazil data -# Simulated data -# -# section 1 -#Globals -seed = 93435 -n_pop <- 20000 -n_days <- 70 - -# Set up our template, all runs need this info -# Each row of dataframe is a separate run -template_df <- data.frame(sim_run_id = c(NA)) -template_df$description <- NA -template_df$seed <- seed - -# any simulation/model run -template_df$n_pop <- n_pop -template_df$n_days <- n_days - -# any model setup -template_df$model_to_run <- 'none' -template_df$compute_likelihood <- NA - -#setup data columns -template_df$s <- list(c()) -template_df$i <- list(c()) -template_df$r <- list(c()) -template_df$t <- list(c()) -template_df$d <- list(c()) -template_df$tweets <- list(c()) - -# setup prediction columns -template_df$d_in_interval <- NA_integer_ -template_df$tweets_in_interval <- NA_integer_ - -# section 1 -# section 2 -# setup for current simulations - -set.seed(template_df$seed) -n_runs <- 2 -SIRTD_sim_df = copy(template_df) -if (n_runs > 1) { - for (j in 2:n_runs) { - SIRTD_sim_df <- rbind(SIRTD_sim_df,copy(template_df)) - } -} - -SIRTD_sim_df$beta_mean <- runif(n_runs, 0.2, 0.3) -SIRTD_sim_df$beta_daily_rate <- list(rep(SIRTD_sim_df$beta_mean, - SIRTD_sim_df$n_days)) -SIRTD_sim_df$gamma <- runif(n_runs, 0.1, 0.2) -SIRTD_sim_df$death_prob <- runif(n_runs, 0.01, 0.03) -SIRTD_sim_df$tweet_rate <- runif(n_runs, 0.5, 1.5) -SIRTD_sim_df$days2death <- 20 -SIRTD_sim_df$n_patient_zero <- 20 -SIRTD_sim_df$description <- sprintf("SIRTD sim: %d runs", n_runs) -SIRTD_sim_df$sim_run_id <- 1:n_runs -# section 2 -# section 3 -for (j in 1:n_runs) { - sim_df <- sirtd_vary_beta(seed = SIRTD_sim_df[j,]$seed, - n_pop = SIRTD_sim_df[j,]$n_pop, - n_days = SIRTD_sim_df[j,]$n_days, - print = FALSE, - beta_daily_inf_rates = - unlist(SIRTD_sim_df[j,]$beta_daily_rate), - gamma_res_per_day_rate = SIRTD_sim_df[j,]$gamma, - tweet_rate_infected = SIRTD_sim_df[j,]$tweet_rate, - mean_days_to_death_from_t = SIRTD_sim_df[j,]$days2death, - n_patient_zero = SIRTD_sim_df[j,]$n_patient_zero, - death_prob = SIRTD_sim_df[j,]$death_prob) - SIRTD_sim_df[j,]$s <- list(sim_df$s) - SIRTD_sim_df[j,]$i <- list(sim_df$i) - SIRTD_sim_df[j,]$r <- list(sim_df$r) - SIRTD_sim_df[j,]$t <- list(sim_df$t) - SIRTD_sim_df[j,]$d <- list(sim_df$d) - SIRTD_sim_df[j,]$tweets <- list(sim_df$tweets) -} -# section 3 -# section 4 -# setup for current model -model_no_tweets_df <- copy(SIRTD_sim_df) -model_no_tweets_df$apply_twitter_data <- 0 -model_no_tweets_df$model_to_run <- 'twitter_sird' -model_no_tweets_df$description <- paste0(model_no_tweets_df$description, - " no tweets") - -model_tweets_df <- copy(SIRTD_sim_df) -model_tweets_df$apply_twitter_data <- 1 -model_tweets_df$model_to_run <- 'twitter_sird' -model_tweets_df$description <- paste0(model_tweets_df$description, - " tweets") -run_df <- rbind(model_no_tweets_df, model_tweets_df) -run_df$ode_solver <- 'block' -run_df$compute_likelihood <- 1 -run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', - 'plot','param_recovery')) -# section 4 - - -# if (loadBrazil) { -# tweets = read.csv(here::here("data","tweet_count.csv")) -# colnames(tweets) = c('dateT','predicted') -# brazilDeaths = readRDS(here::here("data","brazil_nation.rds")) -# -# dataDf = data.frame(date = brazilDeaths[1:354,]$date, -# d = brazilDeaths[1:354,]$last_available_deaths) -# tweetsPadded = rbind(data.frame(dateT = rep(NA,106), -# predicted = rep(0,106)), -# tweets) -# dataDf = cbind(dataDf,tweetsPadded) -# colnames(dataDf) = c('date','d','dateT','tweets') -# dataDf = dataDf %>% mutate(perc_d = d/nPop) %>% -# mutate(perc_t = tweets/nPop) -# dataDf = dataDf[1:nDays,] #doing 2020 only -# } - -# section 5 +source(here::here("R","sim_configs.R")) +source(here::here("R","modeling_configs.R")) +# dependencies +# setup run_df +run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) # in R/util.R +run_df <- sim_brazil_1(run_df) # in R/sim_configs.R +draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R +run_df <- rbind(run_df, draws_run_df) + +run_df <- model_stan_SIRDs(run_df) #in R/modeling_configs.R +run_df$ode_solver <- 'block' # set ODE solver in Stan program across all runs +run_df$compute_likelihood <- 1 # compute likelihood across all runs +run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) +# setup run_df +# run models for (j in 1:nrow(run_df)) { + fit <- NA if (run_df[j,]$model_to_run == 'twitter_sird') { - stan_data <- - list(n_days = run_df[j,]$n_days, - sDay1 = run_df[j,]$n_pop - 1, - iDay1 = 1, - rDay1 = 0, - dDay1 = 0, - NPop = run_df[j,]$n_pop, - tweets = unlist(run_df[j,]$tweets), - deaths = unlist(run_df[j,]$d), - compute_likelihood = run_df[j,]$compute_likelihood, - run_twitter = run_df[j,]$apply_twitter_data, - run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0), - run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0)) - model <- cmdstan_model(here::here("stan", "tweet_sird.stan")) - - fit <- model$sample(data=stan_data, - parallel_chains = 4, - iter_warmup = 1000, - iter_sampling = 1000, - chains = 4, - seed = 4857) - run_df[j,]$fit = list(fit) - } - else if (run_df[j,]$model_to_run == 'tweet_sird_negbin_optimized') { - stan_data_2 <- list(n_days = run_df[j,]$n_days, + stan_data <- + list(n_days = run_df[j,]$n_days, + sDay1 = run_df[j,]$n_pop - 1, + iDay1 = 1, + rDay1 = 0, + dDay1 = 0, + NPop = run_df[j,]$n_pop, + tweets = unlist(run_df[j,]$tweets), + deaths = unlist(run_df[j,]$d), + compute_likelihood = run_df[j,]$compute_likelihood, + run_twitter = run_df[j,]$apply_twitter_data, + run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0), + run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0)) + model <- cmdstan_model(here::here("stan", "tweet_sird.stan")) + + fit <- model$sample(data=stan_data, + parallel_chains = 4, + iter_warmup = 1000, + iter_sampling = 1000, + chains = 4, + seed = 4857) + run_df[j,]$fit = list(fit) + } + else if (run_df[j,]$model_to_run == 'tweet_sird_negbin_optimized') { + stan_data_2 <- list(n_days = run_df[j,]$n_days, y0 = c(run_df[j,]$n_pop - run_df[j,]$n_patient_zero, run_df[j,]$n_patient_zero, 0, 0, 0), # one more zero here t0 = 0, @@ -166,32 +58,32 @@ for (j in 1:nrow(run_df)) { death_count = run_df[j,]$d, symptomaticTweets = run_df[j,]$tweets, trapezoidal_solver = 0) - model2 <- cmdstan_model(here::here("stan", "tweet_sird_negbin_optimized.stan")) - fit <- model2$sample(data=stan_data_2, - parallel_chains = 4, - iter_warmup = 1000, - iter_sampling = 1000, - chains = 4, - seed = 4857) - } - if (!is.na(run_df[j,]$model_to_run)) { - d_tweets_in_interval = countPredictionsInQuantile(fit = fit, - run_df = run_df, - j = j, print = TRUE) - run_df[j,]$d_in_interval = d_tweets_in_interval[1] - run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] - } - else { - print(sprintf("no model selected, got:'%s'",run_df[j,]$model_to_run)); - } -# section 5 + model2 <- cmdstan_model(here::here("stan", "tweet_sird_negbin_optimized.stan")) + fit <- model2$sample(data=stan_data_2, + parallel_chains = 4, + iter_warmup = 1000, + iter_sampling = 1000, + chains = 4, + seed = 4857) + } + if (run_df[j,]$model_to_run != 'none') { + d_tweets_in_interval = countPredictionsInQuantile(fit = fit, + run_df = run_df, + j = j, print = TRUE) + run_df[j,]$d_in_interval = d_tweets_in_interval[1] + run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] + } +# run models + else { + print(sprintf("no model selected, got:'%s'",run_df[j,]$model_to_run)); + } # section 6 plot <- ggplot(data = NULL, aes(x = day, y = count)) if ('graph_sim' %in% unlist(run_df[j,]$reports)) { - plot <- plot + graph_sim_data(data_df = run_df[j,]) + plot <- plot + graph_sim_data(data_df = run_df[j,], hide_s = TRUE) } if ('graph_ODE' %in% unlist(run_df[j,]$reports)) { - plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit) + plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit, hide_s = TRUE) } if ('graph_tweets' %in% unlist(run_df[j,]$reports)) { plot <- plot_predictions(plot = plot, prediction_label = 'pred_tweets', @@ -215,10 +107,7 @@ for (j in 1:nrow(run_df)) { } # section 8 summary_cols = c('sim_run_id', 'model_to_run', 'beta_mean', 'gamma', 'death_prob', - 'tweet_rate', 'days2death', 'ode_solver', 'description', - 'd_in_interval', 'tweets_in_interval') + 'tweet_rate', 'days2death', 'ode_solver', 'description', + 'd_in_interval', 'tweets_in_interval') print(run_df[,summary_cols]) # section 8 - -# saveRDS(runDf,here::here("R",sprintf("%d_of_%devalBrazil622.rds",i,nrow(runDf)))) - diff --git a/R/util.R b/R/util.R index 0b3a066..156c0a1 100644 --- a/R/util.R +++ b/R/util.R @@ -3,8 +3,6 @@ #' @param truth vector of true or simulated values #' @param fitPredDf dataframe with quantiles specified #' - - countPredInInterval <- function(truth = truth, fitPredDf = fitPredDf, maxQuantileLabel = maxQuantileLabel, minQuantileLabel = minQuantileLabel) { @@ -65,7 +63,8 @@ countPredictionsInQuantile <- function(fit, run_df, j, print = FALSE) { #' of the runEval framework. Returns a ggplot geom_point element with x = days #' y = count. #' @param data_df one row of the run_df with simulation data added -graph_sim_data <- function(data_df) { +#' @param hide_s Boolean to control whether to hide the s or susceptible counts +graph_sim_data <- function(data_df, hide_s) { sim_df = data.frame(day = 1:data_df$n_days, tweets = unlist(data_df$tweets), s = unlist(data_df$s), @@ -73,7 +72,11 @@ graph_sim_data <- function(data_df) { r = unlist(data_df$r), t = unlist(data_df$t), d = unlist(data_df$d)) + compartment_names <- c('s', 'i', 'r', 't', 'd') + if (hide_s) { + compartment_names <- compartment_names[-1] + } sim_long_df = gather(data = sim_df, key = "compartment_sim", value = "count", all_of(c('tweets', compartment_names))) return(geom_point(data = sim_long_df, aes(y = count, @@ -84,9 +87,13 @@ graph_sim_data <- function(data_df) { #' Graph daily ODE means from SIRTD model. Returns a ggplot geom_line element #' @param data_df A row from run_df #' @param fit A fit object returned by cmdstanR with ode_states defined -graph_ODE <- function(data_df, fit) { +#' @param hide_s Boolean to control whether to hide susceptible counts +graph_ODE <- function(data_df, fit, hide_s) { results_ODE_df = fit$summary(variables = c('ode_states')) compartment_names = c('s', 'i', 'r', 't', 'd') + if (hide_s) { + compartment_names <- compartment_names[-1] + } ODE_df = data.frame(matrix(results_ODE_df$median, nrow = data_df$n_days, ncol = length(compartment_names))) colnames(ODE_df) = compartment_names @@ -146,4 +153,39 @@ param_recovery <- function(data_df, fit) { recov_pars[recov_pars$variable == 'lambda_twitter',]$mean, recov_pars[recov_pars$variable == 'lambda_twitter',]$sd), "\n")) -} \ No newline at end of file +} + +#' Generates the starting template for the dataframe that runs the experiments +#' @param seed The random seed for random processes in R +#' @param n_pop Population size, should be constant across runs +#' @param n_days Number of days to run, assumed to be constant across runs +#' @return dataframe appropriate for use in runEval.R +setup_run_df <- function(seed, n_pop, n_days) { + # Set up our template, all runs need this info + # Each row of dataframe is a separate run + template_df <- data.frame(sim_run_id = c(NA)) + template_df$description <- NA + template_df$seed <- seed + + # any simulation/model run + template_df$n_pop <- n_pop + template_df$n_days <- n_days + + # any model setup + template_df$model_to_run <- 'none' + template_df$compute_likelihood <- NA + + #setup data columns + template_df$s <- list(c()) + template_df$i <- list(c()) + template_df$r <- list(c()) + template_df$t <- list(c()) + template_df$d <- list(c()) + template_df$tweets <- list(c()) + + # setup prediction columns + template_df$d_in_interval <- NA_integer_ + template_df$tweets_in_interval <- NA_integer_ + set.seed(template_df$seed) + return(template_df) +} diff --git a/runEval.Rmd b/runEval.Rmd index 4ed9952..330becf 100644 --- a/runEval.Rmd +++ b/runEval.Rmd @@ -1,5 +1,5 @@ --- -title: "SIRTD Data Generating Processes" +title: "Simulate/Run/Evaluate Code for SIRTD models" author: - "Breck Baldwin" - "Jose Storopoli" @@ -15,160 +15,166 @@ set.seed(4857) # Introduction -This page contains data generating programs for a common class of epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered --SIR models. This page/code adds compartments for (T) terminal and (D) dead counts useful for the grim task of distinguishing the dead from recovered. There is additionally data generated for tweets but that exists in parallel with the core SIRTD model. +This page describes the simulation, modeling and evaluation code in `R/runEval.R` The goal is to provide sufficient information to run and modify the code. The focus for the current data generating process is SIRTD epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered --SIR models. The current simulations add compartments for (T) terminal and (D) dead counts useful for the grim task of distinguishing the dead from recovered. There is additionally data generated for tweets from individuals in the infected compartment but that exists separate from the core SIRTD model. For more explanation about SIR models see [https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html](https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html) which we found to be a good explanation as well as an excellent introduction to Bayesian approaches using Stan. This is the model used to fit this data, there is a model reproduction check list at [https://codatmo.github.io/Simple_SIR/](https://codatmo.github.io/Simple_SIR/). # Navigation -This Rmarkdown page exists as part of the [CoDatMo](https://codatmo.github.io/) project. The repo containing this page is at [https://github.com/codatmo/dataGeneratingProcess1/](https://github.com/codatmo/dataGeneratingProcess/) as 'runEval.Rmd' and rendered as 'runEval.html'. +This Rmarkdown page exists as part of the [CoDatMo](https://codatmo.github.io/) project. The repo containing this page is at [https://github.com/codatmo/dataGeneratingProcess1/](https://github.com/codatmo/dataGeneratingProcess/) as `runEval.Rmd` and rendered as `runEval.html`. # Data Generating Process -The data generating process simulates a fixed population at the individual level on a daily basis. Larger or smaller uniform units could be used. +The data generating process simulates a fixed population at the individual level on a daily basis. There are a series of simulating DGPs with varied properties. This document covers a DGP that closely mirrors the compartment model structure with real valued compartments with population transfers between states computed via real valued multiplication on each day of the simulation. There is no stochastic element at all in this simulation since it serves as a baseline for more complex models which will have stochastic (random) elements as well as operating on different scales, e.g., agent-based models. - +cat(pull_section('# sirtd_vary_beta_exact', here::here('R','SIRTDsim.R'))) -The actual code is shown below, the Rmarkdown commands are included to be clear about where code is and how the document is being generated. -```{r echo=TRUE } -cat(paste(readLines(here::here("R","SIRTDsim.R")), "\n")) ``` -The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at [to be determined](). +The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at [to be determined](). There exists or will exist simulations that do the following: -# Running simulation and fitting a model - -Below we go over how to setup and run the simulation code that generates data and runs experiments. The code is in `R\runEval.R` and the code below is pulled dynamically from the code so it will always be up to date every time this document is generated. +* SIRTD model, real valued, non-stochastic: This model +* SIRTD model, real valued, stochastic +* SIRTD model, real valued, stochastic, varied beta +* SIRTD model, integer valued, agent based, stochastic +* SIRTD model, integer valued, agent based, varied beta -```{r echo=FALSE, eval=TRUE, message=FALSE, warning=FALSE, results='hide'} -source(here::here('R','runEval.R')) -``` +We may also run models that are time limited, feature richer social interaction models etc... +# Running simulation and fitting a model -```{r echo=TRUE} +Below we go over how to setup and run the simulation code that generates data and runs experiments. The central code is in `R/runEval.R` with use of functions in `R/util.R` for printing/graphing functions, `R/data_configs.R` for code that creates data via simulation or provides actual data, `R/modeling_configs.R` for code that configures parameters for modeling software and `R/SIRTDsim.R` that contains simulation software and is called from functions in `R/data_configs.R`. The libraries and dependencies are: -pull_section <- function(section, file) { - print = FALSE - return_string = '' - for (line in readLines(file)) { - if (line == section) { - print = !print - } - if (print) { - return_string = paste0(return_string, line, '\n') - } - } - return(return_string) -} +```{r} +cat(pull_section('# dependencies', here::here('R','runEval.R'))) +``` -cat(pull_section('# section 1', here::here('R','runEval.R'))) +## Setting up the `run_df` data frame that controls the experiments +```{r} +cat(pull_section('# setup run_df', here::here('R','runEval.R'), num_lines = TRUE)) +``` +The `run_df` is a dataframe where each row will be a complete experiment configuration. The above functions add information relevant to their role. The source is very straightforward should one want to modify for their own purposes. In more detail: +```{r} +cat(pull_section('setup_run_df', here::here('R','runEval.R'), num_lines = TRUE)) ``` -Above the `template_df` is initialized with column names and global parmeters, `n_days` and `n_pop`. The simulation will populate the data columns, actual data may only populate some of the columns, e.g., `template_df$d` and `templte_df$tweets` in the case of the current Brazil data. Printing the data frame: +Running code up to this line shows the following values, note that the dataframe is transposed with rows as columns and columns as rows--unreadable otherwise. + +```{r echo=FALSE} +# dependencies +library(tidyverse) +library(cmdstanr) +library(data.table) +library(kableExtra) + +source(here::here("R","util.R")) +source(here::here("R","SIRTDsim.R")) +source(here::here("R","sim_configs.R")) +source(here::here("R","modeling_configs.R")) +# dependencies +# setup run_df +run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) +kable(t(run_df)) +``` + +This line constructs a basic template for the `run_df`, there is just one row at this point--remember transposed output. All subsequent additions/elaborations draw from this dataframe. ```{r} -print(template_df) +cat(pull_section('sim_brazil_1', here::here('R','runEval.R'), num_lines = TRUE)) ``` +The `sim_brazil_1` is a parameterization that roughly mimics the shape of the COVID-19 outbreak with corresponding simulated values as determined by trying values by hand. Look at the source for more info. Note that the prefix on the function is 'sim', which means that it is part of simulation. There is still one row in the dataframe but with simulation data and the generating paramaterzation. -### Adding simulation parameters +The, remember transposed, dataframe is: -The parameterization for the simulation is next with deep copies being made of the `template_df` for the number of simulation configurations being created with `n_runs`. +```{r echo=FALSE} +run_df <- sim_brazil_1(run_df) +kable(t(run_df), digits=3) +``` + +We now have simulation data from the SIRTD model for `n_days`. But still only one row--remember transposed above. ```{r} -cat(pull_section('# section 2', here::here('R','runEval.R'))) +cat(pull_section('sim_draw_params', here::here('R','runEval.R'), num_lines = TRUE)) ``` -Some parameters have one value across runs, others are a separate draw. The `sim_run_id` will be the same for subsequent copies of that particular draw, this becomes important when multiple modeling options are tried on the same simulated data. +The next line runs two random based draws from the parmeterization in the single simulation in `run_df`. Two draws are conducted and returned. +```{r echo=FALSE} +draws_run_df <- sim_draw_params(2, run_df) +kable(t(draws_run_df)) +``` -## Running the simulation +Above are transposed output for two draws for slight variations of the Brazil hand parameterization. -The simulation is run next with parameters defined above: ```{r} -cat(pull_section('# section 3', here::here('R','runEval.R'))) +cat(pull_section("rbind", here::here('R','runEval.R'), num_lines = TRUE)) ``` -The simulation is run, setting `print` to `TRUE` gives day by day accounting of the simulation. We now have a complete simulation run per row of the `SIRTD_sim_df`. Next we add modeling configurations. +We bind the Brazil run with the draws resulting in 3 rows. -```{r} -print(SIRTD_sim_df) +```{r echo=FALSE} +run_df <- rbind(run_df, draws_run_df) +kable(t(run_df), digits = 3) ``` -## Running non-simulated data +## Setting up modeling + +At this point we have three sets of data, one hand configured to mirror Brazil data and two that vary simulation parameters from the hand configured model. Now we apply modeling configurations with subroutines from `R/modeling_configs.R`. -It is possible to run data in this setup from non-simulated sources. An example will be eventually supplied. +```{r} +cat(pull_section("model_stan_SIRDs", here::here('R','runEval.R'), num_lines = TRUE)) +``` -## Configuring modeling +There are two model configurations which apply to the three rows uniformly so we end up with 6 configurations total. One model applies tweets to the task, and one does not. The final result is: -Now we add the ways to configure modeling parameters. Since the number of simulations run will be copied per model setup there will be model setups times simulation setups rows in in the resulting data frame. +```{r echo=FALSE} +run_df <- model_stan_SIRDs(run_df) +kable(t(run_df), digits = 3) +``` + +## Configuring reporting +The specific reporting can be controlled from the `run_df` as well. ```{r} -cat(pull_section('# section 4', here::here('R','runEval.R'))) +cat(pull_section("reports <-", here::here('R','runEval.R'), num_lines = TRUE)) ``` -Note that the `description` value is appended so it is possible to track variations. In this example the use of tweets to inform the model is varied. The `run_df` is created from the two model setups and then the the `ode_solver` and `compute_liklihood` parameters are set that will span all the model/sim configurations. -Finaly there is the `reports` column that controls what graphs and summaries are printed during the runs. All available options are shown. - +After each `run_df` iteration the listed graphing and print routines are controlled via the above. This list is all available, they are explained below. ## Running models Each row of `run_df` is a complete specification of data and model configuration. The code simply iterates over each row, fits it with the appropriate model/model configuration if there is one specified and adds the results of computing how many simulated days data for tweets and deaths are in the .2 to .8 central interval of the samples for the model run. See `R/util.R` for the implementation of `countPredictionsInQuantile`. ```{r} -cat(pull_section('# section 5', here::here('R','runEval.R'))) +cat(pull_section('# run models', here::here('R','runEval.R'))) ``` - +Only two models are available at present. ## Presenting results At this point results will likely get more idiosyncratic to experiment needs. The below if statements check the listed values for `reports` and apply the corresponding reporting. Note that each run is reported with no attempt to collect them into a collection--we intend to add that functionality eventually. In order there are graphing options to: @@ -183,21 +189,18 @@ All subroutines are in the `R/Util.R` file. ```{r} cat(pull_section('# section 6', here::here('R','runEval.R'))) ``` -Example output is: -```{r echo=FALSE} -print(plot) -``` -A parameter recovery report finalizs the reporting options per run: +A parameter recovery report finalizes the reporting options per run: ```{r} cat(pull_section('# section 7', here::here('R','runEval.R'))) ``` -which summarizes as follows; +which reports on whether the simulation parameters and those from the model. +# All run summary ```{r echo=FALSE} - cat(param_recovery(data_df = run_df[j,], fit = fit)) +# cat(param_recovery(data_df = run_df[j,], fit = fit)) ``` A summary across all the runs is done next by reporting relevant columns of the run_df: @@ -206,12 +209,6 @@ A summary across all the runs is done next by reporting relevant columns of the cat(pull_section('# section 8', here::here('R','runEval.R'))) ``` -Example output is: - -```{r echo=FALSE} -print(run_df[,summary_cols]) -``` - There are many more result views available that we will eventually integrate into this document. They include: diff --git a/runEval.html b/runEval.html index 0d87adc..b19f855 100644 --- a/runEval.html +++ b/runEval.html @@ -13,9 +13,9 @@ - + -SIRTD Data Generating Processes +Simulate/Run/Evaluate Code for SIRTD models @@ -167,361 +167,1627 @@ -

SIRTD Data Generating Processes

+

Simulate/Run/Evaluate Code for SIRTD models

Breck Baldwin

Jose Storopoli

Conor Rosato

-

July 10, 2021

+

July 25, 2021

Introduction

-

This page contains data generating programs for a common class of epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered –SIR models. This page/code adds compartments for (T) terminal and (D) dead counts useful for the grim task of distinguishing the dead from recovered. There is additionally data generated for tweets but that exists in parallel with the core SIRTD model.

+

This page describes the simulation, modeling and evaluation code in R/runEval.R The goal is to provide sufficient information to run and modify the code. The focus for the current data generating process is SIRTD epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered –SIR models. The current simulations add compartments for (T) terminal and (D) dead counts useful for the grim task of distinguishing the dead from recovered. There is additionally data generated for tweets from individuals in the infected compartment but that exists separate from the core SIRTD model.

For more explanation about SIR models see https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html which we found to be a good explanation as well as an excellent introduction to Bayesian approaches using Stan. This is the model used to fit this data, there is a model reproduction check list at https://codatmo.github.io/Simple_SIR/.

Data Generating Process

-

The data generating process simulates a fixed population at the individual level on a daily basis. Larger or smaller uniform units could be used.

- -

The actual code is shown below, the Rmarkdown commands are included to be clear about where code is and how the document is being generated.

-
cat(paste(readLines(here::here("R","SIRTDsim.R")), "\n"))
-
sirtd_vary_beta <- function(seed, 
-                             n_pop, 
-                             n_days, 
-                             print, 
-                             beta_daily_inf_rates, 
-                             gamma_res_per_day_rate, 
-                             death_prob, 
-                             tweet_rate_infected, 
-                             n_patient_zero, 
-                             mean_days_to_death_from_t) { 
-   set.seed(seed) 
-   day_state = c(rep('i',n_patient_zero), rep('s', n_pop - n_patient_zero)) 
-   tweets = rep(0, n_pop) 
-   col_names = c('day', 's', 'i', 'r', 't', 'd', 'tweets') 
-   df = data.frame(matrix(nrow = 0, ncol=length(col_names))) 
-   colnames(df) = col_names 
-   next_day_state = day_state 
-   for (day in 1:n_days) { 
-     df[day,] = rep(NA,length(col_names)) #setup data, will cause errors if I miss something 
-     df[day,]$s=  length(subset(day_state, day_state == 's')) 
-     df[day,]$i = length(subset(day_state, day_state == 'i')) 
-     df[day,]$r = length(subset(day_state, day_state == 'r')) 
-     df[day,]$t = length(subset(day_state, day_state == 't')) 
-     df[day,]$d = length(subset(day_state, day_state == 'd')) 
-     df[day,]$day = day 
-     df[day,]$tweets = sum(tweets) 
-     if (print) { 
-       cat( 
-         sprintf( 
-           "Day=%d, susceptible=%d, infected=%d, recovered=%d, terminal=%d, 
-            dead=%d, tweets=%d, R0=%.2f\n", 
-           df[day,]$day, df[day,]$s, df[day,]$i, df[day,]$r, df[day,]$t, 
-           df[day,]$d, df[day,]$tweets,  
-           beta_daily_inf_rate[day]/gamma_res_per_day_rate)) 
-     } 
-     tweets = rep(0, n_pop) #start fresh every day, certainly wrong. 
-     for (per in 1:n_pop) { 
-       #end infectious period 
-       if (day_state[per] == 'i') { 
-         tweets[per] = rpois(1, tweet_rate_infected) 
-         if (rbinom(n = 1, size = 1, prob = gamma_res_per_day_rate) == 1) { 
-           if (rbinom(n = 1, size = 1, prob = death_prob) == 1) { 
-             next_day_state[per] = 't' 
-           } 
-           else { 
-             next_day_state[per] = 'r' 
-           } 
-         } 
-       } 
-       if (day_state[per] == 't') { 
-          
-         if (rbinom(n = 1, size =1, prob = 1/mean_days_to_death_from_t) == 1) { 
-           next_day_state[per] = 'd' 
-         } 
-       } 
-     } 
-     for (per in 1:n_pop) { 
-       if (day_state[per] == 'i') { 
-         for (other_per in sample(1:n_pop, rpois(n = 1, lambda = beta_daily_inf_rates[day]))) { 
-           if (day_state[other_per] == 's') {  #subtle but will reduce infs if there are not lots of s 
-             next_day_state[other_per] = 'i' 
-           } 
-         } 
-       } 
-     } 
-     day_state = next_day_state 
-   } 
-   return(df) 
- } 
-  
-

The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at to be determined.

-
-
-

Running simulation and fitting a model

-

Below we go over how to setup and run the simulation code that generates data and runs experiments. The code is in R\runEval.R and the code below is pulled dynamically from the code so it will always be up to date every time this document is generated.

-

-
pull_section <- function(section, file) {
-  print = FALSE
+

The data generating process simulates a fixed population at the individual level on a daily basis. There are a series of simulating DGPs with varied properties. This document covers a DGP that closely mirrors the compartment model structure with real valued compartments with population transfers between states computed via real valued multiplication on each day of the simulation. There is no stochastic element at all in this simulation since it serves as a baseline for more complex models which will have stochastic (random) elements as well as operating on different scales, e.g., agent-based models.

+

The actual code is shown below, the Rmarkdown commands are included to be clear about where code is and how the document is being generated. Note that there is a pull_section function that prints sections of code based on being between comments that will be used throughout. This allows software to exist outside the notebook format without introducing copy/paste errors that often come up or other mismatches between running code and it’s description.

+
#' Returns lines of file between the identified section '# <section name>' and 
+#' the matching section later in the file or the end of the file.
+#' @param sections Comment that starts/stops collection of lines, e.g., '# section 1'
+#' @param file Path to file to pull from
+pull_section <- function(section, file, num_lines = FALSE) {
+  collect = FALSE
   return_string = ''
+  line_num = 0
   for (line in readLines(file)) {
+    line_num = line_num + 1
     if (line == section) {
-      print = !print
+      collect = !collect
     }
-    if (print) {
-      return_string = paste0(return_string, line, '\n')
+    prefix = ""
+    if (num_lines) {
+        prefix = paste0(line_num,": ")
+    }
+    if (grepl(section,line)) {
+      return_string = paste0(return_string, paste0(prefix,line), '\n')
+    }
+    else if (collect) {
+      return_string = paste0(return_string, paste0(prefix,line), '\n')
     }
   }
   return(return_string)
 }
 
-cat(pull_section('# section 1', here::here('R','runEval.R')))
-
# section 1
-#Globals
-seed = 93435
-n_pop <- 20000
-n_days <- 70
-
-# Set up our template, all runs need this info
-# Each row of dataframe is a separate run
-template_df <- data.frame(sim_run_id = c(NA))
-template_df$description <- NA
-template_df$seed <- seed
-
-# any simulation/model run
-template_df$n_pop <- n_pop
-template_df$n_days <- n_days
-
-# any model setup
-template_df$model_to_run <- 'none'
-template_df$compute_likelihood <- NA
-
-#setup data columns 
-template_df$s <- list(c())
-template_df$i <- list(c())
-template_df$r <- list(c())
-template_df$t <- list(c())
-template_df$d <- list(c())
-template_df$tweets <- list(c())
-
-# setup prediction columns
-template_df$d_in_interval <- NA_integer_
-template_df$tweets_in_interval <- NA_integer_
-

Above the template_df is initialized with column names and global parmeters, n_days and n_pop. The simulation will populate the data columns, actual data may only populate some of the columns, e.g., template_df$d and templte_df$tweets in the case of the current Brazil data. Printing the data frame:

-
print(template_df)
-
  sim_run_id description  seed n_pop n_days model_to_run compute_likelihood
-1         NA          NA 93435 20000     70         none                 NA
-     s    i    r    t    d tweets d_in_interval tweets_in_interval
-1 NULL NULL NULL NULL NULL   NULL            NA                 NA
-
-

Adding simulation parameters

-

The parameterization for the simulation is next with deep copies being made of the template_df for the number of simulation configurations being created with n_runs.

-
cat(pull_section('# section 2', here::here('R','runEval.R')))
-
# section 2
-# setup for current simulations
-
-set.seed(template_df$seed)
-n_runs <- 2
-SIRTD_sim_df = copy(template_df)
-if (n_runs > 1) {
-  for (j in 2:n_runs) {
-    SIRTD_sim_df <- rbind(SIRTD_sim_df,copy(template_df))
+cat(pull_section('# sirtd_vary_beta_exact', here::here('R','SIRTDsim.R')))
+
# sirtd_vary_beta_exact
+#' Runs a simple deterministic SIRTD model with possible daily varied beta values. 
+#' @param seed Seed for stochastic processes, ignored since this is a deterministic function
+#' @param n_pop Population size
+#' @param n_days Number of days to run simulation
+#' @param print Boolean to control whether daily summary is printed
+#' @param beta_daily_inf_rates Controls number of s that are infected by i to 
+#' become i. Should be between 0 and 1
+#' @param gamma_res_per_day_rate Controls number of i that transition to r or t
+#' per day. Should be between 0 and 1
+#' @param death_prob Probability transitioning from i to t
+#' @param tweet_rate_infected If infected, what rate of tweets per day. Can be 
+#' > 1
+#' @param n_patient_zero Number of patients infected at day 0
+#' @param mean_days_to_death_from_t Mean number of days in t before d
+#' @return dataframe simulating sirtd+tweets state for number days 
+sirtd_vary_beta_exact <- function(seed,
+                            n_pop,
+                            n_days,
+                            print,
+                            beta_daily_inf_rates,
+                            gamma_res_per_day_rate,
+                            death_prob,
+                            tweet_rate_infected,
+                            n_patient_zero,
+                            mean_days_to_death_from_t) {
+  old_seed <- .Random.seed
+  on.exit({.Random.seed <<- old_seed})
+  set.seed(seed)
+  i <- n_patient_zero
+  r <- 0
+  t <- 0
+  d <- 0
+  s <- n_pop - n_patient_zero 
+  col_names = c('day', 's', 'i', 'r', 't', 'd', 'tweets')
+  df = data.frame(matrix(nrow = 0, ncol=length(col_names)))
+  colnames(df) = col_names
+  tweets = 0
+  for (day in 1:n_days) {
+    df[day,] = rep(NA,length(col_names)) 
+    df[day,]$s=  round(s)
+    df[day,]$i = round(i)
+    df[day,]$r = round(r)
+    df[day,]$t = round(t)
+    df[day,]$d = round(d)
+    df[day,]$day = day
+    df[day,]$tweets = round(tweets)
+    if (print) {
+      cat(
+        sprintf(
+          "\nDay=%d, susceptible=%.2f, infected=%.2f, recovered=%.2f, terminal=%.2f,
+           dead=%.2f, tweets=%.2f, R0=%.2f %.2f",
+          df[day,]$day, df[day,]$s, df[day,]$i, df[day,]$r, df[day,]$t,
+          df[day,]$d, df[day,]$tweets,
+          beta_daily_inf_rates[day]/gamma_res_per_day_rate,
+          s + i + r + t + d))
+    }
+    resolved <- gamma_res_per_day_rate * i  
+    terminal <- death_prob * resolved
+    recovered <- resolved - terminal
+    dead <- 1/mean_days_to_death_from_t * t
+    tweets = tweet_rate_infected * i
+    infected <- min(i * beta_daily_inf_rates[day], s)
+    s <- s - infected
+    i <- i - resolved + infected
+    r <- r + recovered
+    t <- t - dead + terminal
+    d <- d + dead
   }
+  return(df)
 }
-
-SIRTD_sim_df$beta_mean <- runif(n_runs, 0.2, 0.3)
-SIRTD_sim_df$beta_daily_rate <- list(rep(SIRTD_sim_df$beta_mean, 
-                                    SIRTD_sim_df$n_days))
-SIRTD_sim_df$gamma <- runif(n_runs, 0.1, 0.2)
-SIRTD_sim_df$death_prob <- runif(n_runs, 0.01, 0.03)
-SIRTD_sim_df$tweet_rate <- runif(n_runs, 0.5, 1.5)
-SIRTD_sim_df$days2death <- 20
-SIRTD_sim_df$n_patient_zero <- 20
-SIRTD_sim_df$description <- sprintf("SIRTD sim: %d runs", n_runs)
-SIRTD_sim_df$sim_run_id <- 1:n_runs
-

Some parameters have one value across runs, others are a separate draw. The sim_run_id will be the same for subsequent copies of that particular draw, this becomes important when multiple modeling options are tried on the same simulated data.

+# sirtd_vary_beta_exact
+

The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at to be determined. There exists or will exist simulations that do the following:

+
    +
  • SIRTD model, real valued, non-stochastic: This model
  • +
  • SIRTD model, real valued, stochastic
  • +
  • SIRTD model, real valued, stochastic, varied beta
  • +
  • SIRTD model, integer valued, agent based, stochastic
  • +
  • SIRTD model, integer valued, agent based, varied beta
  • +
+

We may also run models that are time limited, feature richer social interaction models etc…

-
-

Running the simulation

-

The simulation is run next with parameters defined above:

-
cat(pull_section('# section 3', here::here('R','runEval.R')))
-
# section 3
-for (j in 1:n_runs) {
-  sim_df <- sirtd_vary_beta(seed = SIRTD_sim_df[j,]$seed,
-                         n_pop = SIRTD_sim_df[j,]$n_pop, 
-                         n_days = SIRTD_sim_df[j,]$n_days,
-                         print = FALSE,
-                         beta_daily_inf_rates = 
-                                       unlist(SIRTD_sim_df[j,]$beta_daily_rate),
-                         gamma_res_per_day_rate = SIRTD_sim_df[j,]$gamma,
-                         tweet_rate_infected = SIRTD_sim_df[j,]$tweet_rate,
-                         mean_days_to_death_from_t = SIRTD_sim_df[j,]$days2death,
-                         n_patient_zero = SIRTD_sim_df[j,]$n_patient_zero,
-                         death_prob = SIRTD_sim_df[j,]$death_prob)
-  SIRTD_sim_df[j,]$s <- list(sim_df$s)
-  SIRTD_sim_df[j,]$i <- list(sim_df$i)
-  SIRTD_sim_df[j,]$r <- list(sim_df$r)
-  SIRTD_sim_df[j,]$t <- list(sim_df$t)
-  SIRTD_sim_df[j,]$d <- list(sim_df$d)
-  SIRTD_sim_df[j,]$tweets <- list(sim_df$tweets)
-}
-

The simulation is run, setting print to TRUE gives day by day accounting of the simulation. We now have a complete simulation run per row of the SIRTD_sim_df. Next we add modeling configurations.

-
print(SIRTD_sim_df)
-
  sim_run_id       description  seed n_pop n_days model_to_run
-1          1 SIRTD sim: 2 runs 93435 20000     70         none
-2          2 SIRTD sim: 2 runs 93435 20000     70         none
-  compute_likelihood
-1                 NA
-2                 NA
-                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       s
-1 19980, 19979, 19975, 19973, 19966, 19960, 19956, 19948, 19943, 19938, 19927, 19916, 19907, 19894, 19885, 19869, 19852, 19839, 19823, 19805, 19783, 19758, 19732, 19706, 19678, 19651, 19614, 19572, 19523, 19477, 19416, 19360, 19297, 19209, 19110, 19014, 18928, 18822, 18707, 18581, 18441, 18276, 18101, 17888, 17694, 17483, 17246, 17006, 16738, 16442, 16148, 15857, 15504, 15136, 14777, 14420, 14085, 13734, 13394, 13027, 12612, 12250, 11891, 11559, 11202, 10874, 10539, 10201, 9905, 9638
-2                    19980, 19979, 19975, 19973, 19966, 19960, 19956, 19945, 19941, 19936, 19925, 19914, 19895, 19881, 19866, 19845, 19820, 19795, 19749, 19707, 19649, 19583, 19514, 19438, 19333, 19224, 19084, 18935, 18765, 18587, 18377, 18127, 17872, 17587, 17251, 16921, 16533, 16138, 15702, 15227, 14753, 14197, 13638, 13053, 12433, 11803, 11235, 10638, 10048, 9505, 8923, 8465, 7973, 7514, 7092, 6728, 6363, 6031, 5748, 5481, 5203, 4989, 4790, 4611, 4455, 4317, 4186, 4077, 3977, 3877
-                                                                                                                                                                                                                                                                                                                                                                                    i
-1                     20, 18, 22, 20, 21, 23, 22, 26, 29, 33, 39, 45, 44, 44, 51, 59, 66, 67, 67, 75, 88, 104, 110, 110, 119, 132, 138, 160, 180, 202, 225, 238, 266, 315, 350, 385, 402, 452, 488, 530, 580, 635, 709, 819, 876, 950, 1022, 1095, 1196, 1288, 1361, 1427, 1533, 1645, 1743, 1829, 1838, 1867, 1893, 1942, 2017, 2049, 2097, 2068, 2059, 2062, 2036, 2055, 2010, 1978
-2 20, 18, 22, 20, 21, 25, 24, 32, 34, 37, 43, 50, 59, 68, 79, 87, 102, 117, 151, 169, 205, 247, 285, 324, 378, 446, 528, 604, 701, 793, 891, 1023, 1151, 1274, 1445, 1578, 1787, 1946, 2114, 2305, 2506, 2753, 2983, 3204, 3391, 3586, 3674, 3802, 3916, 3975, 4052, 3998, 3936, 3866, 3727, 3630, 3529, 3442, 3310, 3164, 3014, 2856, 2712, 2568, 2385, 2202, 2051, 1881, 1746, 1621
-                                                                                                                                                                                                                                                                                                                                                                                        r
-1                     0, 3, 3, 7, 13, 17, 22, 26, 28, 29, 34, 39, 49, 62, 64, 72, 82, 94, 110, 120, 129, 137, 157, 183, 201, 215, 246, 265, 294, 318, 356, 398, 432, 469, 531, 590, 658, 713, 792, 873, 960, 1069, 1169, 1271, 1406, 1541, 1703, 1865, 2029, 2225, 2442, 2664, 2905, 3158, 3415, 3684, 4004, 4321, 4628, 4940, 5275, 5598, 5903, 6257, 6619, 6934, 7294, 7607, 7942, 8234
-2 0, 3, 3, 7, 13, 15, 20, 22, 24, 26, 31, 35, 45, 50, 54, 67, 77, 87, 99, 123, 145, 168, 198, 235, 283, 324, 382, 450, 521, 606, 716, 832, 958, 1113, 1277, 1469, 1643, 1872, 2135, 2408, 2672, 2978, 3295, 3643, 4062, 4486, 4955, 5412, 5880, 6347, 6843, 7347, 7887, 8396, 8933, 9382, 9843, 10249, 10650, 11052, 11469, 11831, 12163, 12477, 12806, 13117, 13389, 13654, 13883, 14102
-                                                                                                                                                                                                                                                                                   t
-1                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 3, 5, 7, 8, 8, 9, 9, 11, 13, 14, 14, 13, 15, 17, 18, 22, 24, 32, 36, 34, 38, 35, 39, 40, 45, 48, 53, 57, 59, 65, 66, 70, 73, 80, 78, 82, 85, 86
-2 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 5, 4, 4, 9, 11, 12, 14, 15, 14, 21, 20, 24, 28, 34, 37, 46, 55, 56, 66, 80, 88, 97, 104, 109, 113, 126, 127, 126, 134, 145, 163, 170, 169, 174, 181, 187, 190, 187, 189, 189, 192, 194, 193, 200, 198, 197
-                                                                                                                                                                                                                                                               d
-1                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, 9, 9, 9, 11, 12, 13, 13, 13, 18, 20, 26, 26, 27, 28, 30, 32, 34, 37, 38, 43, 46, 47, 50, 53, 55, 58, 64
-2 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 5, 5, 7, 8, 9, 10, 12, 14, 14, 16, 18, 20, 26, 28, 32, 39, 43, 47, 55, 64, 70, 79, 85, 90, 96, 104, 111, 116, 124, 137, 146, 155, 162, 170, 181, 188, 196, 203
-                                                                                                                                                                                                                                                                                                                                                                        tweets
-1            0, 33, 16, 39, 32, 32, 25, 34, 37, 35, 38, 58, 49, 74, 75, 63, 80, 97, 80, 86, 88, 139, 152, 142, 166, 165, 185, 179, 246, 272, 302, 328, 311, 379, 407, 489, 534, 555, 606, 631, 715, 809, 862, 1000, 1164, 1280, 1263, 1451, 1528, 1619, 1742, 1942, 2084, 2115, 2291, 2404, 2590, 2586, 2508, 2613, 2701, 2768, 2798, 2789, 2769, 2877, 2924, 2844, 2784, 2845
-2 0, 17, 10, 23, 18, 20, 15, 26, 26, 26, 38, 35, 43, 61, 48, 70, 89, 77, 90, 124, 150, 168, 197, 247, 276, 283, 368, 448, 475, 575, 641, 736, 851, 998, 1092, 1245, 1318, 1465, 1619, 1800, 1910, 2017, 2343, 2334, 2630, 2808, 2901, 3115, 3077, 3281, 3169, 3374, 3303, 3323, 3235, 3000, 2970, 2913, 2750, 2704, 2578, 2486, 2388, 2188, 2103, 1924, 1828, 1717, 1580, 1561
-  d_in_interval tweets_in_interval beta_mean
-1            NA                 NA 0.2897216
-2            NA                 NA 0.2588033
-                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     beta_daily_rate
-1 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033
-2 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2897216, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033, 0.2588033
-      gamma death_prob tweet_rate days2death n_patient_zero
-1 0.1668548 0.01701741  1.3750113         20             20
-2 0.1284935 0.02940307  0.8216918         20             20
+
+

Running simulation and fitting a model

+

Below we go over how to setup and run the simulation code that generates data and runs experiments. The central code is in R/runEval.R with use of functions in R/util.R for printing/graphing functions, R/data_configs.R for code that creates data via simulation or provides actual data, R/modeling_configs.R for code that configures parameters for modeling software and R/SIRTDsim.R that contains simulation software and is called from functions in R/data_configs.R. The libraries and dependencies are:

+
cat(pull_section('# dependencies', here::here('R','runEval.R')))
+
# dependencies
+library(tidyverse)
+library(cmdstanr)
+library(data.table)
+library(kableExtra)
+
+source(here::here("R","util.R"))
+source(here::here("R","SIRTDsim.R"))
+source(here::here("R","sim_configs.R"))
+source(here::here("R","modeling_configs.R"))
+# dependencies
+
+

Setting up the run_df data frame that controls the experiments

+
cat(pull_section('# setup run_df', here::here('R','runEval.R'), num_lines = TRUE))
+
12: # setup run_df
+13: run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) # in R/util.R
+14: run_df <- sim_brazil_1(run_df) # in R/sim_configs.R
+15: draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R
+16: run_df <- rbind(run_df, draws_run_df) 
+17: 
+18: run_df <- model_stan_SIRDs(run_df) #in R/modeling_configs.R
+19: run_df$ode_solver <- 'block' # set ODE solver in Stan program across all runs
+20: run_df$compute_likelihood <- 1 # compute likelihood across all runs
+21: run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery'))
+22: # setup run_df
+

The run_df is a dataframe where each row will be a complete experiment configuration. The above functions add information relevant to their role. The source is very straightforward should one want to modify for their own purposes. In more detail:

+
cat(pull_section('setup_run_df', here::here('R','runEval.R'), num_lines = TRUE))
+
13: run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) # in R/util.R
+

Running code up to this line shows the following values, note that the dataframe is transposed with rows as columns and columns as rows–unreadable otherwise.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+sim_run_id + +NA +
+description + +NA +
+seed + +93435 +
+n_pop + +1000 +
+n_days + +7 +
+model_to_run + +none +
+compute_likelihood + +NA +
+s + +NULL +
+i + +NULL +
+r + +NULL +
+t + +NULL +
+d + +NULL +
+tweets + +NULL +
+d_in_interval + +NA +
+tweets_in_interval + +NA +
+

This line constructs a basic template for the run_df, there is just one row at this point–remember transposed output. All subsequent additions/elaborations draw from this dataframe.

+
cat(pull_section('sim_brazil_1', here::here('R','runEval.R'), num_lines = TRUE))
+
14: run_df <- sim_brazil_1(run_df) # in R/sim_configs.R
+

The sim_brazil_1 is a parameterization that roughly mimics the shape of the COVID-19 outbreak with corresponding simulated values as determined by trying values by hand. Look at the source for more info. Note that the prefix on the function is ‘sim’, which means that it is part of simulation. There is still one row in the dataframe but with simulation data and the generating paramaterzation.

+

The, remember transposed, dataframe is:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+sim_run_id + +1 +
+description + +Brazil 1 approximation +
+seed + +93435 +
+n_pop + +1000 +
+n_days + +7 +
+model_to_run + +none +
+compute_likelihood + +NA +
+s + +c(990, 987, 984, 981, 977, 974, 970) +
+i + +c(10, 10, 11, 11, 12, 12, 13) +
+r + +c(0, 3, 5, 8, 11, 14, 17) +
+t + +c(0, 0, 0, 0, 0, 0, 0) +
+d + +c(0, 0, 0, 0, 0, 0, 0) +
+tweets + +c(0, 2, 3, 3, 3, 3, 3) +
+d_in_interval + +NA +
+tweets_in_interval + +NA +
+beta_mean + +0.3 +
+beta_daily_rate + +c(0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3) +
+gamma + +0.255 +
+death_prob + +0.01 +
+tweet_rate + +0.25 +
+days2death + +10 +
+n_patient_zero + +10 +
+

We now have simulation data from the SIRTD model for n_days. But still only one row–remember transposed above.

+
cat(pull_section('sim_draw_params', here::here('R','runEval.R'), num_lines = TRUE))
+
15: draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R
+

The next line runs two random based draws from the parmeterization in the single simulation in run_df. Two draws are conducted and returned.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+sim_run_id + +2 + +3 +
+description + +Brazil 1 approximation + +Brazil 1 approximation +
+seed + +93435 + +93435 +
+n_pop + +1000 + +1000 +
+n_days + +7 + +7 +
+model_to_run + +none + +none +
+compute_likelihood + +NA + +NA +
+s + +c(990, 985, 980, 974, 966, 958, 948) + +c(990, 988, 986, 984, 983, 982, 981) +
+i + +c(10, 12, 14, 16, 19, 22, 26) + +c(10, 8, 6, 5, 4, 3, 3) +
+r + +c(0, 3, 6, 10, 15, 20, 26) + +c(0, 4, 8, 11, 13, 15, 16) +
+t + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+d + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+tweets + +c(0, 3, 4, 5, 5, 6, 7) + +c(0, 4, 3, 3, 2, 2, 1) +
+d_in_interval + +NA + +NA +
+tweets_in_interval + +NA + +NA +
+beta_mean + +0.459 + +0.24 +
+beta_daily_rate + +c(0.459, 0.459, 0.459, 0.459, 0.459, 0.459, 0.459) + +c(0.24, 0.24, 0.24, 0.24, 0.24, 0.24, 0.24) +
+gamma + +0.29 + +0.443 +
+death_prob + +0.009 + +0.01 +
+tweet_rate + +0.334 + +0.438 +
+days2death + +10 + +10 +
+n_patient_zero + +10 + +10 +
+

Above are transposed output for two draws for slight variations of the Brazil hand parameterization.

+
cat(pull_section("rbind", here::here('R','runEval.R'), num_lines = TRUE))
+
16: run_df <- rbind(run_df, draws_run_df) 
+

We bind the Brazil run with the draws resulting in 3 rows.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+sim_run_id + +1 + +2 + +3 +
+description + +Brazil 1 approximation + +Brazil 1 approximation + +Brazil 1 approximation +
+seed + +93435 + +93435 + +93435 +
+n_pop + +1000 + +1000 + +1000 +
+n_days + +7 + +7 + +7 +
+model_to_run + +none + +none + +none +
+compute_likelihood + +NA + +NA + +NA +
+s + +c(990, 987, 984, 981, 977, 974, 970) + +c(990, 985, 980, 974, 966, 958, 948) + +c(990, 988, 986, 984, 983, 982, 981) +
+i + +c(10, 10, 11, 11, 12, 12, 13) + +c(10, 12, 14, 16, 19, 22, 26) + +c(10, 8, 6, 5, 4, 3, 3) +
+r + +c(0, 3, 5, 8, 11, 14, 17) + +c(0, 3, 6, 10, 15, 20, 26) + +c(0, 4, 8, 11, 13, 15, 16) +
+t + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+d + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+tweets + +c(0, 2, 3, 3, 3, 3, 3) + +c(0, 3, 4, 5, 5, 6, 7) + +c(0, 4, 3, 3, 2, 2, 1) +
+d_in_interval + +NA + +NA + +NA +
+tweets_in_interval + +NA + +NA + +NA +
+beta_mean + +0.3 + +0.459 + +0.24 +
+beta_daily_rate + +c(0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3) + +c(0.459, 0.459, 0.459, 0.459, 0.459, 0.459, 0.459) + +c(0.24, 0.24, 0.24, 0.24, 0.24, 0.24, 0.24) +
+gamma + +0.255 + +0.29 + +0.443 +
+death_prob + +0.01 + +0.009 + +0.01 +
+tweet_rate + +0.25 + +0.334 + +0.438 +
+days2death + +10 + +10 + +10 +
+n_patient_zero + +10 + +10 + +10 +
-
-

Running non-simulated data

-

It is possible to run data in this setup from non-simulated sources. An example will be eventually supplied.

+
+

Setting up modeling

+

At this point we have three sets of data, one hand configured to mirror Brazil data and two that vary simulation parameters from the hand configured model. Now we apply modeling configurations with subroutines from R/modeling_configs.R.

+
cat(pull_section("model_stan_SIRDs", here::here('R','runEval.R'), num_lines = TRUE))
+
18: run_df <- model_stan_SIRDs(run_df) #in R/modeling_configs.R
+

There are two model configurations which apply to the three rows uniformly so we end up with 6 configurations total. One model applies tweets to the task, and one does not. The final result is:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+sim_run_id + +1 + +2 + +3 + +1 + +2 + +3 +
+description + +Brazil 1 approximationno tweets + +Brazil 1 approximationno tweets + +Brazil 1 approximationno tweets + +Brazil 1 approximationuse tweets + +Brazil 1 approximationuse tweets + +Brazil 1 approximationuse tweets +
+seed + +93435 + +93435 + +93435 + +93435 + +93435 + +93435 +
+n_pop + +1000 + +1000 + +1000 + +1000 + +1000 + +1000 +
+n_days + +7 + +7 + +7 + +7 + +7 + +7 +
+model_to_run + +twitter_sird + +twitter_sird + +twitter_sird + +twitter_sird + +twitter_sird + +twitter_sird +
+compute_likelihood + +NA + +NA + +NA + +NA + +NA + +NA +
+s + +c(990, 987, 984, 981, 977, 974, 970) + +c(990, 985, 980, 974, 966, 958, 948) + +c(990, 988, 986, 984, 983, 982, 981) + +c(990, 987, 984, 981, 977, 974, 970) + +c(990, 985, 980, 974, 966, 958, 948) + +c(990, 988, 986, 984, 983, 982, 981) +
+i + +c(10, 10, 11, 11, 12, 12, 13) + +c(10, 12, 14, 16, 19, 22, 26) + +c(10, 8, 6, 5, 4, 3, 3) + +c(10, 10, 11, 11, 12, 12, 13) + +c(10, 12, 14, 16, 19, 22, 26) + +c(10, 8, 6, 5, 4, 3, 3) +
+r + +c(0, 3, 5, 8, 11, 14, 17) + +c(0, 3, 6, 10, 15, 20, 26) + +c(0, 4, 8, 11, 13, 15, 16) + +c(0, 3, 5, 8, 11, 14, 17) + +c(0, 3, 6, 10, 15, 20, 26) + +c(0, 4, 8, 11, 13, 15, 16) +
+t + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+d + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) + +c(0, 0, 0, 0, 0, 0, 0) +
+tweets + +c(0, 2, 3, 3, 3, 3, 3) + +c(0, 3, 4, 5, 5, 6, 7) + +c(0, 4, 3, 3, 2, 2, 1) + +c(0, 2, 3, 3, 3, 3, 3) + +c(0, 3, 4, 5, 5, 6, 7) + +c(0, 4, 3, 3, 2, 2, 1) +
+d_in_interval + +NA + +NA + +NA + +NA + +NA + +NA +
+tweets_in_interval + +NA + +NA + +NA + +NA + +NA + +NA +
+beta_mean + +0.3 + +0.459 + +0.24 + +0.3 + +0.459 + +0.24 +
+beta_daily_rate + +c(0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3) + +c(0.459, 0.459, 0.459, 0.459, 0.459, 0.459, 0.459) + +c(0.24, 0.24, 0.24, 0.24, 0.24, 0.24, 0.24) + +c(0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3) + +c(0.459, 0.459, 0.459, 0.459, 0.459, 0.459, 0.459) + +c(0.24, 0.24, 0.24, 0.24, 0.24, 0.24, 0.24) +
+gamma + +0.255 + +0.29 + +0.443 + +0.255 + +0.29 + +0.443 +
+death_prob + +0.01 + +0.009 + +0.01 + +0.01 + +0.009 + +0.01 +
+tweet_rate + +0.25 + +0.334 + +0.438 + +0.25 + +0.334 + +0.438 +
+days2death + +10 + +10 + +10 + +10 + +10 + +10 +
+n_patient_zero + +10 + +10 + +10 + +10 + +10 + +10 +
+apply_twitter_data + +0 + +0 + +0 + +1 + +1 + +1 +
-
-

Configuring modeling

-

Now we add the ways to configure modeling parameters. Since the number of simulations run will be copied per model setup there will be model setups times simulation setups rows in in the resulting data frame.

-
cat(pull_section('# section 4', here::here('R','runEval.R')))
-
# section 4
-# setup for current model
-model_no_tweets_df <- copy(SIRTD_sim_df)
-model_no_tweets_df$apply_twitter_data <- 0
-model_no_tweets_df$model_to_run <- 'twitter_sird'
-model_no_tweets_df$description <- paste0(model_no_tweets_df$description,
-                                         " no tweets")
-
-model_tweets_df <- copy(SIRTD_sim_df)
-model_tweets_df$apply_twitter_data <- 1
-model_tweets_df$model_to_run <- 'twitter_sird'
-model_tweets_df$description <- paste0(model_tweets_df$description,
-                                      " tweets")
-run_df <- rbind(model_no_tweets_df, model_tweets_df)
-run_df$ode_solver <- 'block'
-run_df$compute_likelihood <- 1
-run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d',
-                         'plot','param_recovery'))
-

Note that the description value is appended so it is possible to track variations. In this example the use of tweets to inform the model is varied. The run_df is created from the two model setups and then the the ode_solver and compute_liklihood parameters are set that will span all the model/sim configurations. Finaly there is the reports column that controls what graphs and summaries are printed during the runs. All available options are shown.

+
+

Configuring reporting

+

The specific reporting can be controlled from the run_df as well.

+
cat(pull_section("reports <-", here::here('R','runEval.R'), num_lines = TRUE))
+
21: run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery'))
+

After each run_df iteration the listed graphing and print routines are controlled via the above. This list is all available, they are explained below.

Running models

Each row of run_df is a complete specification of data and model configuration. The code simply iterates over each row, fits it with the appropriate model/model configuration if there is one specified and adds the results of computing how many simulated days data for tweets and deaths are in the .2 to .8 central interval of the samples for the model run. See R/util.R for the implementation of countPredictionsInQuantile.

-
cat(pull_section('# section 5', here::here('R','runEval.R')))
-
# section 5
+
cat(pull_section('# run models', here::here('R','runEval.R')))
+
# run models
 for (j in 1:nrow(run_df)) {
+  fit <- NA
   if (run_df[j,]$model_to_run == 'twitter_sird') {
-      stan_data <- 
-          list(n_days = run_df[j,]$n_days,
-               sDay1 = run_df[j,]$n_pop - 1,
-               iDay1 = 1,
-               rDay1 = 0,
-               dDay1 = 0,            
-               NPop = run_df[j,]$n_pop,
-               tweets = unlist(run_df[j,]$tweets),
-               deaths = unlist(run_df[j,]$d),
-               compute_likelihood = run_df[j,]$compute_likelihood,
-               run_twitter = run_df[j,]$apply_twitter_data,
-               run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0),
-               run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0))
-        model <- cmdstan_model(here::here("stan", "tweet_sird.stan"))
-
-        fit <- model$sample(data=stan_data,
-                     parallel_chains = 4,
-                     iter_warmup = 1000,
-                     iter_sampling = 1000,
-                     chains = 4,
-                     seed = 4857)
-          run_df[j,]$fit = list(fit)
-        }
-      else if (run_df[j,]$model_to_run == 'tweet_sird_negbin_optimized') {
-        stan_data_2 <- list(n_days = run_df[j,]$n_days,
+    stan_data <- 
+      list(n_days = run_df[j,]$n_days,
+           sDay1 = run_df[j,]$n_pop - 1,
+           iDay1 = 1,
+           rDay1 = 0,
+           dDay1 = 0,            
+           NPop = run_df[j,]$n_pop,
+           tweets = unlist(run_df[j,]$tweets),
+           deaths = unlist(run_df[j,]$d),
+           compute_likelihood = run_df[j,]$compute_likelihood,
+           run_twitter = run_df[j,]$apply_twitter_data,
+           run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0),
+           run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0))
+    model <- cmdstan_model(here::here("stan", "tweet_sird.stan"))
+    
+    fit <- model$sample(data=stan_data,
+                        parallel_chains = 4,
+                        iter_warmup = 1000,
+                        iter_sampling = 1000,
+                        chains = 4,
+                        seed = 4857)
+    run_df[j,]$fit = list(fit)
+  }
+  else if (run_df[j,]$model_to_run == 'tweet_sird_negbin_optimized') {
+    stan_data_2 <- list(n_days = run_df[j,]$n_days,
                         y0 = c(run_df[j,]$n_pop - run_df[j,]$n_patient_zero, 
                                run_df[j,]$n_patient_zero, 0, 0, 0), # one more zero here
                         t0 = 0,
@@ -531,27 +1797,23 @@ 

Running models

death_count = run_df[j,]$d, symptomaticTweets = run_df[j,]$tweets, trapezoidal_solver = 0) - model2 <- cmdstan_model(here::here("stan", "tweet_sird_negbin_optimized.stan")) - fit <- model2$sample(data=stan_data_2, - parallel_chains = 4, - iter_warmup = 1000, - iter_sampling = 1000, - chains = 4, - seed = 4857) - } - if (!is.na(run_df[j,]$model_to_run)) { - d_tweets_in_interval = countPredictionsInQuantile(fit = fit, - run_df = run_df, - j = j, print = TRUE) - run_df[j,]$d_in_interval = d_tweets_in_interval[1] - run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] - } - else { - print(sprintf("no model selected, got:'%s'",run_df[j,]$model_to_run)); - }
-
-
-

Presenting results

+ model2 <- cmdstan_model(here::here("stan", "tweet_sird_negbin_optimized.stan")) + fit <- model2$sample(data=stan_data_2, + parallel_chains = 4, + iter_warmup = 1000, + iter_sampling = 1000, + chains = 4, + seed = 4857) + } + if (run_df[j,]$model_to_run != 'none') { + d_tweets_in_interval = countPredictionsInQuantile(fit = fit, + run_df = run_df, + j = j, print = TRUE) + run_df[j,]$d_in_interval = d_tweets_in_interval[1] + run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] + } +# run models
+

Only two models are available at present. ## Presenting results

At this point results will likely get more idiosyncratic to experiment needs. The below if statements check the listed values for reports and apply the corresponding reporting. Note that each run is reported with no attempt to collect them into a collection–we intend to add that functionality eventually. In order there are graphing options to:

  • Graph simulation compartments with dots.
  • @@ -564,10 +1826,10 @@

    Presenting results

    # section 6
       plot <- ggplot(data = NULL, aes(x = day, y = count))
       if ('graph_sim' %in% unlist(run_df[j,]$reports)) {
    -    plot <- plot +  graph_sim_data(data_df = run_df[j,])
    +    plot <- plot +  graph_sim_data(data_df = run_df[j,], hide_s = TRUE)
       }
       if ('graph_ODE' %in% unlist(run_df[j,]$reports)) {
    -    plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit)
    +    plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit, hide_s = TRUE)
       }
       if ('graph_tweets' %in% unlist(run_df[j,]$reports)) {
         plot <- plot_predictions(plot = plot, prediction_label = 'pred_tweets', 
    @@ -581,45 +1843,33 @@ 

    Presenting results

    } if ('plot' %in% unlist(run_df[j,]$reports)) { print(plot) - }
    -

    Example output is:

    -

    -

    A parameter recovery report finalizs the reporting options per run:

    + } +# section 6 +

    A parameter recovery report finalizes the reporting options per run:

     cat(pull_section('# section 7', here::here('R','runEval.R')))
    # section 7
       if ('param_recovery' %in% run_df[j,]$reports) {
         cat(param_recovery(data_df = run_df[j,], fit = fit))
    -  }
    -

    which summarizes as follows;

    -
    
    -Beta sim = 0.2588 vs recovered 0.8106, sd=0.4019 
    -Gamma sim = 0.1285 vs recovered 0.6981, sd=0.3763 
    -Deaths sim = 0.0294 vs recovered 0.0017, sd=0.0046 
    -Lambda Twitter sim = 0.8217 vs recovered 0.0214, sd=0.0089 
    + } +# section 7 +

    which reports on whether the simulation parameters and those from the model.

    +
+
+
+

All run summary

A summary across all the runs is done next by reporting relevant columns of the run_df:

 cat(pull_section('# section 8', here::here('R','runEval.R')))
# section 8
 summary_cols = c('sim_run_id', 'model_to_run', 'beta_mean', 'gamma', 'death_prob',
-                   'tweet_rate', 'days2death', 'ode_solver', 'description',
-                   'd_in_interval', 'tweets_in_interval')
-print(run_df[,summary_cols])
-

Example output is:

-
  sim_run_id model_to_run beta_mean     gamma death_prob tweet_rate days2death
-1          1 twitter_sird 0.2897216 0.1668548 0.01701741  1.3750113         20
-2          2 twitter_sird 0.2588033 0.1284935 0.02940307  0.8216918         20
-3          1 twitter_sird 0.2897216 0.1668548 0.01701741  1.3750113         20
-4          2 twitter_sird 0.2588033 0.1284935 0.02940307  0.8216918         20
-  ode_solver                 description d_in_interval tweets_in_interval
-1      block SIRTD sim: 2 runs no tweets             0                  1
-2      block SIRTD sim: 2 runs no tweets             0                  1
-3      block    SIRTD sim: 2 runs tweets             0                 11
-4      block    SIRTD sim: 2 runs tweets             0                 19
+ 'tweet_rate', 'days2death', 'ode_solver', 'description', + 'd_in_interval', 'tweets_in_interval') +print(run_df[,summary_cols]) +# section 8

There are many more result views available that we will eventually integrate into this document. They include:

  • Time series analyis, e.g., what happens with simulations that only provide the first interval of data to the models?
-
diff --git a/stan/tweet_sird.stan b/stan/tweet_sird.stan index 1f8c799..d590011 100644 --- a/stan/tweet_sird.stan +++ b/stan/tweet_sird.stan @@ -98,8 +98,8 @@ transformed data { for (i in 2:n_days) { ts[i] = ts[i - 1] + 1; } - print("tweets_s_c", tweets_s_c); - print("deaths_s_c", deaths_s_c); +# print("tweets_s_c", tweets_s_c); +# print("deaths_s_c", deaths_s_c); } parameters { real gamma; From bed1aae6250bc958bee6665824a5d4f87be50d10 Mon Sep 17 00:00:00 2001 From: Jose Storopoli Date: Tue, 27 Jul 2021 06:40:08 -0300 Subject: [PATCH 03/12] corrections and typos --- runEval.Rmd | 101 ++++++++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/runEval.Rmd b/runEval.Rmd index 330becf..18034a3 100644 --- a/runEval.Rmd +++ b/runEval.Rmd @@ -15,22 +15,22 @@ set.seed(4857) # Introduction -This page describes the simulation, modeling and evaluation code in `R/runEval.R` The goal is to provide sufficient information to run and modify the code. The focus for the current data generating process is SIRTD epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered --SIR models. The current simulations add compartments for (T) terminal and (D) dead counts useful for the grim task of distinguishing the dead from recovered. There is additionally data generated for tweets from individuals in the infected compartment but that exists separate from the core SIRTD model. +This page describes the simulation, modeling and evaluation code in `R/runEval.R`. The goal is to provide sufficient information to run and modify the code. The focus for the current data generating process (DGP) is SIRTD epidemiological models derived from compartments for the states (S) susceptible, (I) infectious and (R) for recovered -- SIR models. The current simulations add compartments for (T) terminally-ill and (D) deceased counts useful for the grim task of distinguishing the deceased from recovered. There is additionally data generated for tweets that mention symptoms from individuals in the (I) infectious compartment but that exists separate from the core SIRTD model. -For more explanation about SIR models see [https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html](https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html) which we found to be a good explanation as well as an excellent introduction to Bayesian approaches using Stan. This is the model used to fit this data, there is a model reproduction check list at [https://codatmo.github.io/Simple_SIR/](https://codatmo.github.io/Simple_SIR/). +For more explanation about SIR models see [Stan Case Study -- Boarding School](https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html) which we found to be a good explanation as well as an excellent introduction to Bayesian approaches using Stan. This is the model used to fit this data, there is a model reproduction checklist at [`codatmo/Simple_SIR`](https://codatmo.github.io/Simple_SIR/). # Navigation -This Rmarkdown page exists as part of the [CoDatMo](https://codatmo.github.io/) project. The repo containing this page is at [https://github.com/codatmo/dataGeneratingProcess1/](https://github.com/codatmo/dataGeneratingProcess/) as `runEval.Rmd` and rendered as `runEval.html`. +This Rmarkdown page exists as part of the [CoDatMo](https://codatmo.github.io/) project. The repo containing this page is at [`codatmo/dataGeneratingProcess1`](https://github.com/codatmo/dataGeneratingProcess/) as `runEval.Rmd` and rendered as `runEval.html`. -# Data Generating Process +# Data Generating Process -- DGP -The data generating process simulates a fixed population at the individual level on a daily basis. There are a series of simulating DGPs with varied properties. This document covers a DGP that closely mirrors the compartment model structure with real valued compartments with population transfers between states computed via real valued multiplication on each day of the simulation. There is no stochastic element at all in this simulation since it serves as a baseline for more complex models which will have stochastic (random) elements as well as operating on different scales, e.g., agent-based models. +The DGP simulates a fixed population at the individual level on a daily basis. There are a series of simulating DGPs with varied properties. This document covers a DGP that closely mirrors the compartment model structure with real-valued compartments with population transfers between states computed via real-valued multiplication on each day of the simulation. There is no stochastic (random) element at all in this simulation since it serves as a baseline for more complex models which will have stochastic elements as well as operating on different scales, e.g., agent-based models. -The actual code is shown below, the Rmarkdown commands are included to be clear about where code is and how the document is being generated. Note that there is a `pull_section` function that prints sections of code based on being between comments that will be used throughout. This allows software to exist outside the notebook format without introducing copy/paste errors that often come up or other mismatches between running code and it's description. +The actual code is shown below, the Rmarkdown commands are included to be clear about where code is and how the document is being generated. Note that there is a `pull_section` function that prints sections of code based on being between comments that will be used throughout. This allows software to exist outside the notebook format without introducing copy/paste errors that often come up or other mismatches between running code and it's description. ```{r echo=TRUE } -#' Returns lines of file between the identified section '#
' and +#' Returns lines of file between the identified section '#
' and #' the matching section later in the file or the end of the file. #' @param sections Comment that starts/stops collection of lines, e.g., '# section 1' #' @param file Path to file to pull from @@ -57,38 +57,39 @@ pull_section <- function(section, file, num_lines = FALSE) { return(return_string) } -cat(pull_section('# sirtd_vary_beta_exact', here::here('R','SIRTDsim.R'))) +cat(pull_section('# sirtd_vary_beta_exact', here::here('R', 'SIRTDsim.R'))) ``` -The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at [to be determined](). There exists or will exist simulations that do the following: +The focus on this presentation is the surrounding framework so we offer the simulation code without comment, discussion of the DGP/simulation is at [to be determined](). We are planning to include the following simulations: -* SIRTD model, real valued, non-stochastic: This model -* SIRTD model, real valued, stochastic -* SIRTD model, real valued, stochastic, varied beta -* SIRTD model, integer valued, agent based, stochastic -* SIRTD model, integer valued, agent based, varied beta +* SIRTD model, real-valued, non-stochastic (this model) +* SIRTD model, real-valued, stochastic +* SIRTD model, real-valued, stochastic, variying-beta +* SIRTD model, integer-valued, agent-based, stochastic +* SIRTD model, integer-valued, agent-based, variying-beta -We may also run models that are time limited, feature richer social interaction models etc... +We may also run models that are time-limited, feature-richer social interaction models etc... # Running simulation and fitting a model Below we go over how to setup and run the simulation code that generates data and runs experiments. The central code is in `R/runEval.R` with use of functions in `R/util.R` for printing/graphing functions, `R/data_configs.R` for code that creates data via simulation or provides actual data, `R/modeling_configs.R` for code that configures parameters for modeling software and `R/SIRTDsim.R` that contains simulation software and is called from functions in `R/data_configs.R`. The libraries and dependencies are: ```{r} -cat(pull_section('# dependencies', here::here('R','runEval.R'))) +cat(pull_section('# dependencies', here::here('R', 'runEval.R'))) ``` ## Setting up the `run_df` data frame that controls the experiments ```{r} -cat(pull_section('# setup run_df', here::here('R','runEval.R'), num_lines = TRUE)) +cat(pull_section('# setup run_df', here::here('R', 'runEval.R'), num_lines = TRUE)) ``` The `run_df` is a dataframe where each row will be a complete experiment configuration. The above functions add information relevant to their role. The source is very straightforward should one want to modify for their own purposes. In more detail: ```{r} -cat(pull_section('setup_run_df', here::here('R','runEval.R'), num_lines = TRUE)) +cat(pull_section('setup_run_df', here::here('R', 'runEval.R'), num_lines = TRUE)) ``` -Running code up to this line shows the following values, note that the dataframe is transposed with rows as columns and columns as rows--unreadable otherwise. + +Running code up to this line shows the following values, note that the dataframe is transposed with rows as columns and columns as rows--unreadable otherwise: ```{r echo=FALSE} # dependencies @@ -97,47 +98,50 @@ library(cmdstanr) library(data.table) library(kableExtra) -source(here::here("R","util.R")) -source(here::here("R","SIRTDsim.R")) -source(here::here("R","sim_configs.R")) -source(here::here("R","modeling_configs.R")) +source(here::here("R", "util.R")) +source(here::here("R", "SIRTDsim.R")) +source(here::here("R", "sim_configs.R")) +source(here::here("R", "modeling_configs.R")) # dependencies # setup run_df run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) kable(t(run_df)) ``` -This line constructs a basic template for the `run_df`, there is just one row at this point--remember transposed output. All subsequent additions/elaborations draw from this dataframe. +This line constructs a basic template for the `run_df`, there is just one row at this point--remember transposed output. All subsequent additions/elaborations draw from this dataframe: + ```{r} -cat(pull_section('sim_brazil_1', here::here('R','runEval.R'), num_lines = TRUE)) +cat(pull_section('sim_brazil_1', here::here('R', 'runEval.R'), num_lines = TRUE)) ``` -The `sim_brazil_1` is a parameterization that roughly mimics the shape of the COVID-19 outbreak with corresponding simulated values as determined by trying values by hand. Look at the source for more info. Note that the prefix on the function is 'sim', which means that it is part of simulation. There is still one row in the dataframe but with simulation data and the generating paramaterzation. +The `sim_brazil_1` is a parameterization that roughly mimics the shape of the COVID-19 outbreak with corresponding simulated values as determined by trying values by hand. Look at the source code for more info. Note that the prefix on the function is `sim`, which means that it is part of simulation. There is still one row in the dataframe but with simulation data and the generating parameterization. The, remember transposed, dataframe is: ```{r echo=FALSE} run_df <- sim_brazil_1(run_df) -kable(t(run_df), digits=3) +kable(t(run_df), digits = 3) ``` -We now have simulation data from the SIRTD model for `n_days`. But still only one row--remember transposed above. +We now have simulation data from the SIRTD model for `n_days`. But still only one row--remember transposed above: ```{r} cat(pull_section('sim_draw_params', here::here('R','runEval.R'), num_lines = TRUE)) ``` -The next line runs two random based draws from the parmeterization in the single simulation in `run_df`. Two draws are conducted and returned. + +The next line runs two random based draws from the parameterization in the single simulation in `run_df`. Two draws are conducted and returned: ```{r echo=FALSE} draws_run_df <- sim_draw_params(2, run_df) kable(t(draws_run_df)) ``` -Above are transposed output for two draws for slight variations of the Brazil hand parameterization. +Above are transposed output for two draws for slight variations of the Brazil hand parameterization. ```{r} -cat(pull_section("rbind", here::here('R','runEval.R'), num_lines = TRUE)) +cat(pull_section("rbind", here::here('R', 'runEval.R'), num_lines = TRUE)) ``` -We bind the Brazil run with the draws resulting in 3 rows. + +We bind the Brazil run with the draws resulting in 3 rows: ```{r echo=FALSE} run_df <- rbind(run_df, draws_run_df) @@ -146,13 +150,13 @@ kable(t(run_df), digits = 3) ## Setting up modeling -At this point we have three sets of data, one hand configured to mirror Brazil data and two that vary simulation parameters from the hand configured model. Now we apply modeling configurations with subroutines from `R/modeling_configs.R`. +At this point we have three sets of data, one hand configured to mirror Brazil data and two that vary simulation parameters from the hand configured model. Now we apply modeling configurations with subroutines from `R/modeling_configs.R`. ```{r} -cat(pull_section("model_stan_SIRDs", here::here('R','runEval.R'), num_lines = TRUE)) +cat(pull_section("model_stan_SIRDs", here::here('R', 'runEval.R'), num_lines = TRUE)) ``` -There are two model configurations which apply to the three rows uniformly so we end up with 6 configurations total. One model applies tweets to the task, and one does not. The final result is: +There are two model configurations which apply to the three rows uniformly. Hence, we end up with 6 configurations total. One model applies tweets to the task, and one does not. The final result is: ```{r echo=FALSE} run_df <- model_stan_SIRDs(run_df) @@ -160,31 +164,34 @@ kable(t(run_df), digits = 3) ``` ## Configuring reporting -The specific reporting can be controlled from the `run_df` as well. + +The specific reporting can be controlled from the `run_df` as well. ```{r} cat(pull_section("reports <-", here::here('R','runEval.R'), num_lines = TRUE)) ``` -After each `run_df` iteration the listed graphing and print routines are controlled via the above. This list is all available, they are explained below. +After each `run_df` iteration the listed graphing and print routines are controlled via the above. This list is all available, they are explained in the next section. ## Running models -Each row of `run_df` is a complete specification of data and model configuration. The code simply iterates over each row, fits it with the appropriate model/model configuration if there is one specified and adds the results of computing how many simulated days data for tweets and deaths are in the .2 to .8 central interval of the samples for the model run. See `R/util.R` for the implementation of `countPredictionsInQuantile`. +Each row of `run_df` is a complete specification of both data and model configuration. The code simply iterates over each row, fits it with the appropriate model/data configuration (if specified) and adds the results by computing simulated days data for tweets and deaths using the .2 to .8 central sample's interval for the model run. See `R/util.R` for the implementation of `countPredictionsInQuantile`. ```{r} cat(pull_section('# run models', here::here('R','runEval.R'))) ``` -Only two models are available at present. + +Only two models are available at present. + ## Presenting results -At this point results will likely get more idiosyncratic to experiment needs. The below if statements check the listed values for `reports` and apply the corresponding reporting. Note that each run is reported with no attempt to collect them into a collection--we intend to add that functionality eventually. In order there are graphing options to: +At this point results will likely get more idiosyncratic to experiment needs. The below `if` statements check the listed values for `reports` and apply the corresponding reporting. Note that each run is reported with no attempt to collect them into a collection--we intend to add that functionality eventually.There are the following graphing options: -* Graph simulation compartments with dots. +* Graph simulation compartments with dots. * Graph ODE compartments if the Stan model mirrors the states of the simulation, e.g., is roughly isomorphic. -* Graph predicted tweets and predicted 'd' or deaths from the Stan model. If the likelihood is run then this is a posterior predictive check, if the likelihood is not run then it is a prior predictive check. There is a 20%/80% central interval ribbon plot as well. +* Graph predicted tweets and predicted `d` or deceased from the Stan model. If the likelihood is computed then this is a **posterior predictive check**, if the likelihood is *not* computed then it is a **prior predictive check**. There is a 20%/80% central interval ribbon plot as well. * Control over whether the plots are printed. All plots for a single run are overlaid. -All subroutines are in the `R/Util.R` file. +All subroutines are in the `R/Util.R` file. ```{r} cat(pull_section('# section 6', here::here('R','runEval.R'))) @@ -196,21 +203,21 @@ A parameter recovery report finalizes the reporting options per run: ```{r} cat(pull_section('# section 7', here::here('R','runEval.R'))) ``` + which reports on whether the simulation parameters and those from the model. # All run summary + ```{r echo=FALSE} # cat(param_recovery(data_df = run_df[j,], fit = fit)) ``` -A summary across all the runs is done next by reporting relevant columns of the run_df: +A summary across all the runs is done next by reporting relevant columns of the `run_df`: ```{r} cat(pull_section('# section 8', here::here('R','runEval.R'))) ``` - - There are many more result views available that we will eventually integrate into this document. They include: -* Time series analyis, e.g., what happens with simulations that only provide the first interval of data to the models? +* Time series analysis, e.g., what happens with simulations that only provide the first interval of data to the models? From 7fb85071c599f1d6094573e02c63954472835968 Mon Sep 17 00:00:00 2001 From: Breck Date: Fri, 30 Jul 2021 09:00:37 -0400 Subject: [PATCH 04/12] checkout --- R/SIRTDsim.R | 245 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 244 insertions(+), 1 deletion(-) diff --git a/R/SIRTDsim.R b/R/SIRTDsim.R index 8ca210f..f0a64d4 100644 --- a/R/SIRTDsim.R +++ b/R/SIRTDsim.R @@ -1,4 +1,92 @@ -sirtd_vary_beta <- function(seed, +sirtd_vary_beta2 <- function(seed, + n_pop, + n_days, + print, + beta_daily_inf_rates, + gamma_res_per_day_rate, + death_prob, + tweet_rate_infected, + n_patient_zero, + mean_days_to_death_from_t) { + set.seed(seed) + s <- (n_patient_zero + 1):n_pop + i <- 1:n_patient_zero + r <- c() + t <- c() + d <- c() + tweets = 0 + col_names = c('day', 's', 'i', 'r', 't', 'd', 'tweets') + df = data.frame(matrix(nrow = 0, ncol=length(col_names))) + colnames(df) = col_names + for (day in 1:n_days) { + df[day,] = rep(NA,length(col_names)) #setup data, will cause errors if I miss something + df[day,]$s= length(s) + df[day,]$i = length(i) + df[day,]$r = length(r) + df[day,]$t = length(t) + df[day,]$d = length(d) + df[day,]$day = day + df[day,]$tweets = tweets + i_next <- c() + t_next <- c() + if (print) { + cat( + sprintf( + "T Day=%d, susceptible=%d, infected=%d, recovered=%d, terminal=%d, + dead=%d, tweets=%d, R0=%.2f\n", + df[day,]$day, df[day,]$s, df[day,]$i, df[day,]$r, df[day,]$t, + df[day,]$d, df[day,]$tweets, + beta_daily_inf_rates[day]/gamma_res_per_day_rate)) + } + tweets = 0 #start fresh every day, certainly wrong. + for (per in i) { + #end infectious period + tweets = tweets + rpois(1, tweet_rate_infected) + if (rbinom(n = 1, size = 1, prob = gamma_res_per_day_rate) == 1) { + if (rbinom(n = 1, size = 1, prob = death_prob) == 1) { + t_next <- c(t_next, per) + } + else { + r <- c(r, per) + } + } + else { + i_next <- c(i_next, per) + } + } + for(per in t) { + if (rbinom(n = 1, size = 1, prob = 1/mean_days_to_death_from_t) == 1) { + d <- c(d, per) + } + else { + t_next <- c(t_next, per) + } + } + s_delta <- 0 + for (per in i) { +# print("I") +# print(i) + for (other_per in 1:rpois(n = 1, lambda = beta_daily_inf_rates[day])) { + if (length(s)- s_delta >= 1) { + # print("S_next") +# print(s_next) + # i_next <- c(i_next, s[1 + s_delta]) # For now everyone gets a unique int id + s_delta <- s_delta + 1 + } + } + } + if (s_delta > 0) { + i_next <- c(i, s[1:s_delta]) + s <- s[(s_delta + 1):length(s)] + } + i <- i_next + t <- t_next + } + return(df) +} + + +sirtd_vary_beta_orig <- function(seed, n_pop, n_days, print, @@ -68,3 +156,158 @@ sirtd_vary_beta <- function(seed, return(df) } +sirtd_vary_beta_1 <- function(seed, + n_pop, + n_days, + print, + beta_daily_inf_rates, + gamma_res_per_day_rate, + death_prob, + tweet_rate_infected, + n_patient_zero, + mean_days_to_death_from_t) { + set.seed(seed) + i <- n_patient_zero + r <- 0 + t <- 0 + d <- 0 + s <- n_pop - n_patient_zero + tweets = 0 + col_names = c('day', 's', 'i', 'r', 't', 'd', 'tweets') + df = data.frame(matrix(nrow = 0, ncol=length(col_names))) + colnames(df) = col_names + for (day in 1:n_days) { + df[day,] = rep(NA,length(col_names)) + df[day,]$s= s + df[day,]$i = i + df[day,]$r = r + df[day,]$t = t + df[day,]$d = d + df[day,]$day = day + df[day,]$tweets = tweets + if (print) { + cat( + sprintf( + "\nDay=%d, susceptible=%d, infected=%d, recovered=%d, terminal=%d, + dead=%d, tweets=%d, R0=%.2f %d", + df[day,]$day, df[day,]$s, df[day,]$i, df[day,]$r, df[day,]$t, + df[day,]$d, df[day,]$tweets, + beta_daily_inf_rates[day]/gamma_res_per_day_rate, + s + i + r + t + d)) + } + tweets = 0 #start fresh every day, certainly wrong. + counter <- 0 + repeat { #move from t to d + if (counter >= t) { + break + } + if (rbinom(n = 1, size = 1, prob = 1/mean_days_to_death_from_t) == 1) { + t <- t - 1 + d <- d + 1 + } + counter <- counter + 1 + } + counter <- 0 + i_today <- i + repeat { + if (counter >= i_today){ + break + } + #end infectious period + tweets = tweets + rpois(1, tweet_rate_infected) + #infect others + for (other_per in 1:rpois(n = 1, lambda = beta_daily_inf_rates[day])) { + if (s > 1) { + s <- s - 1 + i <- i + 1 + } + } + #move from i to t,r + if (rbinom(n = 1, size = 1, prob = gamma_res_per_day_rate) == 1) { + i <- i - 1 + if (rbinom(n = 1, size = 1, prob = death_prob) == 1) { + t <- t + 1 + } + else { + r <- r + 1 + } + } + counter <- counter + 1 + } + } + return(df) +} + +# sirtd_vary_beta_exact +#' Runs a simple deterministic SIRTD model with possible daily varied beta values. +#' @param seed Seed for stochastic processes, ignored since this is a deterministic function +#' @param n_pop Population size +#' @param n_days Number of days to run simulation +#' @param print Boolean to control whether daily summary is printed +#' @param beta_daily_inf_rates Controls number of s that are infected by i to +#' become i. Should be between 0 and 1 +#' @param gamma_res_per_day_rate Controls number of i that transition to r or t +#' per day. Should be between 0 and 1 +#' @param death_prob Probability transitioning from i to t +#' @param tweet_rate_infected If infected, what rate of tweets per day. Can be +#' > 1 +#' @param n_patient_zero Number of patients infected at day 0 +#' @param mean_days_to_death_from_t Mean number of days in t before d +#' @return dataframe simulating sirtd+tweets state for number days +sirtd_vary_beta_exact <- function(seed, + n_pop, + n_days, + print, + beta_daily_inf_rates, + gamma_res_per_day_rate, + death_prob, + tweet_rate_infected, + n_patient_zero, + mean_days_to_death_from_t) { + old_seed <- .Random.seed + on.exit({.Random.seed <<- old_seed}) + set.seed(seed) + i <- n_patient_zero + r <- 0 + t <- 0 + d <- 0 + s <- n_pop - n_patient_zero + col_names = c('day', 's', 'i', 'r', 't', 'd', 'tweets') + df = data.frame(matrix(nrow = 0, ncol=length(col_names))) + colnames(df) = col_names + tweets = 0 + for (day in 1:n_days) { + df[day,] = rep(NA,length(col_names)) + df[day,]$s= round(s) + df[day,]$i = round(i) + df[day,]$r = round(r) + df[day,]$t = round(t) + df[day,]$d = round(d) + df[day,]$day = day + df[day,]$tweets = round(tweets) + if (print) { + cat( + sprintf( + "\nDay=%d, susceptible=%.2f, infected=%.2f, recovered=%.2f, terminal=%.2f, + dead=%.2f, tweets=%.2f, R0=%.2f %.2f", + df[day,]$day, df[day,]$s, df[day,]$i, df[day,]$r, df[day,]$t, + df[day,]$d, df[day,]$tweets, + beta_daily_inf_rates[day]/gamma_res_per_day_rate, + s + i + r + t + d)) + } + resolved <- gamma_res_per_day_rate * i + terminal <- death_prob * resolved + recovered <- resolved - terminal + dead <- 1/mean_days_to_death_from_t * t + tweets = tweet_rate_infected * i + infected <- min(i * beta_daily_inf_rates[day], s) + s <- s - infected + i <- i - resolved + infected + r <- r + recovered + t <- t - dead + terminal + d <- d + dead + } + return(df) +} +# sirtd_vary_beta_exact + From dc49ef297a4b8d6611a1ff4afa2392245d9b515f Mon Sep 17 00:00:00 2001 From: Jose Storopoli Date: Fri, 30 Jul 2021 11:54:53 -0300 Subject: [PATCH 05/12] tweet_sirtd_negbin_ODE model inside runEval.R --- R/runEval.R | 43 +++++++++++++++++++------------- stan/tweet_sirtd_negbin_ODE.stan | 22 +++++++++------- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/R/runEval.R b/R/runEval.R index 07e421f..5757b98 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -13,7 +13,7 @@ source(here::here("R","modeling_configs.R")) run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) # in R/util.R run_df <- sim_brazil_1(run_df) # in R/sim_configs.R draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R -run_df <- rbind(run_df, draws_run_df) +run_df <- rbind(run_df, draws_run_df) run_df <- model_stan_SIRDs(run_df) #in R/modeling_configs.R run_df$ode_solver <- 'block' # set ODE solver in Stan program across all runs @@ -24,12 +24,12 @@ run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'pl for (j in 1:nrow(run_df)) { fit <- NA if (run_df[j,]$model_to_run == 'twitter_sird') { - stan_data <- + stan_data <- list(n_days = run_df[j,]$n_days, sDay1 = run_df[j,]$n_pop - 1, iDay1 = 1, rDay1 = 0, - dDay1 = 0, + dDay1 = 0, NPop = run_df[j,]$n_pop, tweets = unlist(run_df[j,]$tweets), deaths = unlist(run_df[j,]$d), @@ -38,7 +38,7 @@ for (j in 1:nrow(run_df)) { run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0), run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0)) model <- cmdstan_model(here::here("stan", "tweet_sird.stan")) - + fit <- model$sample(data=stan_data, parallel_chains = 4, iter_warmup = 1000, @@ -47,18 +47,27 @@ for (j in 1:nrow(run_df)) { seed = 4857) run_df[j,]$fit = list(fit) } - else if (run_df[j,]$model_to_run == 'tweet_sird_negbin_optimized') { + else if (run_df[j,]$model_to_run == 'tweet_sirtd_negbin_ODE') { stan_data_2 <- list(n_days = run_df[j,]$n_days, - y0 = c(run_df[j,]$n_pop - run_df[j,]$n_patient_zero, + y0 = c(run_df[j,]$n_pop - run_df[j,]$n_patient_zero, run_df[j,]$n_patient_zero, 0, 0, 0), # one more zero here t0 = 0, ts = 1:run_df[j,]$n_days, compute_likelihood = run_df[j,]$compute_likelihood, use_twitter = run_df[j,]$apply_twitter_data, - death_count = run_df[j,]$d, - symptomaticTweets = run_df[j,]$tweets, - trapezoidal_solver = 0) - model2 <- cmdstan_model(here::here("stan", "tweet_sird_negbin_optimized.stan")) + death_count = unlist(run_df[j,]$d), + symptomaticTweets = unlist(run_df[j,]$tweets), + prior_beta_mean = 0, + prior_beta_std = 10, + prior_omega_mean = 0, + prior_omega_std = 10, + prior_dI_mean = 0, + prior_dI_std = 10, + prior_dT_mean = 0, + prior_dT_std = 10, + prior_twitter_lambda = 1 + ) + model2 <- cmdstan_model(here::here("stan", "tweet_sirtd_negbin_ODE.stan")) fit <- model2$sample(data=stan_data_2, parallel_chains = 4, iter_warmup = 1000, @@ -67,9 +76,9 @@ for (j in 1:nrow(run_df)) { seed = 4857) } if (run_df[j,]$model_to_run != 'none') { - d_tweets_in_interval = countPredictionsInQuantile(fit = fit, - run_df = run_df, - j = j, print = TRUE) + d_tweets_in_interval = countPredictionsInQuantile(fit = fit, + run_df = run_df, + j = j, print = TRUE) run_df[j,]$d_in_interval = d_tweets_in_interval[1] run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] } @@ -86,13 +95,13 @@ for (j in 1:nrow(run_df)) { plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit, hide_s = TRUE) } if ('graph_tweets' %in% unlist(run_df[j,]$reports)) { - plot <- plot_predictions(plot = plot, prediction_label = 'pred_tweets', - fit = fit, + plot <- plot_predictions(plot = plot, prediction_label = 'pred_tweets', + fit = fit, show_ribbon = TRUE) } if ('graph_d' %in% unlist(run_df[j,]$reports)) { - plot <- plot_predictions(plot = plot, prediction_label = 'pred_deaths', - fit = fit, + plot <- plot_predictions(plot = plot, prediction_label = 'pred_deaths', + fit = fit, show_ribbon = TRUE) } if ('plot' %in% unlist(run_df[j,]$reports)) { diff --git a/stan/tweet_sirtd_negbin_ODE.stan b/stan/tweet_sirtd_negbin_ODE.stan index 0ef0309..e98660f 100644 --- a/stan/tweet_sirtd_negbin_ODE.stan +++ b/stan/tweet_sirtd_negbin_ODE.stan @@ -64,9 +64,9 @@ transformed parameters{ real phi_twitter = 1.0 / phi_twitter_inv; // States to be recovered - vector[5] state_estimate[n_days]; + vector[5] daily_counts_ODE[n_days]; - state_estimate = ode_rk45_tol(sird, y0, t0, ts, + daily_counts_ODE = ode_rk45_tol(sird, y0, t0, ts, 1e-6, 1e-6, 1000, // if you want custom tolerances beta, omega, dI, dT); } @@ -83,10 +83,10 @@ model { if (compute_likelihood == 1){ for (i in 1:n_days) { if (use_twitter == 1) { - symptomaticTweets[i] ~ neg_binomial_2(twitter_rate * state_estimate[i, 2], + symptomaticTweets[i] ~ neg_binomial_2(twitter_rate * daily_counts_ODE[i, 2], phi_twitter); } - death_count[i] ~ neg_binomial_2(state_estimate[i, 5], phi); + death_count[i] ~ neg_binomial_2(daily_counts_ODE[i, 5], phi); } } } @@ -100,11 +100,11 @@ generated quantities { vector[n_days] state_D; // Populate States - state_S = to_vector(state_estimate[, 1]); - state_I = to_vector(state_estimate[, 2]); - state_R = to_vector(state_estimate[, 3]); - state_T = to_vector(state_estimate[, 4]); - state_D = to_vector(state_estimate[, 5]); + state_S = to_vector(daily_counts_ODE[, 1]); + state_I = to_vector(daily_counts_ODE[, 2]); + state_R = to_vector(daily_counts_ODE[, 3]); + state_T = to_vector(daily_counts_ODE[, 4]); + state_D = to_vector(daily_counts_ODE[, 5]); // Tweets vector[n_days] state_tweets; @@ -115,6 +115,7 @@ generated quantities { real R0 = beta * dI; int pred_deaths[n_days]; int pred_tweets[n_days]; + real gamma = dI; // returning dI as gamma for breck's for (i in 1:n_days) { if (compute_likelihood == 1) { pred_deaths[i] = neg_binomial_2_rng(state_D[i], phi); @@ -123,5 +124,8 @@ generated quantities { pred_tweets[i] = neg_binomial_2_rng(twitter_rate * state_I[i], phi_twitter); } + else { + pred_tweets[i] = 0; + } } } From 4ffe2b4b4fc7e3a20626f425c43dc93ac6610150 Mon Sep 17 00:00:00 2001 From: Breck Date: Fri, 30 Jul 2021 20:16:45 -0400 Subject: [PATCH 06/12] got something closer to Brazil working and UNINOVE model running --- R/modeling_configs.R | 30 +++++++++++++++++++++++------- R/runEval.R | 18 +++++++++--------- R/sim_configs.R | 7 ++++--- R/util.R | 12 +++++++++--- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/R/modeling_configs.R b/R/modeling_configs.R index 8557ec7..a4806f2 100644 --- a/R/modeling_configs.R +++ b/R/modeling_configs.R @@ -1,17 +1,33 @@ #' Returns SIRD configurations for model 'twitter_sird' that use tweets and #' not. ODE solver not specified #' @param run_df The dataframe that is being copied and built up. -model_stan_SIRDs <- function(run_df) { - +model_stan_baseline <- function(run_df) { + model_no_tweets_df <- copy(run_df) model_no_tweets_df$apply_twitter_data <- 0 - model_no_tweets_df$model_to_run <- 'twitter_sird' model_no_tweets_df$description <- paste0(model_no_tweets_df$description, - "no tweets") + " no tweets") model_tweets_df <- copy(run_df) model_tweets_df$apply_twitter_data <- 1 - model_tweets_df$model_to_run <- 'twitter_sird' model_tweets_df$description <- paste0(model_tweets_df$description, - "use tweets") - return(rbind(model_no_tweets_df, model_tweets_df)) + " use tweets") + + combined_df = rbind(model_no_tweets_df, model_tweets_df) + combined_df$ode_solver <- 'block' + combined_df$model_to_run <- 'baseline' + return(combined_df) +} + + +#' Returns SIRD configurations for model 'twitter_sird' that use tweets and +#' not. ODE solver not specified +#' @param run_df The dataframe that is being copied and built up. +model_stan_UNINOVE_Brazil <- function(run_df) { + + model_df <- copy(run_df) + model_df$apply_twitter_data <- 1 + model_df$model_to_run <- 'UNINOVE_Brazil' + model_df$description <- paste0(model_df$description, + " UNINOVE_Brazil") + return(model_df) } \ No newline at end of file diff --git a/R/runEval.R b/R/runEval.R index 5757b98..5ea1e22 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -10,20 +10,20 @@ source(here::here("R","sim_configs.R")) source(here::here("R","modeling_configs.R")) # dependencies # setup run_df -run_df <- setup_run_df(seed = 93435, n_pop = 1000, n_days = 7) # in R/util.R +run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R run_df <- sim_brazil_1(run_df) # in R/sim_configs.R draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R run_df <- rbind(run_df, draws_run_df) -run_df <- model_stan_SIRDs(run_df) #in R/modeling_configs.R -run_df$ode_solver <- 'block' # set ODE solver in Stan program across all runs +run_df <- model_stan_UNINOVE_Brazil(run_df) #in R/modeling_configs.R run_df$compute_likelihood <- 1 # compute likelihood across all runs -run_df$reports <- list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) +run_df$reports <- list(c('graph_sim', 'graph_tweets', 'graph_d', 'plot')) + # list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) # setup run_df # run models for (j in 1:nrow(run_df)) { fit <- NA - if (run_df[j,]$model_to_run == 'twitter_sird') { + if (run_df[j,]$model_to_run == 'baseline') { stan_data <- list(n_days = run_df[j,]$n_days, sDay1 = run_df[j,]$n_pop - 1, @@ -47,7 +47,7 @@ for (j in 1:nrow(run_df)) { seed = 4857) run_df[j,]$fit = list(fit) } - else if (run_df[j,]$model_to_run == 'tweet_sirtd_negbin_ODE') { + else if (run_df[j,]$model_to_run == 'UNINOVE_Brazil') { stan_data_2 <- list(n_days = run_df[j,]$n_days, y0 = c(run_df[j,]$n_pop - run_df[j,]$n_patient_zero, run_df[j,]$n_patient_zero, 0, 0, 0), # one more zero here @@ -78,7 +78,7 @@ for (j in 1:nrow(run_df)) { if (run_df[j,]$model_to_run != 'none') { d_tweets_in_interval = countPredictionsInQuantile(fit = fit, run_df = run_df, - j = j, print = TRUE) + j = j, print = FALSE) run_df[j,]$d_in_interval = d_tweets_in_interval[1] run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] } @@ -116,7 +116,7 @@ for (j in 1:nrow(run_df)) { } # section 8 summary_cols = c('sim_run_id', 'model_to_run', 'beta_mean', 'gamma', 'death_prob', - 'tweet_rate', 'days2death', 'ode_solver', 'description', - 'd_in_interval', 'tweets_in_interval') + 'tweet_rate', 'days2death', 'description', + 'd_in_interval', 'tweets_in_interval','n_days') print(run_df[,summary_cols]) # section 8 diff --git a/R/sim_configs.R b/R/sim_configs.R index 6694563..26e8445 100644 --- a/R/sim_configs.R +++ b/R/sim_configs.R @@ -79,7 +79,7 @@ sim_brazil_1 <- function (source_df) { run_df$n_days)) run_df$gamma <- .255 run_df$death_prob <- .01 - run_df$tweet_rate <- .25 + run_df$tweet_rate <- .1 run_df$days2death <- 10 run_df$n_patient_zero <- 10 run_df$description <- "Brazil 1 approximation" @@ -103,5 +103,6 @@ sim_brazil_1 <- function (source_df) { run_df$tweets <- list(sim_df$tweets) return(run_df) } - -#sim_brazil_1() +#214,110,287 +#run_df <- setup_run_df(seed = 123, n_pop = 214110287, n_days = 365) +#sim_brazil_1(run_df) diff --git a/R/util.R b/R/util.R index 156c0a1..4286c7a 100644 --- a/R/util.R +++ b/R/util.R @@ -51,10 +51,16 @@ countPredictionsInQuantile <- function(fit, run_df, j, print = FALSE) { if (print) { predCasesLine = 'predicted cases' - predCasesRibbon = paste0('predicted cases ',minQuantileLabel, '/', + predCasesRibbon = paste0('predicted cases ', minQuantileLabel, '/', maxQuantileLabel) - print(sprintf("Predicted %d cases and %d tweets within interval", - deaths_in_interval, tweets_in_interval)) + days_p = sprintf("For %d out of %d days", deaths_in_interval, + run_df$n_days) + i = sprintf(" the predicted deaths were within %.1f and %.1f sd quantile of truth", + minQuantile, maxQuantile) + tweets_p = sprintf("\nFor %d days, ", tweets_in_interval) + i_2 = sprintf(" the predicted tweets were within %.1f and %.1f of truth", + minQuantile, maxQuantile) + cat(paste0(days_p, i, tweets_p, i_2)) } return(c(deaths_in_interval, tweets_in_interval)) } From d824b5d0472483eea78b5749a18eb6d2fa357d96 Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 1 Aug 2021 09:03:23 -0400 Subject: [PATCH 07/12] reworked to get reasonable output from stan model and runEval.R --- R/modeling_configs.R | 10 +-- R/runEval.R | 31 ++++--- R/util.R | 64 +++++++++----- stan/baseline.stan | 196 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 263 insertions(+), 38 deletions(-) create mode 100644 stan/baseline.stan diff --git a/R/modeling_configs.R b/R/modeling_configs.R index a4806f2..303d050 100644 --- a/R/modeling_configs.R +++ b/R/modeling_configs.R @@ -3,16 +3,16 @@ #' @param run_df The dataframe that is being copied and built up. model_stan_baseline <- function(run_df) { - model_no_tweets_df <- copy(run_df) - model_no_tweets_df$apply_twitter_data <- 0 - model_no_tweets_df$description <- paste0(model_no_tweets_df$description, - " no tweets") + #model_no_tweets_df <- copy(run_df) + #model_no_tweets_df$apply_twitter_data <- 0 + #model_no_tweets_df$description <- paste0(model_no_tweets_df$description, + # " no tweets") model_tweets_df <- copy(run_df) model_tweets_df$apply_twitter_data <- 1 model_tweets_df$description <- paste0(model_tweets_df$description, " use tweets") - combined_df = rbind(model_no_tweets_df, model_tweets_df) + combined_df = model_tweets_df # rbind(model_no_tweets_df, model_tweets_df) combined_df$ode_solver <- 'block' combined_df$model_to_run <- 'baseline' return(combined_df) diff --git a/R/runEval.R b/R/runEval.R index 5ea1e22..f3ed06a 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -1,3 +1,7 @@ +#todo +# serialize run_df?? json? +# run Brazil data with baseline to get params +# # dependencies library(tidyverse) library(cmdstanr) @@ -10,18 +14,21 @@ source(here::here("R","sim_configs.R")) source(here::here("R","modeling_configs.R")) # dependencies # setup run_df +#run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R run_df <- sim_brazil_1(run_df) # in R/sim_configs.R -draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R -run_df <- rbind(run_df, draws_run_df) +#draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R +#run_df <- rbind(run_df, draws_run_df) -run_df <- model_stan_UNINOVE_Brazil(run_df) #in R/modeling_configs.R +run_df <- model_stan_baseline(run_df) #in R/modeling_configs.R run_df$compute_likelihood <- 1 # compute likelihood across all runs run_df$reports <- list(c('graph_sim', 'graph_tweets', 'graph_d', 'plot')) - # list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) + #list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) # setup run_df # run models -for (j in 1:nrow(run_df)) { +j <- 0 +while (j < nrow(run_df)) { + j <- j + 1 fit <- NA if (run_df[j,]$model_to_run == 'baseline') { stan_data <- @@ -36,8 +43,10 @@ for (j in 1:nrow(run_df)) { compute_likelihood = run_df[j,]$compute_likelihood, run_twitter = run_df[j,]$apply_twitter_data, run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0), - run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0)) - model <- cmdstan_model(here::here("stan", "tweet_sird.stan")) + run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0), + scale = 1, + center = 1) + model <- cmdstan_model(here::here("stan", "baseline.stan")) fit <- model$sample(data=stan_data, parallel_chains = 4, @@ -78,7 +87,7 @@ for (j in 1:nrow(run_df)) { if (run_df[j,]$model_to_run != 'none') { d_tweets_in_interval = countPredictionsInQuantile(fit = fit, run_df = run_df, - j = j, print = FALSE) + j = j, print = TRUE) run_df[j,]$d_in_interval = d_tweets_in_interval[1] run_df[j,]$tweets_in_interval = d_tweets_in_interval[2] } @@ -89,10 +98,11 @@ for (j in 1:nrow(run_df)) { # section 6 plot <- ggplot(data = NULL, aes(x = day, y = count)) if ('graph_sim' %in% unlist(run_df[j,]$reports)) { - plot <- plot + graph_sim_data(data_df = run_df[j,], hide_s = TRUE) + plot <- graph_sim_data(data_df = run_df[j,], hide_s = TRUE, plot = plot) } if ('graph_ODE' %in% unlist(run_df[j,]$reports)) { - plot <- plot + graph_ODE(data_df = run_df[j,], fit = fit, hide_s = TRUE) + plot <- graph_ODE(data_df = run_df[j,], fit = fit, hide_s = TRUE, + plot = plot) } if ('graph_tweets' %in% unlist(run_df[j,]$reports)) { plot <- plot_predictions(plot = plot, prediction_label = 'pred_tweets', @@ -104,6 +114,7 @@ for (j in 1:nrow(run_df)) { fit = fit, show_ribbon = TRUE) } + plot = plot + theme(legend.position = "none") if ('plot' %in% unlist(run_df[j,]$reports)) { print(plot) } diff --git a/R/util.R b/R/util.R index 4286c7a..324ca4c 100644 --- a/R/util.R +++ b/R/util.R @@ -1,3 +1,7 @@ +library(ggplot2) +library(ggrepel) + + #' Count the number of truth data points contained in the quantile interval of #' model fit #' @param truth vector of true or simulated values @@ -31,7 +35,7 @@ countPredictionsInQuantile <- function(fit, run_df, j, print = FALSE) { predCasesDf = fit$summary(variables = c('pred_deaths'), mean, ~quantile(.x, probs = c(minQuantile, maxQuantile), - na.rm = TRUE)) #set to FALSE when Jose fixes his model + na.rm = FALSE)) #set to FALSE when Jose fixes his model predCasesDf$day = 1:nrow(predCasesDf) predTweetsDf = fit$summary(variables = c('pred_tweets'), mean, @@ -50,15 +54,12 @@ countPredictionsInQuantile <- function(fit, run_df, j, print = FALSE) { minQuantileLabel = minQuantileLabel) if (print) { - predCasesLine = 'predicted cases' - predCasesRibbon = paste0('predicted cases ', minQuantileLabel, '/', - maxQuantileLabel) - days_p = sprintf("For %d out of %d days", deaths_in_interval, - run_df$n_days) - i = sprintf(" the predicted deaths were within %.1f and %.1f sd quantile of truth", + days_p = sprintf("Over %d days, %d predicted deaths", run_df[j,]$n_days, + deaths_in_interval) + i = sprintf(" were in the %.1f and %.1f quantiles of the truth sd", minQuantile, maxQuantile) - tweets_p = sprintf("\nFor %d days, ", tweets_in_interval) - i_2 = sprintf(" the predicted tweets were within %.1f and %.1f of truth", + tweets_p = sprintf("\n%d predicted tweets,", tweets_in_interval) + i_2 = sprintf(" were within %.1f and %.1f of truth", minQuantile, maxQuantile) cat(paste0(days_p, i, tweets_p, i_2)) } @@ -70,7 +71,7 @@ countPredictionsInQuantile <- function(fit, run_df, j, print = FALSE) { #' y = count. #' @param data_df one row of the run_df with simulation data added #' @param hide_s Boolean to control whether to hide the s or susceptible counts -graph_sim_data <- function(data_df, hide_s) { +graph_sim_data <- function(data_df, hide_s, plot) { sim_df = data.frame(day = 1:data_df$n_days, tweets = unlist(data_df$tweets), s = unlist(data_df$s), @@ -83,30 +84,46 @@ graph_sim_data <- function(data_df, hide_s) { if (hide_s) { compartment_names <- compartment_names[-1] } + i_mean = mean(sim_df$i) + gt_mean_days = sim_df[sim_df$i >= i_mean,]$day + display_day = gt_mean_days[1] sim_long_df = gather(data = sim_df, key = "compartment_sim", value = "count", all_of(c('tweets', compartment_names))) - return(geom_point(data = sim_long_df, aes(y = count, - color = compartment_sim), - size = .5)) + return(plot + + geom_point(data = sim_long_df, aes(y = count, + color = compartment_sim), + size = .5) + + geom_label_repel(data = subset(sim_long_df, + day == display_day), + aes(label = compartment_sim, + color = compartment_sim))) + } #' Graph daily ODE means from SIRTD model. Returns a ggplot geom_line element #' @param data_df A row from run_df #' @param fit A fit object returned by cmdstanR with ode_states defined #' @param hide_s Boolean to control whether to hide susceptible counts -graph_ODE <- function(data_df, fit, hide_s) { - results_ODE_df = fit$summary(variables = c('ode_states')) - compartment_names = c('s', 'i', 'r', 't', 'd') +graph_ODE <- function(data_df, fit, hide_s, plot) { + compartment_names = c('S', 'I', 'R', 'D') if (hide_s) { compartment_names <- compartment_names[-1] } - ODE_df = data.frame(matrix(results_ODE_df$median, nrow = data_df$n_days, - ncol = length(compartment_names))) - colnames(ODE_df) = compartment_names - ODE_df$day = 1:data_df$n_days - ODE_long_df = gather(data = ODE_df, key = "compartment_ODE", value = "count", - all_of(compartment_names)) - return(geom_line(data = ODE_long_df, aes(color = compartment_ODE))) + ODE_long_df = data.frame() + for (name in compartment_names) { + compartment_ODE_df = fit$summary(variables = c(name)) + compartment_ODE_df$compartment_ODE = name + compartment_ODE_df$day = 1:nrow(compartment_ODE_df) + compartment_ODE_df$count = compartment_ODE_df$mean # maybe should be median + ODE_long_df = rbind(ODE_long_df,compartment_ODE_df) + } + return( + plot + + geom_line(data = ODE_long_df, aes(color = compartment_ODE)) + + geom_label_repel(data = subset(ODE_long_df, day == data_df$n_days), + aes(label = compartment_ODE, + color = compartment_ODE)) + ) } #' returns ggplot geom_line with optional ribbon around 20/80 central interval @@ -192,6 +209,7 @@ setup_run_df <- function(seed, n_pop, n_days) { # setup prediction columns template_df$d_in_interval <- NA_integer_ template_df$tweets_in_interval <- NA_integer_ + template_df$fit <- NA set.seed(template_df$seed) return(template_df) } diff --git a/stan/baseline.stan b/stan/baseline.stan new file mode 100644 index 0000000..211aeb5 --- /dev/null +++ b/stan/baseline.stan @@ -0,0 +1,196 @@ +/* +Modified from https://mc-stan.org/users/documentation/case-studies/boarding_school_case_study.html +*/ + +// #include /home/breck/git/codatmo/dataGeneratingProcess1/stan/ode_solvers.stan + +functions { + real[] sird(real t, real[] y, real[] theta, + real[] x_r, int[] x_i) { + + real S = y[1]; + real I = y[2]; + real R = y[3]; + real D = y[4]; + real N = x_i[1]; + + real beta = theta[1]; + real gamma = theta[2]; + real deathRate = theta[3]; + + real dS_dt = -beta * I * S / N; + real dI_dt = beta * I * S / N - gamma * I ; + real dR_dt = gamma * I - deathRate * R; + real dD_dt = deathRate * R; + + return {dS_dt, dI_dt, dR_dt, dD_dt}; + } + + real[,] block_sird(int days, real[] y, real[] theta, + real[] x_r, int[] x_i, int debug) { + real day_counts[days,4]; + real S = y[1]; + real I = y[2]; + real R = y[3]; + real D = y[4]; + real N = x_i[1]; + real beta = theta[1]; + real gamma = theta[2]; + real deathRate = theta[3]; + if (debug == 1) { + print("beta, gamma, deathRate=", theta); + print("0 SIRD=", y," sum=", sum(y)); + } + for (i in 1:days) { + real dS_dt = -beta * I * S / N; + real dI_dt = beta * I * S / N - gamma * I ; + real dR_dt = gamma * I - deathRate * R; + real dD_dt = deathRate * R; + if (debug ==1) { + print("dS_dt, dI_Dt, dR_dt, dD_dt=", {dS_dt, dI_dt, dR_dt, dD_dt}, + " sum=", sum({dS_dt, dI_dt, dR_dt, dD_dt})); + } + S = dS_dt + S; + I = dI_dt + I; + R = dR_dt + R; + D = dD_dt + D; + day_counts[i] = {S, I, R, D}; + if (debug == 1) { + print(i," SIRD=", day_counts[i], " sum=", sum(day_counts[i])); + } + } + return day_counts; + } + +} +data { + int n_days; + real sDay1; + real iDay1; + real rDay1; + real dDay1; + int NPop; + vector[n_days] tweets; + vector[n_days] deaths; + int compute_likelihood; + int run_twitter; + int run_block_ODE; + int run_rk45_ODE; + int scale; + int center; +} +transformed data { + real x_r[0]; //need for ODE function + int x_i[1] = { NPop }; //need for ODE function + int debug = 0; + int debug2 = 0; + int n_compartments = 4; + int sCompartment = 1; + int iCompartment = 2; + int rCompartment = 3; + int dCompartment = 4; + real ts[n_days]; + real meanDeaths = 0; + real meanTweets = 0; + real sdDeaths = 1; + real sdTweets = 1; + vector[n_days] deaths_munged = deaths; + vector[n_days] tweets_munged = tweets; + if (center == 1) { + meanDeaths = mean(deaths); + deaths_munged = deaths_munged - meanDeaths; + meanTweets = mean(tweets); + tweets_munged = tweets_munged - meanTweets; + } + if (scale == 1) { + sdDeaths = sd(deaths); + if (sdDeaths == 0) { + reject("Standard deviation of zero for deaths"); + } + deaths_munged = deaths_munged/sdDeaths; + if (sdTweets) + sdTweets = sd(tweets); + if (sdTweets == 0) { + reject("Standard deviation of zero for tweets"); + } + tweets_munged = tweets_munged/sdTweets; + } + ts[1] = 1.0; + for (i in 2:n_days) { + ts[i] = ts[i - 1] + 1; + } + print("tweets", tweets); + print("tweets_munged", tweets_munged); + print("deaths", deaths); + print("deaths_munged", deaths_munged); +} + +parameters { + real gamma; + real beta; + real deathRate; + real lambda_twitter; +} +transformed parameters{ + real compartmentStartValues[4] = {sDay1, iDay1, rDay1, dDay1}; + real y[n_days, 4]; + matrix[n_days, 4] daily_counts_ODE; + real theta[3]; + theta[1] = beta; + theta[2] = gamma; + theta[3] = deathRate; + if (run_rk45_ODE == 1 && run_block_ODE == 1) { + reject("cannot run both rk45 and block ODEs"); + } + if (run_rk45_ODE == 1 ) { + y = integrate_ode_rk45(sird, compartmentStartValues , 0.0, ts, theta, x_r, x_i); + } + if (run_block_ODE == 1) { + y = block_sird(n_days, compartmentStartValues, theta, x_r, x_i, debug); + } + daily_counts_ODE = to_matrix(y); +} + +model { + beta ~ normal(0, 1); + gamma ~ normal(0, 1); + deathRate ~ normal(0, 1); + lambda_twitter ~ normal(0,1); + if (compute_likelihood == 1) { + for (i in 1:n_days) { + deaths_munged[i] ~ normal(daily_counts_ODE[i, dCompartment], + 1); + if (run_twitter == 1) { + tweets_munged[i] ~ normal(lambda_twitter * + daily_counts_ODE[i, iCompartment], + 1); + } + } + } +} + +generated quantities { + real R0 = beta / gamma; + real recovery_time = 1 / gamma; + real pred_deaths[n_days]; + real pred_tweets[n_days]; + real pred_i[n_days]; + matrix[n_days, n_compartments] ode_states = daily_counts_ODE; + matrix[n_compartments, n_days] transposed_ode_states = daily_counts_ODE'; + row_vector[n_days] S = transposed_ode_states[sCompartment]; + row_vector[n_days] I = transposed_ode_states[iCompartment]; + row_vector[n_days] R = transposed_ode_states[rCompartment]; + row_vector[n_days] D = transposed_ode_states[dCompartment]; + + for (i in 1:n_days) { + if (run_twitter == 1) { + pred_tweets[i] = sdTweets * lambda_twitter * + daily_counts_ODE[i, dCompartment] + meanTweets; + } + else { + pred_tweets[i] = 0; + } + pred_deaths[i] = sdDeaths * daily_counts_ODE[i, iCompartment] + + meanDeaths; + } +} From b6d545c5ec298b6c8cbd01fad376d36e561a977e Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 1 Aug 2021 17:31:42 -0400 Subject: [PATCH 08/12] can simulate and print data on same config --- R/runEval.R | 17 +++++++++++------ R/sim_configs.R | 1 + R/util.R | 25 +++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/R/runEval.R b/R/runEval.R index f3ed06a..467f219 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -11,18 +11,20 @@ library(kableExtra) source(here::here("R","util.R")) source(here::here("R","SIRTDsim.R")) source(here::here("R","sim_configs.R")) +source(here::here("R", "data_configs.R")) source(here::here("R","modeling_configs.R")) # dependencies # setup run_df #run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R -run_df <- sim_brazil_1(run_df) # in R/sim_configs.R +brazil_sim_df <- sim_brazil_1(run_df) # in R/sim_configs.R +brazil_actual_df <- data_brazil_1(run_df) #draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R -#run_df <- rbind(run_df, draws_run_df) +run_df <- rbind(brazil_sim_df, brazil_actual_df) -run_df <- model_stan_baseline(run_df) #in R/modeling_configs.R -run_df$compute_likelihood <- 1 # compute likelihood across all runs -run_df$reports <- list(c('graph_sim', 'graph_tweets', 'graph_d', 'plot')) +#run_df <- model_stan_baseline(run_df) #in R/modeling_configs.R +#run_df$compute_likelihood <- 1 # compute likelihood across all runs +#run_df$reports <- list(c('graph_sim', 'graph_tweets', 'graph_d', 'plot')) #list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) # setup run_df # run models @@ -97,6 +99,9 @@ while (j < nrow(run_df)) { } # section 6 plot <- ggplot(data = NULL, aes(x = day, y = count)) + if ('graph_data' %in% unlist(run_df[j,]$reports)) { + plot <- graph_real_data(data_df = run_df[j,], plot = plot) + } if ('graph_sim' %in% unlist(run_df[j,]$reports)) { plot <- graph_sim_data(data_df = run_df[j,], hide_s = TRUE, plot = plot) } @@ -115,7 +120,7 @@ while (j < nrow(run_df)) { show_ribbon = TRUE) } plot = plot + theme(legend.position = "none") - if ('plot' %in% unlist(run_df[j,]$reports)) { + if (length(unlist(run_df[j,]$reports)) > 0) { print(plot) } # section 6 diff --git a/R/sim_configs.R b/R/sim_configs.R index 26e8445..3baed0e 100644 --- a/R/sim_configs.R +++ b/R/sim_configs.R @@ -101,6 +101,7 @@ sim_brazil_1 <- function (source_df) { run_df$t <- list(sim_df$t) run_df$d <- list(sim_df$d) run_df$tweets <- list(sim_df$tweets) + run_df$reports <- list(c('graph_sim', 'plot')) return(run_df) } #214,110,287 diff --git a/R/util.R b/R/util.R index 324ca4c..fefd09c 100644 --- a/R/util.R +++ b/R/util.R @@ -100,6 +100,21 @@ graph_sim_data <- function(data_df, hide_s, plot) { } +graph_real_data <- function(data_df, plot) { + real_data_df <- data.frame(count = unlist(data_df$d), + day = 1:data_df$n_days, + source = 'deaths') + return(plot + + geom_line(data = real_data_df, aes(y = count, + color = source), + size = .5) + + geom_label_repel(data = subset(real_data_df, + day == round(data_df$n_days/2)), + aes(label = source, + color = source))) +} + + #' Graph daily ODE means from SIRTD model. Returns a ggplot geom_line element #' @param data_df A row from run_df #' @param fit A fit object returned by cmdstanR with ode_states defined @@ -206,6 +221,16 @@ setup_run_df <- function(seed, n_pop, n_days) { template_df$d <- list(c()) template_df$tweets <- list(c()) + #needed for sims + template_df$beta_daily_rate <- NA + template_df$beta_mean <- NA + template_df$gamma <- NA + template_df$death_prob <- NA + template_df$tweet_rate <- NA + template_df$days2death <- NA + + template_df$reports <- NA + # setup prediction columns template_df$d_in_interval <- NA_integer_ template_df$tweets_in_interval <- NA_integer_ From 5e14fa801b89c99795cfc09782971985ffd0d830 Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 1 Aug 2021 18:39:25 -0400 Subject: [PATCH 09/12] got sim and actual data playing nicely at the same time --- R/modeling_configs.R | 1 + R/runEval.R | 13 +++++++------ R/util.R | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/modeling_configs.R b/R/modeling_configs.R index 303d050..c11ceda 100644 --- a/R/modeling_configs.R +++ b/R/modeling_configs.R @@ -11,6 +11,7 @@ model_stan_baseline <- function(run_df) { model_tweets_df$apply_twitter_data <- 1 model_tweets_df$description <- paste0(model_tweets_df$description, " use tweets") + model_tweets_df$compute_likelihood <- 1 combined_df = model_tweets_df # rbind(model_no_tweets_df, model_tweets_df) combined_df$ode_solver <- 'block' diff --git a/R/runEval.R b/R/runEval.R index 467f219..a9edea4 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -16,13 +16,14 @@ source(here::here("R","modeling_configs.R")) # dependencies # setup run_df #run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R -run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R -brazil_sim_df <- sim_brazil_1(run_df) # in R/sim_configs.R -brazil_actual_df <- data_brazil_1(run_df) +setup_run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in R/util.R +brazil_sim_df <- sim_brazil_1(setup_run_df) # in R/sim_configs.R +brazil_actual_df <- data_brazil_1(setup_run_df) #draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R -run_df <- rbind(brazil_sim_df, brazil_actual_df) +run_data_df <- rbind(brazil_sim_df, brazil_actual_df) -#run_df <- model_stan_baseline(run_df) #in R/modeling_configs.R +run_df <- model_stan_baseline(run_data_df) #in R/modeling_configs.R +run_df$use_tweets <- 0 #run_df$compute_likelihood <- 1 # compute likelihood across all runs #run_df$reports <- list(c('graph_sim', 'graph_tweets', 'graph_d', 'plot')) #list(c('graph_sim','graph_ODE', 'graph_tweets', 'graph_d', 'plot','param_recovery')) @@ -43,7 +44,7 @@ while (j < nrow(run_df)) { tweets = unlist(run_df[j,]$tweets), deaths = unlist(run_df[j,]$d), compute_likelihood = run_df[j,]$compute_likelihood, - run_twitter = run_df[j,]$apply_twitter_data, + run_twitter = run_df[j,]$use_tweets, run_block_ODE = ifelse(run_df[j,]$ode_solver == 'block', 1, 0), run_rk45_ODE = ifelse(run_df[j,]$ode_solver == 'rk45', 1, 0), scale = 1, diff --git a/R/util.R b/R/util.R index fefd09c..78ebe03 100644 --- a/R/util.R +++ b/R/util.R @@ -212,6 +212,7 @@ setup_run_df <- function(seed, n_pop, n_days) { # any model setup template_df$model_to_run <- 'none' template_df$compute_likelihood <- NA + template_df$use_tweets <- NA #setup data columns template_df$s <- list(c()) From be382c90d6174a298a382c2ff2640fdd22226087 Mon Sep 17 00:00:00 2001 From: Breck Date: Sun, 1 Aug 2021 18:48:54 -0400 Subject: [PATCH 10/12] graphing tweets --- R/util.R | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/util.R b/R/util.R index 78ebe03..91136a0 100644 --- a/R/util.R +++ b/R/util.R @@ -101,15 +101,19 @@ graph_sim_data <- function(data_df, hide_s, plot) { } graph_real_data <- function(data_df, plot) { - real_data_df <- data.frame(count = unlist(data_df$d), + real_deaths_data_df <- data.frame(count = unlist(data_df$d), day = 1:data_df$n_days, source = 'deaths') + real_tweets_data_df <- data.frame(count = unlist(data_df$tweets), + day = 1:data_df$n_days, + source = 'tweets') + real_data_df <- rbind(real_deaths_data_df,real_tweets_data_df) return(plot + - geom_line(data = real_data_df, aes(y = count, - color = source), - size = .5) + - geom_label_repel(data = subset(real_data_df, - day == round(data_df$n_days/2)), + geom_line(data = real_data_df, aes(y = count, + color = source), + size = .5) + + geom_label_repel(data = subset(real_data_df, + day == round(data_df$n_days/2)), aes(label = source, color = source))) } From 6cd92a8eecb87093fe394c7aaab04c2b3c6b80f8 Mon Sep 17 00:00:00 2001 From: Breck Date: Fri, 6 Aug 2021 10:30:20 -0400 Subject: [PATCH 11/12] no idea what chagnes are, just catching up, seems to run --- R/runEval.R | 5 +++-- stan/baseline.stan | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/R/runEval.R b/R/runEval.R index a9edea4..7f98685 100644 --- a/R/runEval.R +++ b/R/runEval.R @@ -1,6 +1,7 @@ #todo # serialize run_df?? json? -# run Brazil data with baseline to get params +# jupyter notebook with simulation sliders +# # # dependencies library(tidyverse) @@ -20,7 +21,7 @@ setup_run_df <- setup_run_df(seed = 93435, n_pop = 214110287, n_days = 300) # in brazil_sim_df <- sim_brazil_1(setup_run_df) # in R/sim_configs.R brazil_actual_df <- data_brazil_1(setup_run_df) #draws_run_df <- sim_draw_params(2, run_df) # in R/sim_configs.R -run_data_df <- rbind(brazil_sim_df, brazil_actual_df) +run_data_df <- brazil_actual_df #rbind(brazil_sim_df, brazil_actual_df) run_df <- model_stan_baseline(run_data_df) #in R/modeling_configs.R run_df$use_tweets <- 0 diff --git a/stan/baseline.stan b/stan/baseline.stan index 211aeb5..1c3ec48 100644 --- a/stan/baseline.stan +++ b/stan/baseline.stan @@ -61,7 +61,6 @@ functions { } return day_counts; } - } data { int n_days; @@ -154,7 +153,7 @@ transformed parameters{ model { beta ~ normal(0, 1); gamma ~ normal(0, 1); - deathRate ~ normal(0, 1); + deathRate ~ normal(0, .01); lambda_twitter ~ normal(0,1); if (compute_likelihood == 1) { for (i in 1:n_days) { @@ -174,8 +173,7 @@ generated quantities { real recovery_time = 1 / gamma; real pred_deaths[n_days]; real pred_tweets[n_days]; - real pred_i[n_days]; - matrix[n_days, n_compartments] ode_states = daily_counts_ODE; + // matrix[n_days, n_compartments] ode_states = daily_counts_ODE; matrix[n_compartments, n_days] transposed_ode_states = daily_counts_ODE'; row_vector[n_days] S = transposed_ode_states[sCompartment]; row_vector[n_days] I = transposed_ode_states[iCompartment]; From 6d3280a61bbfde4ce733120a9ad5cdcc06f9ed82 Mon Sep 17 00:00:00 2001 From: Breck Date: Fri, 6 Aug 2021 17:16:46 -0400 Subject: [PATCH 12/12] got basic SIRTD working with sliders --- jupyter/simpleDGP.ipynb | 117 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 jupyter/simpleDGP.ipynb diff --git a/jupyter/simpleDGP.ipynb b/jupyter/simpleDGP.ipynb new file mode 100644 index 0000000..120151f --- /dev/null +++ b/jupyter/simpleDGP.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 31, + "id": "cf989776", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0596e2eec4f14b81b0a8cb4f4c8f6340", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(IntSlider(value=2000, description='n_pop', max=6000, min=-2000), IntSlider(value=10, des…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from ipywidgets import interact, interactive, fixed\n", + "import ipywidgets as widgets\n", + "\n", + "@interact(n_pop=2000, n_days=10, print_daily=True, beta_daily_inf_rate=.3,\n", + " gamma_res_per_day=.31, n_patient_zero=10, mean_days_infectuous=10, death_prob= .01, mean_days_to_death=10)\n", + "def sirtd_exact_sim(n_pop, n_days, print_daily, beta_daily_inf_rate, gamma_res_per_day, n_patient_zero, \n", + " mean_days_infectuous, death_prob, mean_days_to_death):\n", + " S = [n_pop - n_patient_zero]\n", + " I = [n_patient_zero]\n", + " R = [0]\n", + " T = [0]\n", + " D = [0]\n", + " tweets = [0]\n", + " for today in range(1,n_days):\n", + " yest = today - 1\n", + " tot = S[yest] + I[yest] + R[yest] + T[yest] + D[yest]\n", + " if print_daily:\n", + " print(f\"day={yest}, S={S[yest]}, I={I[yest]}, R={R[yest]}, T={T[yest]}, D={D[yest]}, tot={tot}\")\n", + " S.append(S[yest] - beta_daily_inf_rate * S[yest] * (I[yest] / n_pop))\n", + " I.append(I[yest] + beta_daily_inf_rate * S[yest] * (I[yest] / n_pop) - \n", + " (1 / mean_days_infectuous) * I[yest])\n", + " R.append(R[yest] + (1 / mean_days_infectuous) * (I[yest] * (1 - death_prob)))\n", + " T.append(T[yest] + (1 / mean_days_infectuous) * (I[yest] * death_prob) -\n", + " (1 / mean_days_to_death) * T[yest])\n", + " D.append(D[yest] + (1 / mean_days_to_death) * T[yest])\n", + " \n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e881660d", + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "list assignment index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mfoo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mfoo\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m: list assignment index out of range" + ] + } + ], + "source": [ + "foo = [1,2]\n", + "foo[3] = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "663bffc0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9]\n" + ] + } + ], + "source": [ + "print(list(range(1,10)))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}