For many of the direct forecasters, the forecast is strictly decreasing, even in the middle of the surge. This effect is most prominent in flu, but occurs to some extent in covid. We need to track down the source of this; it is some combination of the data and the models used.
Roughly, the conclusion after having done these experiments is that an over-representation of decreasing time_values is in fact causing this behavior (especially apparent when fitting simple linear models to the data). Some possible fixes are explored below.
This notebook depends on having successfully run the flu_hosp_explore targets pipeline to handle the creation of the basic dataset.
# Assumed setup: the packages used throughout this notebook
library(targets)
library(tidyverse)
library(epiprocess)
library(epipredict)
library(epidatasets)
library(quantreg)
Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
tar_make(hhs_archive)
hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
train_value_min <- "2002-06-01"
hhs_archive %>%
epix_as_of_current() %>%
filter(time_value > "2023-10-01") %>%
autoplot(hhs)
To keep the run time down, we'll limit to a single forecast date just after the peak of the rate of growth, so that ~everywhere is increasing. Here's a table of the number of locations that have their maximum growth rate in any given week:
hhs_gr <- hhs_archive %>%
epix_as_of_current() %>%
group_by(geo_value) %>%
mutate(gr_hhs = growth_rate(hhs)) %>%
filter(time_value > "2023-10-01")
hhs_gr %>%
arrange(gr_hhs) %>%
drop_na() %>%
slice_max(gr_hhs) %>%
ungroup() %>%
group_by(time_value) %>%
summarize(nn = length(hhs)) %>%
print(n=16)
## # A tibble: 8 × 2
## time_value nn
## <date> <int>
## 1 2023-10-04 2
## 2 2023-10-18 1
## 3 2023-10-25 3
## 4 2023-11-01 5
## 5 2023-11-08 14
## 6 2023-11-15 18
## 7 2023-11-22 7
## 8 2023-12-06 3
Since most locations have their largest growth rate on the 15th, let's choose 11/29 as our forecast_date, to make sure there's some trend for the forecasters to pick up on.
forecast_date <- as.Date("2023-11-29")
hhs_gr %>% autoplot(gr_hhs) +
geom_vline(aes(xintercept = forecast_date), lty = 2) +
labs(title = "growth rates")
A plot to confirm that most locations are still increasing on the 29th. Note that this is RATE DATA, and not count data, so pop_scaling = FALSE should be the default.
Since we don’t really need to run the full pipeline to get forecasts from a single day and forecaster, we build a couple of functions for inspecting forecasts.
forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 * 7) {
all_forecasts <- map(aheads, \(ahead) forecaster(epi_data, ahead)) %>% list_rbind()
all_forecasts
}
Here’s a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.
plot_dec_forecasts <- function(all_forecasts,
geo_values,
data_archive = hhs_archive,
earliest_truth_data = NULL) {
if (is.null(earliest_truth_data)) {
earliest_truth_data <- all_forecasts$forecast_date[[1]] - as.difftime(365, units = "days")
}
as_of_plotting <- data_archive %>%
epix_as_of(min(all_forecasts$forecast_date)) %>%
filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
as_tibble() %>%
mutate(forecast_date = time_value) %>%
filter(time_value >= earliest_truth_data)
# transform the archive to something useful for comparison
finalized_plotting <- data_archive %>%
epix_as_of_current() %>%
filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
as_tibble() %>%
mutate(forecast_date = time_value) %>%
filter(time_value >= earliest_truth_data)
all_forecasts %>%
filter(geo_value %in% geo_values) %>%
pivot_wider(names_from = quantile, values_from = value) %>%
ggplot(aes(x = target_end_date, group = geo_value, fill = forecast_date)) +
geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), alpha = 0.4) +
geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), alpha = 0.6) +
geom_line(aes(y = `0.5`, color = forecast_date)) +
geom_line(data = as_of_plotting, aes(x = time_value, y = hhs), color = "grey") +
geom_line(data = finalized_plotting, aes(x = time_value, y = hhs), color = "black") +
facet_wrap(~geo_value, scale = "free") +
theme(legend.position = "none")
}
And a method to inspect whether things are increasing that isn’t just the eyeball norm on a few of them. This calculates growth rates for each quantile and each location.
get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, ...) {
if (is.null(quantiles)) {
quantiles <- forecasts$quantile %>% unique()
}
forecasts %>%
group_by(geo_value, quantile) %>%
filter(min(value) != max(value), quantile %in% quantiles) %>%
mutate(growth = growth_rate(value, ...)) %>%
filter(abs(growth) < outlier_bound)
}
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_forecasts, default_geos)
All the forecasts are going down rather than up, even though they have multiple weeks of increasing data to train on! More quantitatively, let's compute the growth rate of the median forecast for each ahead, across all geos, and then look at a histogram of that:
basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)
## # A tibble: 269 × 6
## # Groups: geo_value, quantile [54]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 as 2023-11-29 2023-12-06 0.5 0.00691 1.57
## 2 as 2023-11-29 2023-12-13 0.5 0.0219 0.792
## 3 as 2023-11-29 2023-12-20 0.5 0.0403 0.495
## 4 as 2023-11-29 2023-12-27 0.5 0.0615 0.356
## 5 nh 2023-11-29 2023-12-27 0.5 0.0768 0.0834
## 6 nh 2023-11-29 2023-12-20 0.5 0.0688 0.0781
## 7 nh 2023-11-29 2023-12-13 0.5 0.0663 0.0590
## 8 nh 2023-11-29 2023-12-06 0.5 0.0598 0.0241
## 9 nh 2023-11-29 2023-11-29 0.5 0.0636 -0.00158
## 10 ri 2023-11-29 2023-11-29 0.5 0.266 -0.0439
## 11 ri 2023-11-29 2023-12-06 0.5 0.257 -0.0459
## 12 ri 2023-11-29 2023-12-13 0.5 0.233 -0.0482
## 13 ri 2023-11-29 2023-12-20 0.5 0.235 -0.0506
## 14 ri 2023-11-29 2023-12-27 0.5 0.219 -0.0533
## 15 ct 2023-11-29 2023-11-29 0.5 0.914 -0.0589
## 16 dc 2023-11-29 2023-11-29 0.5 1.19 -0.0609
## 17 de 2023-11-29 2023-12-06 0.5 0.181 -0.0611
## 18 de 2023-11-29 2023-12-13 0.5 0.171 -0.0649
## 19 de 2023-11-29 2023-11-29 0.5 0.194 -0.0662
## 20 ut 2023-11-29 2023-11-29 0.5 0.296 -0.0696
## 21 sd 2023-11-29 2023-12-20 0.5 0.162 -0.0705
## 22 sd 2023-11-29 2023-12-13 0.5 0.172 -0.0720
## 23 oh 2023-11-29 2023-11-29 0.5 0.606 -0.0737
## 24 me 2023-11-29 2023-11-29 0.5 1.42 -0.0764
## 25 ut 2023-11-29 2023-12-06 0.5 0.278 -0.0782
## 26 co 2023-11-29 2023-11-29 0.5 2.31 -0.0788
## 27 wa 2023-11-29 2023-11-29 0.5 0.448 -0.0807
## 28 mi 2023-11-29 2023-11-29 0.5 0.846 -0.0809
## 29 mn 2023-11-29 2023-11-29 0.5 0.475 -0.0817
## 30 nc 2023-11-29 2023-11-29 0.5 1.44 -0.0819
## # ℹ 239 more rows
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
There are a few exceptional locations with actually positive growth rates, but the vast majority of aheads and geos have negative growth rates.
If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training = 3, pop_scaling = FALSE))
plot_dec_forecasts(all_short_forecasts, default_geos)
They’re pretty jittery, but strictly decreasing they are not. And the corresponding growth rates:
short_gr <- get_growth_rates(all_short_forecasts, quantiles = 0.5, method = "smooth_spline")
short_gr %>%
arrange(growth) %>%
ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
So most aheads and locations have a positive growth rate, with some strong positive outliers and a fair number decreasing. This generally fits with the jittery but positive nature of the example location plots.
First, confirming that it happens for simple linear regression and not just quantile regression.
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_linear_forecasts, default_geos)
Which appears to also have this problem (and very narrow quantiles in some locations).
basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>%
arrange(desc(growth)) %>%
print(n = 30)
## # A tibble: 270 × 6
## # Groups: geo_value, quantile [54]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 as 2023-11-29 2023-11-29 0.5 0.0659 1.24
## 2 ak 2023-11-29 2023-12-20 0.5 0.0890 1.03
## 3 ak 2023-11-29 2023-12-27 0.5 0.360 1.00
## 4 as 2023-11-29 2023-12-06 0.5 0.149 0.583
## 5 nh 2023-11-29 2023-11-29 0.5 0.115 0.549
## 6 nh 2023-11-29 2023-12-06 0.5 0.182 0.404
## 7 as 2023-11-29 2023-12-13 0.5 0.239 0.373
## 8 nh 2023-11-29 2023-12-13 0.5 0.260 0.292
## 9 de 2023-11-29 2023-11-29 0.5 0.258 0.244
## 10 as 2023-11-29 2023-12-20 0.5 0.323 0.241
## 11 de 2023-11-29 2023-12-06 0.5 0.322 0.207
## 12 sd 2023-11-29 2023-11-29 0.5 0.266 0.189
## 13 nh 2023-11-29 2023-12-20 0.5 0.328 0.184
## 14 ri 2023-11-29 2023-11-29 0.5 0.377 0.183
## 15 as 2023-11-29 2023-12-27 0.5 0.396 0.177
## 16 ut 2023-11-29 2023-11-29 0.5 0.376 0.170
## 17 sd 2023-11-29 2023-12-06 0.5 0.318 0.169
## 18 de 2023-11-29 2023-12-13 0.5 0.389 0.161
## 19 ri 2023-11-29 2023-12-06 0.5 0.454 0.149
## 20 sd 2023-11-29 2023-12-13 0.5 0.373 0.147
## 21 or 2023-11-29 2023-12-06 0.5 0.317 0.138
## 22 nh 2023-11-29 2023-12-27 0.5 0.383 0.137
## 23 ut 2023-11-29 2023-12-06 0.5 0.438 0.136
## 24 or 2023-11-29 2023-12-13 0.5 0.365 0.127
## 25 or 2023-11-29 2023-11-29 0.5 0.279 0.127
## 26 ri 2023-11-29 2023-12-13 0.5 0.512 0.122
## 27 ne 2023-11-29 2023-11-29 0.5 0.490 0.116
## 28 sd 2023-11-29 2023-12-20 0.5 0.425 0.112
## 29 ri 2023-11-29 2023-12-20 0.5 0.576 0.106
## 30 ut 2023-11-29 2023-12-13 0.5 0.494 0.104
## # ℹ 240 more rows
There's at least a good number of locations/aheads that have a positive growth rate.
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
But the majority are negative.
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
all_boost_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = boost_tree(mode = "regression"), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_boost_forecasts, default_geos)
Boosted trees don’t have the problem? Mostly? The forecasts aren’t great, but at least they’re not plummeting. The quantiles are garbage, but that’s kind of to be expected with residual quantiles on a non-linear method.
basic_gr <- get_growth_rates(all_boost_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
all_one_lag_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, lag = 0))
all_one_lag_forecasts
## # A tibble: 6,210 × 5
## geo_value forecast_date target_end_date quantile value
## <chr> <date> <date> <dbl> <dbl>
## 1 ak 2023-11-29 2023-11-29 0.01 0.369
## 2 ak 2023-11-29 2023-11-29 0.025 0.593
## 3 ak 2023-11-29 2023-11-29 0.05 0.790
## 4 ak 2023-11-29 2023-11-29 0.1 0.983
## 5 ak 2023-11-29 2023-11-29 0.15 1.12
## 6 ak 2023-11-29 2023-11-29 0.2 1.22
## # ℹ 6,204 more rows
plot_dec_forecasts(all_one_lag_forecasts, default_geos)
Still constantly falling, unfortunately, and a surprisingly similar forecast.
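To put a rough number on "surprisingly similar", here's a quick check (a sketch using the objects defined above) of the median absolute gap between the two sets of forecasts:
all_one_lag_forecasts %>%
  left_join(all_forecasts,
    by = join_by(geo_value, forecast_date, target_end_date, quantile),
    suffix = c("_one", "_full")
  ) %>%
  summarize(median_abs_diff = median(abs(value_one - value_full), na.rm = TRUE))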
The first 2 years are bad for flu, as there was ~no spread. In practice we know this behavior still happens when we remove those years, and even the summers. I'm including this primarily to compare the change in growth rates.
hhs_forecast_recent <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > "2022-06-01")
all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast_recent)
plot_dec_forecasts(all_recent_forecasts, default_geos)
Which is still decreasing.
basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)
## # A tibble: 270 × 6
## # Groups: geo_value, quantile [54]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 as 2023-11-29 2023-11-29 0.5 0.0326 1.01
## 2 as 2023-11-29 2023-12-06 0.5 0.0658 0.516
## 3 as 2023-11-29 2023-12-13 0.5 0.103 0.406
## 4 as 2023-11-29 2023-12-20 0.5 0.146 0.280
## 5 nh 2023-11-29 2023-12-06 0.5 0.104 0.239
## 6 nh 2023-11-29 2023-12-13 0.5 0.133 0.221
## 7 nh 2023-11-29 2023-11-29 0.5 0.0843 0.207
## 8 as 2023-11-29 2023-12-27 0.5 0.182 0.184
## 9 nh 2023-11-29 2023-12-20 0.5 0.160 0.154
## 10 nh 2023-11-29 2023-12-27 0.5 0.183 0.124
## 11 ak 2023-11-29 2023-12-27 0.5 0.491 0.0819
## 12 de 2023-11-29 2023-11-29 0.5 0.222 0.0409
## 13 de 2023-11-29 2023-12-06 0.5 0.231 0.0402
## 14 de 2023-11-29 2023-12-13 0.5 0.239 0.0401
## 15 de 2023-11-29 2023-12-20 0.5 0.251 0.0161
## 16 sd 2023-11-29 2023-11-29 0.5 0.236 0.0159
## 17 sd 2023-11-29 2023-12-06 0.5 0.237 0.0156
## 18 sd 2023-11-29 2023-12-13 0.5 0.239 0.0154
## 19 sd 2023-11-29 2023-12-20 0.5 0.249 0.0152
## 20 sd 2023-11-29 2023-12-27 0.5 0.249 0.0149
## 21 ri 2023-11-29 2023-11-29 0.5 0.338 0.00584
## 22 ri 2023-11-29 2023-12-06 0.5 0.347 0.00581
## 23 ri 2023-11-29 2023-12-13 0.5 0.332 0.00577
## 24 ri 2023-11-29 2023-12-20 0.5 0.354 0.00574
## 25 ri 2023-11-29 2023-12-27 0.5 0.344 0.00571
## 26 or 2023-11-29 2023-12-13 0.5 0.243 0.00286
## 27 de 2023-11-29 2023-12-27 0.5 0.247 -0.00715
## 28 or 2023-11-29 2023-12-20 0.5 0.243 -0.0110
## 29 ut 2023-11-29 2023-11-29 0.5 0.336 -0.0135
## 30 ut 2023-11-29 2023-12-06 0.5 0.337 -0.0157
## # ℹ 240 more rows
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
which are more or less the same level of bad.
### Linear only
However, if we filter to 2022 onwards and use a linear engine:
hhs_forecast_recent <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > "2022-06-01")
all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, trainer = linear_reg()), epi_data = hhs_forecast_recent)
plot_dec_forecasts(all_recent_forecasts, default_geos)
Some are actually increasing, while some are decreasing.
basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)
## # A tibble: 270 × 6
## # Groups: geo_value, quantile [54]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 as 2023-11-29 2023-11-29 0.5 0.134 1.29
## 2 nh 2023-11-29 2023-11-29 0.5 0.179 0.854
## 3 ak 2023-11-29 2023-12-27 0.5 0.555 0.774
## 4 ak 2023-11-29 2023-12-20 0.5 0.219 0.683
## 5 as 2023-11-29 2023-12-06 0.5 0.313 0.604
## 6 nh 2023-11-29 2023-12-06 0.5 0.339 0.516
## 7 de 2023-11-29 2023-11-29 0.5 0.324 0.470
## 8 sd 2023-11-29 2023-11-29 0.5 0.333 0.416
## 9 as 2023-11-29 2023-12-13 0.5 0.510 0.386
## 10 ri 2023-11-29 2023-11-29 0.5 0.460 0.362
## 11 or 2023-11-29 2023-11-29 0.5 0.341 0.357
## 12 nh 2023-11-29 2023-12-13 0.5 0.524 0.350
## 13 de 2023-11-29 2023-12-06 0.5 0.481 0.347
## 14 ut 2023-11-29 2023-11-29 0.5 0.446 0.344
## 15 sd 2023-11-29 2023-12-06 0.5 0.476 0.318
## 16 or 2023-11-29 2023-12-06 0.5 0.470 0.301
## 17 ne 2023-11-29 2023-11-29 0.5 0.567 0.269
## 18 ut 2023-11-29 2023-12-06 0.5 0.601 0.263
## 19 ri 2023-11-29 2023-12-06 0.5 0.631 0.261
## 20 de 2023-11-29 2023-12-13 0.5 0.654 0.257
## 21 md 2023-11-29 2023-11-29 0.5 0.433 0.255
## 22 nd 2023-11-29 2023-12-06 0.5 0.450 0.252
## 23 sd 2023-11-29 2023-12-13 0.5 0.634 0.249
## 24 as 2023-11-29 2023-12-20 0.5 0.697 0.248
## 25 or 2023-11-29 2023-12-13 0.5 0.620 0.241
## 26 id 2023-11-29 2023-12-06 0.5 0.576 0.231
## 27 md 2023-11-29 2023-12-06 0.5 0.549 0.230
## 28 mn 2023-11-29 2023-11-29 0.5 0.646 0.225
## 29 nh 2023-11-29 2023-12-20 0.5 0.693 0.222
## 30 wa 2023-11-29 2023-11-29 0.5 0.574 0.217
## # ℹ 240 more rows
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
Overall, more are now positive than they were in the totally unfiltered case, but there’s still a number of unreasonably negative values.
First let's examine the coefficients that are actually fit; doing that from within scaled_pop would involve a browser(). For the sake of reproducibility, we will instead rebuild the steps by hand. Note that I've tried this section both with and without filtering out pre-2022 values, and the results are approximately the same.
Filtering the data:
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
linear_get_preproc <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast, to_scale = FALSE) {
preproc <- epi_recipe(in_data)
if (to_scale) {
preproc <- preproc %>%
step_population_scaling(
hhs,
df = epidatasets::state_census,
df_pop_col = "pop",
create_new = FALSE,
rate_rescaling = 1e5,
by = c("geo_value" = "abbr")
)
}
preproc <- preproc %>%
step_adjust_latency(hhs, method = "extend_lags") %>%
step_epi_lag(hhs, lag = lag) %>%
step_epi_ahead(hhs, ahead = ahead) %>%
step_epi_naomit()
preproc
}
linear_get_workflow <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast) {
preproc <- linear_get_preproc(ahead, lag, in_data)
postproc <- frosting() %>%
layer_predict() %>%
layer_residual_quantiles(quantile_levels = covidhub_probs()) %>%
layer_threshold() %>%
layer_naomit() %>%
layer_add_target_date() %>%
layer_add_forecast_date()
workflow <- epi_workflow(preproc, linear_reg()) %>%
fit(in_data) %>%
add_frosting(postproc)
workflow
}
all_workflows <- map(0:4 * 7, linear_get_workflow)
Starting with the largest ahead, the coefficients are
workflows::extract_fit_parsnip(all_workflows[[5]])
## parsnip model object
##
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) lag_7_hhs lag_14_hhs lag_21_hhs
## 0.3961 0.9700 -0.4993 -0.1464
So the intercept is actually positive (so it's not inherently biased towards decreasing), but the coefficients for two of the lags are negative. Even setting those aside, the lag_7_hhs coefficient is less than one, and all three coefficients together sum to roughly 0.32, so for all but the smallest values the prediction will de facto fall below the most recent value.
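As a quick sanity check of that claim, here is a small sketch (using the coefficients printed above) of what this model predicts for a signal that has been flat at level v; the prediction dips below v once v exceeds about 0.59:
# Sketch: 28-day-ahead prediction for a signal constant at level v,
# using the printed coefficients
coefs <- c(lag_7 = 0.9700, lag_14 = -0.4993, lag_21 = -0.1464)
intercept <- 0.3961
tibble(v = c(0.25, 0.5, 1, 2, 4)) %>%
  mutate(prediction = intercept + sum(coefs) * v, decreasing = prediction < v)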
How about the zero ahead? Thanks to latency this is actually a one-week-forward projection, so we can't expect exactly lag_7_hhs to be one and everything else zero.
workflows::extract_fit_parsnip(all_workflows[[1]])
## parsnip model object
##
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) lag_7_hhs lag_14_hhs lag_21_hhs
## 0.06587 1.22099 -0.20689 -0.12491
But it is still surprisingly close to just passing through the lag_7_hhs value. If the signal had been constant so far, though, it would still predict a decrease: the coefficients sum to about 1.22 - 0.21 - 0.12 = 0.89, thanks to the negative lag_14_hhs and lag_21_hhs coefficients.
Given that something strange is going on with the data that we're fitting, it is worth plotting the data as it is seen by the linear regressor. Since visualizing a 4D vector is a pain, let's start with fitting just the lag = 0 case, which still has similar behavior (and is in fact the dominant coefficient above anyways). The way to get the data is using prep and bake:
preproc <- linear_get_preproc(0:4 * 7, lag = 0)
fit_data_long <-
preproc %>%
prep(hhs_forecast) %>%
bake(hhs_forecast) %>%
select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
drop_na(lag_7_hhs, value) %>%
mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
arrange(geo_value, time_value, ahead_value) %>%
mutate(epi_week = epiweek(time_value))
The fit for 28 days ahead is
all_single_workflows <- map(0:4 * 7, \(ahead) linear_get_workflow(ahead, lag = 0))
workflows::extract_fit_parsnip(all_single_workflows[[5]])
## parsnip model object
##
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) lag_7_hhs
## 0.3578 0.3891
which is positive but less than one, and thus predicts a decrease. Since the data as seen by this regression is 1D, we can plot it against the target ahead, and then overlay the regression.
plot_linear_data <- function(fit_data_long) {
fit_data_long %>%
ggplot(aes(x = lag_7_hhs, y = value)) +
geom_point(aes(color = epi_week), alpha = 1, size = 0.7) +
geom_smooth(method = "lm", formula = y ~ x) +
geom_abline(intercept = 0, slope = 1) +
facet_wrap(~ahead_value, scales = "free") +
scale_color_viridis_c() +
scale_fill_viridis_c(option = "magma", trans = "log10")
}
fit_data_long %>%
plot_linear_data()
All of which have slope less than 1. There is a stronger correlation between the smaller aheads and the value, and there the slope is much closer to one. Coloring by the epi week gives us some idea of the problem: there is a large mass of values with a low slope and a low epi_week. If we crudely just cut out everything with an epi_week below 10:
fit_data_long %>%
filter(epi_week > 10) %>%
plot_linear_data()
which is better but still below one. For some reason most of the remaining points egregiously below the diagonal are in epi_weeks very near 50. If we also filter out the last 4 weeks:
fit_data_long %>%
filter(epi_week > 10, epi_week < 48) %>%
plot_linear_data()
Then suddenly the slopes can be quite large! This is of course not a particularly principled way of selecting training data.
I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log-log graph. That fit is significantly closer to the diagonal, though the implied growth is still negative.
fit_data_long %>%
ggplot(aes(x = lag_7_hhs, y = value)) +
geom_hex(alpha = 0.5, bins = 30) +
geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
geom_smooth(method = "lm", formula = y ~ x) +
geom_abline(intercept = 0, slope = 1) +
scale_y_log10() +
scale_x_log10() +
facet_wrap(~ahead_value, scales = "free") +
scale_color_viridis_c() +
scale_fill_viridis_c(option = "magma", trans = "log10")
Constantly decreasing forecasters have been less of an issue in covid, so we should do a comparison. Since we're assuming the project is flu_hosp_explore, we have to directly access the covid archive. Covid is in counts, so first we convert to rates.
hhs_covid_archive <-
qs2::qs_read(here::here("covid_hosp_explore/objects/hhs_archive"))$DT %>%
filter(time_value > train_value_min) %>%
left_join(state_census, by = join_by(geo_value == abbr)) %>%
mutate(value = value / pop * 1e5) %>%
select(-fips, -name, -pop) %>%
rename(hhs = value) %>%
as_epi_archive()
hhs_covid_forecast <- hhs_covid_archive %>% epix_as_of(forecast_date)
hhs_covid_forecast %>% autoplot(hhs)
Forecasting using the same methods as in the original problem:
all_covid_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_covid_forecast)
plot_dec_forecasts(all_covid_forecasts, default_geos, hhs_covid_archive)
Still sort of present! And does it show up in a histogram of growth rates?
basic_gr <- get_growth_rates(all_covid_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)
## # A tibble: 275 × 6
## # Groups: geo_value, quantile [55]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 as 2023-11-29 2023-11-29 0.5 0.184 2.32
## 2 vi 2023-11-29 2023-11-29 0.5 0.184 2.32
## 3 as 2023-11-29 2023-12-06 0.5 0.620 0.732
## 4 vi 2023-11-29 2023-12-06 0.5 0.620 0.732
## 5 as 2023-11-29 2023-12-13 0.5 1.12 0.506
## 6 vi 2023-11-29 2023-12-13 0.5 1.12 0.506
## 7 as 2023-11-29 2023-12-20 0.5 1.76 0.391
## 8 vi 2023-11-29 2023-12-20 0.5 1.76 0.391
## 9 as 2023-11-29 2023-12-27 0.5 2.46 0.292
## 10 vi 2023-11-29 2023-12-27 0.5 2.46 0.292
## 11 ak 2023-11-29 2023-12-20 0.5 2.23 0.237
## 12 ak 2023-11-29 2023-12-27 0.5 2.76 0.192
## 13 ak 2023-11-29 2023-12-13 0.5 1.77 0.176
## 14 wa 2023-11-29 2023-12-27 0.5 3.89 0.108
## 15 wa 2023-11-29 2023-12-20 0.5 3.49 0.104
## 16 pr 2023-11-29 2023-11-29 0.5 3.85 0.0937
## 17 fl 2023-11-29 2023-12-20 0.5 4.08 0.0858
## 18 pr 2023-11-29 2023-12-06 0.5 4.28 0.0845
## 19 ms 2023-11-29 2023-11-29 0.5 3.97 0.0829
## 20 ga 2023-11-29 2023-12-20 0.5 4.13 0.0822
## 21 wa 2023-11-29 2023-12-13 0.5 3.18 0.0812
## 22 fl 2023-11-29 2023-12-27 0.5 4.43 0.0781
## 23 ms 2023-11-29 2023-12-06 0.5 4.36 0.0765
## 24 pr 2023-11-29 2023-12-13 0.5 4.58 0.0761
## 25 ga 2023-11-29 2023-12-27 0.5 4.47 0.0745
## 26 la 2023-11-29 2023-12-20 0.5 4.40 0.0712
## 27 ms 2023-11-29 2023-12-13 0.5 4.61 0.0711
## 28 pr 2023-11-29 2023-12-20 0.5 4.96 0.0695
## 29 la 2023-11-29 2023-12-27 0.5 4.72 0.0692
## 30 fl 2023-11-29 2023-12-13 0.5 3.76 0.0687
## # ℹ 245 more rows
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
On average the growth rate is positive, but really at this point ~none of the forecasts should have a negative median slope. Let's take a look at what the forecaster is actually seeing:
preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
fit_covid_data <- preproc %>%
prep(hhs_covid_forecast) %>%
bake(hhs_covid_forecast) %>%
select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
fit_covid_data %>%
mutate(epi_week = epiweek(time_value)) %>%
pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
plot_linear_data()
Which, other than some massive outliers, has similar trends (the slope decreases as we increase the ahead, along with the spread).
And if we cut out the early epi_weeks?
fit_covid_data %>%
mutate(epi_week = epiweek(time_value)) %>%
filter(epi_week > 10) %>%
pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
plot_linear_data()
Which is marginally better. The problematic data is less clearly tied to particular dates here than in the case of flu; partly this is because the data covers a much larger range of values, and partly I suspect it's because covid is less seasonal.
Some other investigations follow.
This was an accidental find, but if we population-scale the already population-scaled data, the problem mostly goes away.
hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_linear_scaled_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_linear_scaled_forecasts, default_geos)
And just like that the problem is gone?
basic_gr <- get_growth_rates(all_linear_scaled_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>%
arrange(growth) %>%
print(n = 30)
## # A tibble: 268 × 6
## # Groups: geo_value, quantile [54]
## geo_value forecast_date target_end_date quantile value growth
## <chr> <date> <date> <dbl> <dbl> <dbl>
## 1 ak 2023-11-29 2023-12-13 0.5 0.112 -1.90
## 2 ak 2023-11-29 2023-12-06 0.5 0.464 -1.09
## 3 ak 2023-11-29 2023-11-29 0.5 1.08 -0.623
## 4 nd 2023-11-29 2023-12-27 0.5 0.181 -0.267
## 5 hi 2023-11-29 2023-12-27 0.5 0.933 -0.257
## 6 nd 2023-11-29 2023-12-20 0.5 0.235 -0.211
## 7 wy 2023-11-29 2023-12-27 0.5 2.49 -0.210
## 8 vt 2023-11-29 2023-12-27 0.5 0.231 -0.209
## 9 dc 2023-11-29 2023-12-27 0.5 0.887 -0.201
## 10 hi 2023-11-29 2023-12-20 0.5 1.19 -0.194
## 11 pr 2023-11-29 2023-12-13 0.5 2.24 -0.188
## 12 pr 2023-11-29 2023-12-20 0.5 1.84 -0.185
## 13 nd 2023-11-29 2023-12-13 0.5 0.267 -0.177
## 14 vt 2023-11-29 2023-12-20 0.5 0.280 -0.174
## 15 wy 2023-11-29 2023-12-20 0.5 3.02 -0.173
## 16 pr 2023-11-29 2023-12-06 0.5 2.66 -0.167
## 17 vt 2023-11-29 2023-12-13 0.5 0.326 -0.156
## 18 pr 2023-11-29 2023-11-29 0.5 3.13 -0.155
## 19 nd 2023-11-29 2023-12-06 0.5 0.327 -0.154
## 20 pr 2023-11-29 2023-12-27 0.5 1.57 -0.152
## 21 hi 2023-11-29 2023-12-13 0.5 1.39 -0.148
## 22 vt 2023-11-29 2023-12-06 0.5 0.382 -0.142
## 23 dc 2023-11-29 2023-12-20 0.5 1.06 -0.141
## 24 nd 2023-11-29 2023-11-29 0.5 0.382 -0.134
## 25 la 2023-11-29 2023-12-27 0.5 1.96 -0.129
## 26 vt 2023-11-29 2023-11-29 0.5 0.436 -0.125
## 27 wy 2023-11-29 2023-12-13 0.5 3.50 -0.118
## 28 hi 2023-11-29 2023-12-06 0.5 1.60 -0.114
## 29 co 2023-11-29 2023-12-27 0.5 2.12 -0.106
## 30 me 2023-11-29 2023-12-20 0.5 1.30 -0.106
## # ℹ 238 more rows
There’s a mix of growth and decay.
basic_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
The majority of growth rates are positive; there’s some amount of negative predictions, but that is to be expected.
There appears to be something about fitting quantiles specifically that is causing this problem.
What if we just fit the median? This shouldn't change anything:
hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_one_q_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, quantile_levels = c(0.5), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
all_one_q_forecasts %>% ggplot(aes(x = target_end_date, y = value)) +
geom_line() +
facet_wrap(~geo_value, scale = "free")
And indeed it does not: most forecasts are still decreasing.
In the interest of simplifying as much as possible, let's just fit using rq directly. The problem is most acute in the furthest ahead, so let's use ahead = 28.
rec <- epi_recipe(hhs_forecast) %>%
step_population_scaling(
hhs,
df = epidatasets::state_census,
df_pop_col = "pop",
create_new = FALSE,
rate_rescaling = 1e5,
by = c("geo_value" = "abbr")
) %>%
step_adjust_latency(hhs, method = "extend_lags") %>%
step_epi_lag(hhs, lag = c(0, 7, 14)) %>%
step_epi_ahead(hhs, ahead = 28)
With the recipe in hand, we prepare the data, dropping NA values by hand since we're mostly bypassing recipes.
fit_obj <- rec %>%
prep(hhs_forecast) %>%
bake(hhs_forecast) %>%
drop_na(starts_with("lag"), starts_with("ahead")) %>%
rq(ahead_28_hhs ~ lag_7_hhs + lag_14_hhs + lag_21_hhs, tau = 0.5, .)
pred_data <- rec %>%
prep(hhs_forecast) %>%
bake(hhs_forecast) %>%
drop_na(starts_with("lag")) %>%
filter(time_value == max(time_value))
predictions <- predict(fit_obj, pred_data)
And since we're not using layers, we'll have to undo population scaling by hand:
tibble(geo_value = pred_data$geo_value, value = predictions) %>%
left_join(state_census, by = join_by(geo_value == abbr)) %>%
mutate(value = value * pop / 1e5) %>%
select(-fips, -name, -pop)
## # A tibble: 54 × 2
## geo_value value
## <chr> <dbl>
## 1 ak 0.469
## 2 al 0.534
## 3 ar 0.379
## 4 as 0.000534
## 5 az 1.13
## 6 ca 0.771
## # ℹ 48 more rows
This is a bit more annoying to try to implement, since it seems that parsnip doesn’t support disabling the intercept outside the formula (so we’d have to do it in the recipe somehow).
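For reference, a minimal sketch of sidestepping that with quantreg directly, dropping the intercept in the formula itself (reusing rec and hhs_forecast from above):
# Sketch: `- 1` in the formula removes the intercept, which quantreg
# supports directly even though parsnip makes it awkward
fit_no_intercept <- rec %>%
  prep(hhs_forecast) %>%
  bake(hhs_forecast) %>%
  drop_na(starts_with("lag"), starts_with("ahead")) %>%
  rq(ahead_28_hhs ~ lag_7_hhs + lag_14_hhs + lag_21_hhs - 1, tau = 0.5, .)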
TL;DR: No. The phenomenon is still happening, at least for the default geos, and this is too slow to run regularly. Let's see what happens if we restrict ourselves to training each geo separately.
hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_geos <- hhs_forecast %>%
distinct(geo_value) %>%
pull(geo_value)
hhs_forecast %>%
filter(!is.na(hhs)) %>%
group_by(geo_value) %>%
summarize(n_points = n()) %>%
arrange(n_points)
all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
all_geos_forecasts %>% plot_dec_forecasts(default_geos)
geos_gr <- get_growth_rates(all_geos_forecasts, quantiles = 0.5, method = "smooth_spline")
geos_gr %>% arrange(desc(growth))
This is at least more of a mixed bag, with plenty of states with positive growth.
geos_gr %>% ggplot(aes(x = growth)) +
geom_histogram(bins = 300)
But most have a negative growth.
Well it is at least different; how exactly is hard to parse:
all_geos_forecasts %>%
left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
mutate(value = value_geo - value_joint) %>%
select(-value_geo, -value_joint) %>%
filter(geo_value %in% default_geos) %>%
ggplot(aes(x = target_end_date, group = geo_value)) +
geom_point(aes(y = value, color = quantile)) +
facet_wrap(~geo_value, scale = "free")
An iterated forecaster will be monotonic in the horizon; a direct forecaster isn't guaranteed to be. There's also something about the increasing staleness of the data that's relevant, though it's somewhat hard to express. Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one step further into the future; see the sketch below.
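A minimal sketch of that iterated scheme, assuming the scaled_pop forecaster and the forecast column layout used throughout this notebook (illustrative only, not part of the pipeline):
# Sketch: each round appends the previous round's medians to the training
# data (as if they were observations) and forecasts one more week out
iterate_forecast <- function(epi_data, n_steps = 4) {
  forecasts <- list()
  for (step in seq_len(n_steps)) {
    fc <- scaled_pop(epi_data, "hhs", ahead = 7, pop_scaling = FALSE)
    forecasts[[step]] <- fc
    medians <- fc %>%
      filter(quantile == 0.5) %>%
      select(geo_value, time_value = target_end_date, hhs = value)
    epi_data <- bind_rows(as_tibble(epi_data), medians) %>%
      as_epi_df(as_of = attr(epi_data, "metadata")$as_of)
  }
  list_rbind(forecasts)
}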
Does adding an I (integrated) component to our model improve things?
hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
# Get the latest values so we know where to sum from later
latest_values <- hhs_forecast %>%
filter(geo_value %in% default_geos) %>%
slice_max(time_value, by=geo_value) %>%
select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- hhs_forecast %>%
group_by(geo_value) %>%
mutate(value_diff = hhs - lag(hhs)) %>%
ungroup() %>%
select(geo_value, time_value, value_diff)
# Forecast the diffs and append the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data) %>%
select(geo_value, time_value = target_end_date, value, quantile) %>%
bind_rows(tidyr::expand_grid(quantile = all_linear_scaled_forecasts$quantile %>% unique, latest_values))
# Sum the diffs to get the final forecast
forecast <- diffed_forecast %>%
group_by(geo_value, quantile) %>%
arrange(time_value) %>%
mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
ungroup() %>%
arrange(geo_value, quantile, time_value) %>%
select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)
This is much better than the original forecasts (we’re forecasting on the same data as in the “problem setup” section).
Let’s combine this with seasonal training data.
hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
# Get the latest values so we know where to sum from later
latest_values <- hhs_forecast %>%
filter(geo_value %in% default_geos) %>%
slice_max(time_value, by=geo_value) %>%
select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- hhs_forecast %>%
group_by(geo_value) %>%
mutate(value_diff = hhs - lag(hhs)) %>%
ungroup() %>%
select(geo_value, time_value, value_diff)
# Forecast the diffs and append the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
select(geo_value, time_value = target_end_date, value, quantile)
diffed_forecast <- diffed_forecast %>%
bind_rows(tidyr::expand_grid(quantile = unique(diffed_forecast$quantile), latest_values))
# Sum the diffs to get the final forecast
forecast <- diffed_forecast %>%
group_by(geo_value, quantile) %>%
arrange(time_value) %>%
mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
ungroup() %>%
arrange(geo_value, quantile, time_value) %>%
select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)
Turns out, we get better coverage this way.
Now let’s try training on the augmented data (fluview and ILI).
joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date)
# Get the latest values so we know where to sum from later
latest_values <- joined_archive_data_as_of %>%
filter(geo_value %in% default_geos, source == "nhsn") %>%
slice_max(time_value, by=geo_value) %>%
select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- joined_archive_data_as_of %>%
filter(geo_value %in% default_geos) %>%
filter(time_value <= forecast_date - 7) %>%
arrange(source, geo_value, time_value) %>%
group_by(geo_value, source) %>%
mutate(value_diff = hhs - lag(hhs)) %>%
filter(!near(0, value_diff)) %>%
ungroup() %>%
select(source, geo_value, time_value, value_diff) %>%
drop_na(value_diff) %>%
as_epi_df(other_keys = "source", as_of = forecast_date)
# Forecast the diffs and append the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, scale_method = "none", center_method = "none", nonlin_method = "none", clip_lower = FALSE, nonneg = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
select(geo_value, time_value = target_end_date, value, quantile)
diffed_forecast <- diffed_forecast %>%
bind_rows(tidyr::expand_grid(quantile = unique(diffed_forecast$quantile), latest_values))
# Sum the diffs to get the final forecast
forecast <- diffed_forecast %>%
group_by(geo_value, quantile) %>%
arrange(time_value) %>%
mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
ungroup() %>%
arrange(geo_value, quantile, time_value) %>%
select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)
It doesn't look great; the median is weirdly flat. We likely need to whiten the data before fitting and then undo the whitening afterwards.
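A minimal sketch of what per-geo whitening could look like here (an assumed approach, not the pipeline's implementation; the centering/scaling choices are placeholders):
# Sketch: center and scale the diffs per geo before forecasting, then invert
whiten_params <- diffed_data %>%
  as_tibble() %>%
  group_by(geo_value) %>%
  summarize(
    center = median(value_diff, na.rm = TRUE),
    scale = IQR(value_diff, na.rm = TRUE) + 1e-3 # guard against zero scale
  )
whitened_data <- diffed_data %>%
  left_join(whiten_params, by = "geo_value") %>%
  mutate(value_diff = (value_diff - center) / scale) %>%
  select(-center, -scale)
# ...forecast on whitened_data as above, then map each prediction back via
# value * scale + center before the cumulative sum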
Roughly, the idea is to filter the training data to points whose growth rate is within a range around that of the most recent data point. To actually execute on this, we'll use a step_filter at the end of the recipe, with a condition based on the growth rate at the first lag. This should only apply during training; during testing we just want the latest data, regardless of the growth rate. First we'll define the function that will filter based on the growth rate.
### Growth rate geo based quantile
filter_hhs <- function(geo_value, time_value, lagged, quantile_lower = 0.25, quantile_upper = 0.75, ...) {
growth_rates <- tibble(geo_value, time_value, lagged) %>%
group_by(geo_value) %>%
mutate(gr = growth_rate(lagged, ...))
gr_res <- growth_rates %>% pull(gr)
quantile_bounds <- gr_res %>% quantile(c(quantile_lower, quantile_upper), na.rm = TRUE)
(quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
}
This computes the growth rate for lagged with whatever extra parameters are handed in, and returns a boolean vector saying whether each point is within the quantile range (say 25% to 75%)[^1].
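As a toy illustration of the bounds logic, on some made-up growth rates (not from the data):
# Toy example: keep values strictly inside the pooled 25%-75% range
gr_res <- c(-0.5, -0.1, 0, 0.05, 0.2, 1)
quantile_bounds <- quantile(gr_res, c(0.25, 0.75), na.rm = TRUE)
(quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
# keeps only 0 and 0.05; the extremes and near-boundary values drop out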
Since we want this to be after everything else we compute, we can just add it after all the other steps in the recipe.
get_filtered_workflow <- function(ahead, in_data = hhs_forecast, trainer = quantile_reg(quantile_levels = covidhub_probs()), ...) {
preproc <- linear_get_preproc(ahead, in_data = in_data) %>%
step_filter(filter_hhs(geo_value, time_value, lag_7_hhs, ...), skip = TRUE)
postproc <- frosting() %>%
layer_predict() %>%
layer_quantile_distn(quantile_levels = covidhub_probs()) %>%
layer_point_from_distn() %>%
layer_threshold() %>%
layer_naomit() %>%
layer_add_target_date() %>%
layer_add_forecast_date()
workflow <- epi_workflow(preproc, trainer) %>%
fit(in_data) %>%
add_frosting(postproc)
workflow
}
hhs_forecast <- hhs_archive %>%
epix_as_of(forecast_date) %>%
filter(time_value > train_value_min)
And fitting:
fit_given_growth_rate_params <- function(...) {
fit_workflows <- map(0:4 * 7, \(ahead) get_filtered_workflow(ahead, ...))
predictions <- map(
fit_workflows,
\(wf) predict(wf, hhs_forecast %>% filter(time_value > as.Date("2023-11-29") - 7 * 28)) %>%
pivot_quantiles_longer(.pred_distn) %>%
rename(value = .pred_distn_value, quantile = .pred_distn_quantile_level) %>%
filter(time_value == forecast_date)
)
all_predictions <- predictions %>%
bind_rows() %>%
rename(target_end_date = target_date) %>%
select(-.pred, -time_value) %>%
arrange(geo_value, target_end_date, quantile)
plot_dec_forecasts(all_predictions, default_geos)
}
fit_given_growth_rate_params(method = "smooth_spline")
Using the "smooth_spline" method didn't seem to help. Let's try "rel_change", with a window size of 5 weeks:
fit_given_growth_rate_params(h = 7 * 5)
If anything that is worse. And linear regression, also using 5 weeks:
fit_given_growth_rate_params(method = "linear_reg", h = 7 * 5)
So this doesn’t seem to be helping. Let’s take a closer look at the growth rates that we’re using to do this filtering.
growth_rates <- hhs_forecast %>%
group_by(geo_value) %>%
mutate(gr = growth_rate(hhs, method = "smooth_spline"))
quantile_bounds <-
growth_rates %>%
pull(gr) %>%
quantile(c(0.25, 0.75), na.rm = TRUE)
growth_rates %>%
ggplot(aes(x = gr)) +
geom_histogram(bins = 300) +
geom_vline(aes(xintercept = quantile_bounds[[1]])) +
scale_x_log10()