Setup
Model tweaks
Inspecting the linear model coefficients
- Fitting simple linear models to the data yields coefficients less than one
- Aside: what if we fit on a log scale?
Reproducing in the context of covid
Appendix

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## ! epidatr cache is being used (set env var EPIDATR_USE_CACHE=FALSE if not intended).
## ℹ The cache directory is ~/.cache/R/epidatr.
## ℹ The cache will be cleared after 42 days and will be pruned if it exceeds 1024 MB.
## ℹ The log of cache transactions is stored at ~/.cache/R/epidatr/logfile.txt.
## Loading required package: epidatasets
## 
## Loading required package: parsnip
## 
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Registered S3 method overwritten by 'epipredict':
##   method            from   
##   print.step_naomit recipes
## 
## 
## Attaching package: 'epipredict'
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## 
## Attaching package: 'epiprocess'
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## 
## Attaching package: 'ggplot2'
## 
## 
## The following object is masked from 'package:epipredict':
## 
##     layer
## 
## 
## here() starts at /home/dskel/repos/delphi/exploration-tooling
## 
## 
## Attaching package: 'hubValidations'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## 
## 
## Attaching package: 'lubridate'
## 
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year
## 
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## 
## 
## Attaching package: 'plotly'
## 
## 
## The following object is masked from 'package:paws.storage':
## 
##     config
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## 
## 
## Attaching package: 'purrr'
## 
## 
## The following object is masked from 'package:magrittr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:jsonlite':
## 
##     flatten
## 
## 
## The following object is masked from 'package:data.table':
## 
##     transpose
## 
## 
## qs2 0.1.5
## 
## Loading required package: SparseM
## 
## 
## Attaching package: 'recipes'
## 
## 
## The following object is masked from 'package:stats':
## 
##     step
## 
## 
## 
## Attaching package: 'renv'
## 
## 
## The following object is masked from 'package:recipes':
## 
##     update
## 
## 
## The following object is masked from 'package:purrr':
## 
##     modify
## 
## 
## The following object is masked from 'package:pak':
## 
##     lockfile_create
## 
## 
## The following object is masked from 'package:languageserver':
## 
##     run
## 
## 
## The following objects are masked from 'package:stats':
## 
##     embed, update
## 
## 
## The following objects are masked from 'package:utils':
## 
##     history, upgrade
## 
## 
## The following objects are masked from 'package:base':
## 
##     autoload, load, remove, use
## 
## 
## 
## Attaching package: 'rlang'
## 
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, flatten_raw,
##     invoke, splice
## 
## 
## The following object is masked from 'package:magrittr':
## 
##     set_names
## 
## 
## The following objects are masked from 'package:jsonlite':
## 
##     flatten, unbox
## 
## 
## The following object is masked from 'package:hubValidations':
## 
##     is_error
## 
## 
## The following object is masked from 'package:data.table':
## 
##     :=
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     has_name
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## 
## Attaching package: 'stringr'
## 
## 
## The following object is masked from 'package:recipes':
## 
##     fixed
## 
## 
## 
## Attaching package: 'testthat'
## 
## 
## The following object is masked from 'package:targets':
## 
##     matches
## 
## 
## The following object is masked from 'package:tarchetypes':
## 
##     matches
## 
## 
## The following objects are masked from 'package:rlang':
## 
##     is_false, is_null, is_true
## 
## 
## The following objects are masked from 'package:readr':
## 
##     edition_get, local_edition
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_null
## 
## 
## The following objects are masked from 'package:magrittr':
## 
##     equals, is_less_than, not
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     matches
## 
## 
## The following object is masked from 'package:crew':
## 
##     matches
## 
## 
## 
## Attaching package: 'tibble'
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     has_name
## 
## 
## 
## Attaching package: 'tidyr'
## 
## 
## The following object is masked from 'package:testthat':
## 
##     matches
## 
## 
## The following object is masked from 'package:magrittr':
## 
##     extract

For many of the direct forecasters, the forecast is strictly decreasing, even in the middle of the surge. This effect is most prominent in flu, but occurs somewhat in covid. We need to resolve the source of this. It is some combination of the data and the models used.

Roughly, the conclusion after having done these experiments is that it is in fact an over-representation of decreasing time_values that is causing this behavior (especially look at fitting simple linear models to the data). Some possible fixes include:

Some method of data filtering or weighting based on the current growth rate. The seasonal forecaster probably benefits from this effect. An alternative would be to bring back some sort of classifier-type system (either using an actual classifier, or filtering to comparable growth rates).
Improve our ability to use non-linear classifiers. These would likely be able to handle forecasting both increasing and decreasing signals, but would require a better method of generating quantiles. For example, by comparing a given forecast to the climate model, or a comparison with the trajectories that occur on comparable forecast dates (this brings us back to 1).
Making a growth rate extrapolation model, and incorporating its predictions as a covariate. And maybe something better than just simple last-growth-rate-carried-forward. Ideally we’d have the growth rate taper based on growing population immunity.

Setup

Data

This notebook depends on having successfully run the flu_hosp_explore targets pipeline to handle the creation of the basic dataset.

Sys.setenv(TAR_PROJECT = "flu_hosp_explore")
tar_make(hhs_archive)
hhs_archive <- tar_read(hhs_archive) %>% as_epi_archive()
train_value_min <- "2002-06-01"
hhs_archive %>%
  epix_as_of_current() %>%
  filter(time_value > "2023-10-01") %>%
  autoplot(hhs)

To avoid running too frequently, we’ll limit to a single forecast date just after the peak of the rate of growth, so that ~ everywhere is increasing. Here’s a table of the number of locations that have their max on any given week:

hhs_gr <- hhs_archive %>%
  epix_as_of_current() %>%
  drop_na(hhs) %>%
  as_epi_df() %>%
  group_by(geo_value) %>%
  mutate(gr_hhs = growth_rate(hhs)) %>%
  filter(time_value > "2023-10-01")
hhs_gr %>%
  arrange(gr_hhs) %>%
  drop_na() %>%
  slice_max(gr_hhs) %>%
  ungroup() %>%
  group_by(time_value) %>%
  summarize(nn = length(hhs)) %>%
  print(n=16)

## # A tibble: 8 × 2
##   time_value    nn
##   <date>     <int>
## 1 2023-10-04     2
## 2 2023-10-18     1
## 3 2023-10-25     3
## 4 2023-11-01     5
## 5 2023-11-08    14
## 6 2023-11-15    18
## 7 2023-11-22     7
## 8 2023-12-06     3

Since most have the largest growth rate on the 15th, so let’s choose 11/29 as our forecast_date, to make sure there’s some trend for the forecasters to pick up on.

forecast_date <- as.Date("2023-11-29")
hhs_gr %>% autoplot(gr_hhs) +
  geom_vline(aes(xintercept = forecast_date), lty = 2) +
  labs(title = "growth rates")

A plot to confirm that most locations are still increasing on the 29th.

Note that this is RATE DATA, and not count data, so pop_scaling = FALSE should be the default.

Some utility functions

Since we don’t really need to run the full pipeline to get forecasts from a single day and forecaster, we build a couple of functions for inspecting forecasts.

forecast_aheads <- function(forecaster, epi_data = hhs_forecast, aheads = 0:4 * 7) {
  all_forecasts <- map(aheads, \(ahead) forecaster(epi_data, ahead)) %>% list_rbind()
  all_forecasts
}

Here’s a way to easily plot a subset of the forecasts, with bands at the 80% and 50% intervals (.1-.9 and .25-.75) against the finalized data.

plot_dec_forecasts <- function(all_forecasts,
                               geo_values,
                               data_archive = hhs_archive,
                               earliest_truth_data = NULL) {
  if (is.null(earliest_truth_data)) {
    earliest_truth_data <- all_forecasts$forecast_date[[1]] - as.difftime(365, units = "days")
  }
  as_of_plotting <- data_archive %>%
    epix_as_of(min(all_forecasts$forecast_date)) %>%
    filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
    as_tibble() %>%
    mutate(forecast_date = time_value) %>%
    filter(time_value >= earliest_truth_data)
  # transform the archive to something useful for comparison
  finalized_plotting <- data_archive %>%
    epix_as_of_current() %>%
    filter(time_value <= max(all_forecasts$target_end_date), geo_value %in% geo_values) %>%
    as_tibble() %>%
    mutate(forecast_date = time_value) %>%
    filter(time_value >= earliest_truth_data)
  all_forecasts %>%
    filter(geo_value %in% geo_values) %>%
    pivot_wider(names_from = quantile, values_from = value) %>%
    ggplot(aes(x = target_end_date, group = geo_value, fill = forecast_date)) +
    geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), alpha = 0.4) +
    geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), alpha = 0.6) +
    geom_line(aes(y = `0.5`, color = forecast_date)) +
    geom_line(data = as_of_plotting, aes(x = time_value, y = hhs), color = "grey") +
    geom_line(data = finalized_plotting, aes(x = time_value, y = hhs), color = "black") +
    facet_wrap(~geo_value, scale = "free") +
    theme(legend.position = "none")
}

And a method to inspect whether things are increasing that isn’t just the eyeball norm on a few of them. This calculates growth rates for each quantile and each location.

get_growth_rates <- function(forecasts, quantiles = NULL, outlier_bound = 1e2, ...) {
  if (is.null(quantiles)) {
    quantiles <- forecasts$quantile %>% unique()
  }
  forecasts %>%
    group_by(geo_value, quantile) %>%
    filter(min(value) != max(value), quantile %in% quantiles) %>%
    drop_na(value) %>%
    mutate(growth = growth_rate(value, ...)) %>%
    filter(abs(growth) < outlier_bound)
}

Establishing the problem

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)
all_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_forecasts, default_geos)

All the forecasts are going down rather than up, even though they have multiple weeks of data! More quantitatively, let’s compute the growth rate of the median forecast for each ahead, across all geos, and then look at a histogram of that

basic_gr <- get_growth_rates(all_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)

## # A tibble: 270 × 6
## # Groups:   geo_value, quantile [54]
##    geo_value forecast_date target_end_date quantile  value   growth
##    <chr>     <date>        <date>             <dbl>  <dbl>    <dbl>
##  1 as        2023-11-29    2023-11-29           0.5 0.0138  1.50   
##  2 as        2023-11-29    2023-12-06           0.5 0.0338  0.627  
##  3 as        2023-11-29    2023-12-13           0.5 0.0553  0.432  
##  4 as        2023-11-29    2023-12-20           0.5 0.0832  0.318  
##  5 as        2023-11-29    2023-12-27           0.5 0.108   0.238  
##  6 nh        2023-11-29    2023-11-29           0.5 0.0877  0.117  
##  7 nh        2023-11-29    2023-12-06           0.5 0.0956  0.105  
##  8 nh        2023-11-29    2023-12-13           0.5 0.108   0.0950 
##  9 nh        2023-11-29    2023-12-20           0.5 0.117   0.0868 
## 10 nh        2023-11-29    2023-12-27           0.5 0.128   0.0799 
## 11 sd        2023-11-29    2023-12-20           0.5 0.126   0.0563 
## 12 sd        2023-11-29    2023-12-27           0.5 0.133   0.0539 
## 13 sd        2023-11-29    2023-12-13           0.5 0.119   0.0473 
## 14 sd        2023-11-29    2023-12-06           0.5 0.116   0.00471
## 15 de        2023-11-29    2023-11-29           0.5 0.240  -0.0100 
## 16 de        2023-11-29    2023-12-06           0.5 0.237  -0.0203 
## 17 ut        2023-11-29    2023-11-29           0.5 0.324  -0.0208 
## 18 sd        2023-11-29    2023-11-29           0.5 0.118  -0.0271 
## 19 ri        2023-11-29    2023-11-29           0.5 0.330  -0.0354 
## 20 ri        2023-11-29    2023-12-06           0.5 0.328  -0.0367 
## 21 ri        2023-11-29    2023-12-13           0.5 0.299  -0.0381 
## 22 ri        2023-11-29    2023-12-20           0.5 0.300  -0.0396 
## 23 ri        2023-11-29    2023-12-27           0.5 0.286  -0.0413 
## 24 dc        2023-11-29    2023-11-29           0.5 1.22   -0.0446 
## 25 de        2023-11-29    2023-12-13           0.5 0.229  -0.0467 
## 26 ut        2023-11-29    2023-12-06           0.5 0.314  -0.0499 
## 27 ia        2023-11-29    2023-11-29           0.5 0.870  -0.0541 
## 28 mi        2023-11-29    2023-11-29           0.5 0.885  -0.0559 
## 29 de        2023-11-29    2023-12-27           0.5 0.205  -0.0559 
## 30 oh        2023-11-29    2023-11-29           0.5 0.649  -0.0574 
## # ℹ 240 more rows

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

There are a few exceptional locations with actually positive growth rates, but the vast majority of aheads and geos have negative growth rates.

Model tweaks

Very short training windows remove the decreasing problem

If we limit to the last 3 weeks of data (so effectively just a linear extrapolation shared across geos), it goes away:

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)
all_short_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, n_training = 3, pop_scaling = FALSE))
plot_dec_forecasts(all_short_forecasts, default_geos)

They’re pretty jittery, but strictly decreasing they are not. And the corresponding growth rates:

short_gr <- get_growth_rates(all_short_forecasts, quantiles = 0.5, method = "smooth_spline")
short_gr %>%
  arrange(growth) %>%
  ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

So most aheads and locations have a positive growth rate, with some strong positive outliers and some amount decreasing. This generally fits with the jittery but positive nature of the example location plots.

Switching to linear regression slightly mitigates, but doesn’t remove the problem

First, confirming that it happens for simple linear regression and not just quantile regression.

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)
all_linear_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg(), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_linear_forecasts, default_geos)

Which appears to also have this problem (and very narrow quantiles in some locations).

basic_gr <- get_growth_rates(all_linear_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>%
  arrange(desc(growth)) %>%
  print(n = 30)

## # A tibble: 270 × 6
## # Groups:   geo_value, quantile [54]
##    geo_value forecast_date target_end_date quantile  value growth
##    <chr>     <date>        <date>             <dbl>  <dbl>  <dbl>
##  1 as        2023-11-29    2023-11-29           0.5 0.0800  1.24 
##  2 ak        2023-11-29    2023-12-20           0.5 0.0900  1.02 
##  3 ak        2023-11-29    2023-12-27           0.5 0.361   0.999
##  4 as        2023-11-29    2023-12-06           0.5 0.181   0.585
##  5 nh        2023-11-29    2023-11-29           0.5 0.144   0.564
##  6 nh        2023-11-29    2023-12-06           0.5 0.229   0.402
##  7 as        2023-11-29    2023-12-13           0.5 0.291   0.374
##  8 sd        2023-11-29    2023-11-29           0.5 0.165   0.367
##  9 sd        2023-11-29    2023-12-06           0.5 0.230   0.318
## 10 nh        2023-11-29    2023-12-13           0.5 0.325   0.290
## 11 de        2023-11-29    2023-11-29           0.5 0.313   0.262
## 12 sd        2023-11-29    2023-12-13           0.5 0.310   0.261
## 13 as        2023-11-29    2023-12-20           0.5 0.393   0.241
## 14 de        2023-11-29    2023-12-06           0.5 0.396   0.213
## 15 ri        2023-11-29    2023-11-29           0.5 0.445   0.200
## 16 ut        2023-11-29    2023-11-29           0.5 0.408   0.199
## 17 sd        2023-11-29    2023-12-20           0.5 0.388   0.192
## 18 nh        2023-11-29    2023-12-20           0.5 0.411   0.185
## 19 as        2023-11-29    2023-12-27           0.5 0.482   0.178
## 20 de        2023-11-29    2023-12-13           0.5 0.479   0.164
## 21 ut        2023-11-29    2023-12-06           0.5 0.488   0.159
## 22 or        2023-11-29    2023-11-29           0.5 0.345   0.158
## 23 sd        2023-11-29    2023-12-27           0.5 0.461   0.157
## 24 ri        2023-11-29    2023-12-06           0.5 0.541   0.156
## 25 or        2023-11-29    2023-12-06           0.5 0.402   0.151
## 26 nh        2023-11-29    2023-12-27           0.5 0.480   0.138
## 27 or        2023-11-29    2023-12-13           0.5 0.465   0.131
## 28 ne        2023-11-29    2023-11-29           0.5 0.585   0.129
## 29 id        2023-11-29    2023-12-06           0.5 0.404   0.129
## 30 ri        2023-11-29    2023-12-13           0.5 0.613   0.123
## # ℹ 240 more rows

There’s a least a good number of locations/aheads that have a positive growth rate.

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

But the majority are negative.

Switching to a nonlinear engine removes the decreasing problem

Boosted trees

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)

all_boost_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = boost_tree(mode = "regression"), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_boost_forecasts, default_geos)

Boosted trees don’t have the problem? Mostly? The forecasts aren’t great, but at least they’re not plummeting. The quantiles are garbage, but that’s kind of to be expected with residual quantiles on a non-linear method.

basic_gr <- get_growth_rates(all_boost_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

Mostly positive, so I think we can count this as not having the problem. Something about a linear model is the issue.

Fitting only 1 lag does not change the problem

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)
all_one_lag_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, lag = 0))
all_one_lag_forecasts

## # A tibble: 6,210 × 5
##   geo_value forecast_date target_end_date quantile value
##   <chr>     <date>        <date>             <dbl> <dbl>
## 1 ak        2023-11-29    2023-11-29         0.01  0.389
## 2 ak        2023-11-29    2023-11-29         0.025 0.629
## 3 ak        2023-11-29    2023-11-29         0.05  0.800
## 4 ak        2023-11-29    2023-11-29         0.1   1.00 
## 5 ak        2023-11-29    2023-11-29         0.15  1.12 
## 6 ak        2023-11-29    2023-11-29         0.2   1.22 
## # ℹ 6,204 more rows

plot_dec_forecasts(all_one_lag_forecasts, default_geos)

Still constantly falling, unfortunately, and a surprisingly similar forecast.

Removing 2020/21 and 2021/22 improves but doesn’t fix the situation

The first 2 years are bad for flu, as there was ~no spread. In practice we know this behavior happens when we remove those years, and even the summers. I’m including this primarily to compare the change in growth rates.

hhs_forecast_recent <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > "2022-06-01")
all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast_recent)
plot_dec_forecasts(all_recent_forecasts, default_geos)

Which is still decreasing.

basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)

## # A tibble: 270 × 6
## # Groups:   geo_value, quantile [54]
##    geo_value forecast_date target_end_date quantile  value   growth
##    <chr>     <date>        <date>             <dbl>  <dbl>    <dbl>
##  1 as        2023-11-29    2023-11-29           0.5 0.0327  1.01   
##  2 as        2023-11-29    2023-12-06           0.5 0.0659  0.513  
##  3 as        2023-11-29    2023-12-13           0.5 0.103   0.405  
##  4 as        2023-11-29    2023-12-20           0.5 0.146   0.280  
##  5 nh        2023-11-29    2023-12-06           0.5 0.118   0.196  
##  6 nh        2023-11-29    2023-12-13           0.5 0.144   0.193  
##  7 as        2023-11-29    2023-12-27           0.5 0.182   0.184  
##  8 nh        2023-11-29    2023-11-29           0.5 0.0985  0.175  
##  9 nh        2023-11-29    2023-12-20           0.5 0.170   0.136  
## 10 sd        2023-11-29    2023-12-13           0.5 0.152   0.119  
## 11 sd        2023-11-29    2023-12-20           0.5 0.169   0.117  
## 12 sd        2023-11-29    2023-12-27           0.5 0.190   0.109  
## 13 nh        2023-11-29    2023-12-27           0.5 0.191   0.103  
## 14 sd        2023-11-29    2023-12-06           0.5 0.133   0.0895 
## 15 ak        2023-11-29    2023-12-27           0.5 0.492   0.0814 
## 16 sd        2023-11-29    2023-11-29           0.5 0.127   0.0552 
## 17 de        2023-11-29    2023-11-29           0.5 0.261   0.0230 
## 18 de        2023-11-29    2023-12-06           0.5 0.268   0.0197 
## 19 de        2023-11-29    2023-12-13           0.5 0.271   0.0125 
## 20 de        2023-11-29    2023-12-20           0.5 0.279  -0.00157
## 21 ri        2023-11-29    2023-11-29           0.5 0.390  -0.0101 
## 22 ri        2023-11-29    2023-12-06           0.5 0.398  -0.0102 
## 23 ri        2023-11-29    2023-12-13           0.5 0.374  -0.0103 
## 24 ri        2023-11-29    2023-12-20           0.5 0.393  -0.0104 
## 25 ri        2023-11-29    2023-12-27           0.5 0.372  -0.0106 
## 26 de        2023-11-29    2023-12-27           0.5 0.268  -0.0116 
## 27 ut        2023-11-29    2023-11-29           0.5 0.354  -0.0171 
## 28 or        2023-11-29    2023-12-13           0.5 0.284  -0.0174 
## 29 ut        2023-11-29    2023-12-06           0.5 0.354  -0.0198 
## 30 ut        2023-11-29    2023-12-13           0.5 0.340  -0.0251 
## # ℹ 240 more rows

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

which are more or less the same level of bad.

Linear only

However if we filter to 2022 onwards and use a linear engine:

hhs_forecast_recent <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > "2022-06-01")
all_recent_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE, trainer = linear_reg()), epi_data = hhs_forecast_recent)
plot_dec_forecasts(all_recent_forecasts, default_geos)

Some are actually increasing, while some are decreasing.

basic_gr <- get_growth_rates(all_recent_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)

## # A tibble: 270 × 6
## # Groups:   geo_value, quantile [54]
##    geo_value forecast_date target_end_date quantile value growth
##    <chr>     <date>        <date>             <dbl> <dbl>  <dbl>
##  1 as        2023-11-29    2023-11-29           0.5 0.135  1.29 
##  2 nh        2023-11-29    2023-11-29           0.5 0.195  0.790
##  3 ak        2023-11-29    2023-12-27           0.5 0.555  0.775
##  4 ak        2023-11-29    2023-12-20           0.5 0.219  0.683
##  5 sd        2023-11-29    2023-11-29           0.5 0.214  0.614
##  6 as        2023-11-29    2023-12-06           0.5 0.314  0.604
##  7 nh        2023-11-29    2023-12-06           0.5 0.355  0.492
##  8 sd        2023-11-29    2023-12-06           0.5 0.353  0.436
##  9 de        2023-11-29    2023-11-29           0.5 0.367  0.421
## 10 as        2023-11-29    2023-12-13           0.5 0.511  0.386
## 11 nh        2023-11-29    2023-12-13           0.5 0.539  0.338
## 12 ut        2023-11-29    2023-11-29           0.5 0.466  0.332
## 13 ri        2023-11-29    2023-11-29           0.5 0.518  0.323
## 14 sd        2023-11-29    2023-12-13           0.5 0.519  0.322
## 15 de        2023-11-29    2023-12-06           0.5 0.526  0.316
## 16 or        2023-11-29    2023-11-29           0.5 0.397  0.315
## 17 id        2023-11-29    2023-12-06           0.5 0.505  0.266
## 18 or        2023-11-29    2023-12-06           0.5 0.527  0.266
## 19 md        2023-11-29    2023-11-29           0.5 0.413  0.265
## 20 ut        2023-11-29    2023-12-06           0.5 0.622  0.254
## 21 as        2023-11-29    2023-12-20           0.5 0.697  0.248
## 22 md        2023-11-29    2023-12-06           0.5 0.528  0.240
## 23 ri        2023-11-29    2023-12-06           0.5 0.691  0.237
## 24 de        2023-11-29    2023-12-13           0.5 0.695  0.237
## 25 ne        2023-11-29    2023-11-29           0.5 0.653  0.233
## 26 sd        2023-11-29    2023-12-20           0.5 0.678  0.221
## 27 mn        2023-11-29    2023-11-29           0.5 0.672  0.219
## 28 id        2023-11-29    2023-11-29           0.5 0.402  0.218
## 29 or        2023-11-29    2023-12-13           0.5 0.674  0.216
## 30 wa        2023-11-29    2023-11-29           0.5 0.577  0.216
## # ℹ 240 more rows

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

Overall, more are now positive than they were in the totally unfiltered case, but there’s still a number of unreasonably negative values.

Inspecting the linear model coefficients

First lets examine the coefficients that are actually fit; to do that from within scaled_pop would involve a browser(). For the sake of reproducibility, we will make the steps by hand. Note that I’ve tried this section with both filtering pre 2022 values and not, and the results are approximately the same.

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)

linear_get_preproc <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast, to_scale = FALSE) {
  preproc <- epi_recipe(in_data)
  if (to_scale) {
    preproc <- preproc %>%
      step_population_scaling(
        hhs,
        df = epidatasets::state_census,
        df_pop_col = "pop",
        create_new = FALSE,
        rate_rescaling = 1e5,
        by = c("geo_value" = "abbr")
      )
  }
  preproc <- preproc %>%
    step_adjust_latency(hhs, method = "extend_lags") %>%
    step_epi_lag(hhs, lag = lag) %>%
    step_epi_ahead(hhs, ahead = ahead) %>%
    step_epi_naomit()
}
linear_get_workflow <- function(ahead, lag = c(0, 7, 14), in_data = hhs_forecast) {
  preproc <- linear_get_preproc(ahead, lag, in_data)
  postproc <- frosting() %>%
    layer_predict() %>%
    layer_residual_quantiles(quantile_levels = covidhub_probs()) %>%
    layer_threshold() %>%
    layer_naomit() %>%
    layer_add_target_date() %>%
    layer_add_forecast_date()
  workflow <- epi_workflow(preproc, linear_reg()) %>%
    fit(in_data) %>%
    add_frosting(postproc)
  workflow
}
all_workflows <- map(0:4 * 7, linear_get_workflow)

Starting with the largest ahead, the coefficients are

workflows::extract_fit_parsnip(all_workflows[[5]])

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
## (Intercept)    lag_7_hhs   lag_14_hhs   lag_21_hhs  
##      0.4820       0.9557      -0.4906      -0.1604

So the intercept is actually positive (so it’s not biased towards decreasing inherently), but the coefficients for two of the lags are negative. Even including that, the lag_7_hhs coefficient is less than one, so regardless of the fact that two coefficients are negative it will de-facto always be below the original value.

How about the zero ahead? Thanks to latency this is actually a one week forward projection, so we can’t expect exactly just lag_7_hhs to be one and everything else zero.

workflows::extract_fit_parsnip(all_workflows[[1]])

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
## (Intercept)    lag_7_hhs   lag_14_hhs   lag_21_hhs  
##     0.08001      1.21847     -0.20524     -0.12728

But it is still surprisingly close to exactly the lag_7_hhs value. If the signal were constant so far though, it would still be predicting a decrease thanks to the lag_21_hhs coefficient.

Fitting simple linear models to the data yields coefficients less than one

Given that something strange is going on with the data that we’re fitting, it is worth plotting the data as it is seen by the linear regressor. Since visualizing a 4D vector is a pain, let’s start with fitting just the lag = 0 case, which still has similar behavior (and in fact is the dominant coefficient above anyways). The way to get the data is using prep and bake:

preproc <- linear_get_preproc(0:4 * 7)
fit_data_long <-
  preproc %>%
  prep(hhs_forecast) %>%
  bake(hhs_forecast) %>%
  select(geo_value, time_value, starts_with("lag"), starts_with("ahead")) %>%
  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
  drop_na(lag_7_hhs, value) %>%
  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
  arrange(geo_value, time_value, ahead_value) %>%
  mutate(epi_week = epiweek(time_value))

The fit for 28 days ahead is

all_single_workflows <- map(0:4 * 7, \(ahead) linear_get_workflow(ahead, lag = 0))
workflows::extract_fit_parsnip(all_single_workflows[[5]])

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
## (Intercept)    lag_7_hhs  
##      0.4339       0.3734

which is positive but less than one, and thus predicts a decrease. Since that data as seen by this regression is 1D, we can plot it against the target ahead, and then plot the regression.

plot_linear_data <- function(fit_data_long) {
  fit_data_long %>%
    ggplot(aes(x = lag_7_hhs, y = value)) +
    geom_point(aes(color = epi_week), alpha = 1, size = 0.7) +
    geom_smooth(method = "lm", formula = y ~ x+0) +
    geom_abline(intercept = 0, slope = 1) +
    facet_wrap(~ahead_value, scales = "free") +
    scale_color_viridis_c() +
    scale_fill_viridis_c(option = "magma", trans = "log10")
}
fit_data_long %>%
  plot_linear_data()

All of which have slope less than 1. There is a stronger correlation between the smaller aheads and the value, and the slope is much closer to one. Adding the color corresponding to the season week potentially gives us some idea of the problem; there is a large mass of values with a low slope and a low epi_week. If we crudely just cut out everything with an epi_week below 10:

fit_data_long %>%
  filter(epi_week > 10) %>%
  plot_linear_data()

which is better but still below one. For some reason most of the remaining points egregiously below the diagonal are in epi_weeks very near 50. If we also filter out the last 4 weeks:

fit_data_long %>%
  filter(epi_week > 10, epi_week < 48) %>%
  plot_linear_data()

Then suddenly the slopes can be quite large! This is of course not a particularly principled way of selecting training data.

Aside: what if we fit on a log scale?

I originally added this plot in an attempt to make the blob near zero clearer, but it actually just did a linear fit on the log graph. This is significantly closer to a positive linear slope, though it is still negative.

fit_data_long %>%
  ggplot(aes(x = lag_7_hhs, y = value)) +
  geom_hex(alpha = 0.5, bins = 30) +
  geom_point(aes(color = epi_week), alpha = 0.3, size = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x) +
  geom_abline(intercept = 0, slope = 1) +
  scale_y_log10() +
  scale_x_log10() +
  facet_wrap(~ahead_value, scales = "free") +
  scale_color_viridis_c() +
  scale_fill_viridis_c(option = "magma", trans = "log10")

Reproducing in the context of covid

Constantly decreasing forecasters has been less of an issue in covid, so we should do a comparison. Since we’re assuming the project is flu_hosp_explore, we have to directly access the covid archive. Covid is in counts, so first we convert to rates.

hhs_covid_archive <-
  qs2::qs_read(here::here("covid_hosp_explore/objects/hhs_archive"))$DT %>%
  filter(time_value > train_value_min) %>%
  left_join(state_census, by = join_by(geo_value == abbr)) %>%
  mutate(value = value / pop * 1e5) %>%
  select(-fips, -name, -pop) %>%
  rename(hhs = value) %>%
  as_epi_archive()
hhs_covid_forecast <- hhs_covid_archive %>% epix_as_of(forecast_date)
hhs_covid_forecast %>% autoplot(hhs)

Forecasting using the same methods as the original problem

all_covid_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_dat = hhs_covid_forecast)
plot_dec_forecasts(all_covid_forecasts, default_geos, hhs_covid_archive)

Still sort of present! And does it show up in a histogram of growth rates?

basic_gr <- get_growth_rates(all_covid_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>% arrange(desc(growth)) %>% print(n=30)

## # A tibble: 275 × 6
## # Groups:   geo_value, quantile [55]
##    geo_value forecast_date target_end_date quantile value growth
##    <chr>     <date>        <date>             <dbl> <dbl>  <dbl>
##  1 as        2023-11-29    2023-11-29           0.5 0.184 2.32  
##  2 vi        2023-11-29    2023-11-29           0.5 0.184 2.32  
##  3 as        2023-11-29    2023-12-06           0.5 0.620 0.732 
##  4 vi        2023-11-29    2023-12-06           0.5 0.620 0.732 
##  5 as        2023-11-29    2023-12-13           0.5 1.12  0.506 
##  6 vi        2023-11-29    2023-12-13           0.5 1.12  0.506 
##  7 as        2023-11-29    2023-12-20           0.5 1.76  0.391 
##  8 vi        2023-11-29    2023-12-20           0.5 1.76  0.391 
##  9 as        2023-11-29    2023-12-27           0.5 2.46  0.292 
## 10 vi        2023-11-29    2023-12-27           0.5 2.46  0.292 
## 11 ak        2023-11-29    2023-12-20           0.5 2.23  0.237 
## 12 ak        2023-11-29    2023-12-27           0.5 2.76  0.192 
## 13 ak        2023-11-29    2023-12-13           0.5 1.77  0.176 
## 14 wa        2023-11-29    2023-12-27           0.5 3.89  0.108 
## 15 wa        2023-11-29    2023-12-20           0.5 3.49  0.104 
## 16 pr        2023-11-29    2023-11-29           0.5 3.85  0.0937
## 17 fl        2023-11-29    2023-12-20           0.5 4.08  0.0858
## 18 pr        2023-11-29    2023-12-06           0.5 4.28  0.0845
## 19 ms        2023-11-29    2023-11-29           0.5 3.97  0.0829
## 20 ga        2023-11-29    2023-12-20           0.5 4.13  0.0822
## 21 wa        2023-11-29    2023-12-13           0.5 3.18  0.0812
## 22 fl        2023-11-29    2023-12-27           0.5 4.43  0.0781
## 23 ms        2023-11-29    2023-12-06           0.5 4.36  0.0765
## 24 pr        2023-11-29    2023-12-13           0.5 4.58  0.0761
## 25 ga        2023-11-29    2023-12-27           0.5 4.47  0.0745
## 26 la        2023-11-29    2023-12-20           0.5 4.40  0.0712
## 27 ms        2023-11-29    2023-12-13           0.5 4.61  0.0711
## 28 pr        2023-11-29    2023-12-20           0.5 4.96  0.0695
## 29 la        2023-11-29    2023-12-27           0.5 4.72  0.0692
## 30 fl        2023-11-29    2023-12-13           0.5 3.76  0.0687
## # ℹ 245 more rows

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

On average the growth rate is positive, but really at this point ~none of the forecasts should have a negative slope of the median. Let’s take a look at what the forecast is actually seeing

preproc <- linear_get_preproc(0:4 * 7, in_data = hhs_covid_forecast)
fit_covid_data <- preproc %>%
  prep(hhs_covid_forecast) %>%
  bake(hhs_covid_forecast) %>%
  select(geo_value, time_value, starts_with("lag"), starts_with("ahead"))
fit_covid_data %>%
  mutate(epi_week = epiweek(time_value)) %>%
  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
  plot_linear_data()

Which, other than some massive outliers, has similar trends (the slope decreases as we increase the ahead, along with the spread).

And if we cut out the early epi_weeks?

fit_covid_data %>%
  mutate(epi_week = epiweek(time_value)) %>%
  filter(epi_week > 10) %>%
  pivot_longer(cols = starts_with("ahead"), names_to = "ahead_value") %>%
  mutate(ahead_value = factor(ahead_value, c("ahead_0_hhs", "ahead_7_hhs", "ahead_14_hhs", "ahead_21_hhs", "ahead_28_hhs"))) %>%
  plot_linear_data()

Which is marginally better. The problematic data is less clear here date-wise than in the case of flu; partly this is because the data covers a much larger range of values. Partly I suspect that this is also it’s less seasonal.

Appendix

Some other investigations.

Dividing by population squared

This was an accidental find, but if we do population scale the already population scaled data, the problem mostly goes away.

hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_linear_scaled_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, trainer = linear_reg()))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
plot_dec_forecasts(all_linear_scaled_forecasts, default_geos)

And just like that the problem is gone?

basic_gr <- get_growth_rates(all_linear_scaled_forecasts, quantiles = 0.5, method = "smooth_spline")
basic_gr %>%
  arrange(growth) %>%
  print(n = 30)

## # A tibble: 268 × 6
## # Groups:   geo_value, quantile [54]
##    geo_value forecast_date target_end_date quantile value growth
##    <chr>     <date>        <date>             <dbl> <dbl>  <dbl>
##  1 ak        2023-11-29    2023-12-13           0.5 0.107 -1.97 
##  2 ak        2023-11-29    2023-12-06           0.5 0.461 -1.10 
##  3 ak        2023-11-29    2023-11-29           0.5 1.08  -0.624
##  4 hi        2023-11-29    2023-12-27           0.5 0.828 -0.244
##  5 wy        2023-11-29    2023-12-27           0.5 2.38  -0.213
##  6 nd        2023-11-29    2023-12-27           0.5 0.251 -0.194
##  7 dc        2023-11-29    2023-12-27           0.5 0.923 -0.193
##  8 hi        2023-11-29    2023-12-20           0.5 1.05  -0.190
##  9 pr        2023-11-29    2023-12-13           0.5 2.28  -0.178
## 10 vt        2023-11-29    2023-12-27           0.5 0.306 -0.176
## 11 wy        2023-11-29    2023-12-20           0.5 2.89  -0.175
## 12 pr        2023-11-29    2023-12-20           0.5 1.90  -0.172
## 13 nd        2023-11-29    2023-12-20           0.5 0.309 -0.162
## 14 pr        2023-11-29    2023-12-06           0.5 2.69  -0.160
## 15 hi        2023-11-29    2023-12-13           0.5 1.22  -0.151
## 16 pr        2023-11-29    2023-11-29           0.5 3.15  -0.150
## 17 vt        2023-11-29    2023-12-20           0.5 0.362 -0.149
## 18 nd        2023-11-29    2023-12-13           0.5 0.345 -0.140
## 19 pr        2023-11-29    2023-12-27           0.5 1.64  -0.138
## 20 dc        2023-11-29    2023-12-20           0.5 1.09  -0.136
## 21 vt        2023-11-29    2023-12-13           0.5 0.413 -0.128
## 22 hi        2023-11-29    2023-12-06           0.5 1.42  -0.124
## 23 nd        2023-11-29    2023-12-06           0.5 0.402 -0.122
## 24 wy        2023-11-29    2023-12-13           0.5 3.35  -0.120
## 25 vt        2023-11-29    2023-12-06           0.5 0.469 -0.112
## 26 la        2023-11-29    2023-12-27           0.5 2.15  -0.111
## 27 nd        2023-11-29    2023-11-29           0.5 0.450 -0.109
## 28 hi        2023-11-29    2023-11-29           0.5 1.57  -0.106
## 29 me        2023-11-29    2023-12-20           0.5 1.46  -0.103
## 30 me        2023-11-29    2023-12-27           0.5 1.32  -0.101
## # ℹ 238 more rows

There’s a mix of growth and decay.

basic_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

The majority of growth rates are positive; there’s some amount of negative predictions, but that is to be expected.

There appears to be something about fitting quantiles specifically that is causing this problem.

Fitting one quantile

What if we just fit the median? This shouldn’t change anything

hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_one_q_forecasts <- forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, quantile_levels = c(0.5), pop_scaling = FALSE))
default_geos <- c("ca", "fl", "ny", "pa", "tx")
all_one_q_forecasts %>% ggplot(aes(x = target_end_date, y = value)) +
  geom_line() +
  facet_wrap(~geo_value, scale = "free")

And indeed it does not, most forecasts are still negative.

Stripping away to bare rq

In the interest of simplifying as much as possible, lets just fit using rq directly. The problem is most acute in the furthest ahead, so lets use ahead = 28.

rec <- epi_recipe(hhs_forecast) %>%
  step_population_scaling(
    hhs,
    df = epidatasets::state_census,
    df_pop_col = "pop",
    create_new = FALSE,
    rate_rescaling = 1e5,
    by = c("geo_value" = "abbr")
  ) %>%
  step_adjust_latency(hhs, method = "extend_lags") %>%
  step_epi_lag(hhs, lag = c(0, 7, 14)) %>%
  step_epi_ahead(hhs, ahead = 28)

With the recipe in hand, we prepare the data, dealing with dropping NA values by hand since we’re mostly bypassing recipes.

fit_obj <- rec %>%
  prep(hhs_forecast) %>%
  bake(hhs_forecast) %>%
  drop_na(starts_with("lag"), starts_with("ahead")) %>%
  rq(ahead_28_hhs ~ lag_7_hhs + lag_14_hhs + lag_21_hhs, tau = 0.5, .)
pred_data <- rec %>%
  prep(hhs_forecast) %>%
  bake(hhs_forecast) %>%
  drop_na(starts_with("lag")) %>%
  filter(time_value == max(time_value))
predictions <- predict(fit_obj, pred_data)

And since we’re not using layers, we’ll have to undo population scaling

tibble(geo_value = pred_data$geo_value, value = predictions) %>%
  left_join(state_census, by = join_by(geo_value == abbr)) %>%
  mutate(value = value * pop / 1e5) %>%
  select(-fips, -name, -pop)

## # A tibble: 54 × 2
##   geo_value    value
##   <chr>        <dbl>
## 1 ak        0.413   
## 2 al        0.623   
## 3 ar        0.390   
## 4 as        0.000983
## 5 az        1.17    
## 6 ca        1.08    
## # ℹ 48 more rows

Presence/absence of an intercept

This is a bit more annoying to try to implement, since it seems that parsnip doesn’t support disabling the intercept outside the formula (so we’d have to do it in the recipe somehow).

Is it geo pooling?

TL;DR: No. The phenomena is still happening, at least for the default geos. Too slow to run regularly. Let’s see what happens if we restrict ourselves to training each geo separately.

hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
all_geos <- hhs_forecast %>%
  distinct(geo_value) %>%
  pull(geo_value)
hhs_forecast %>%
  filter(!is.na(hhs)) %>%
  group_by(geo_value) %>%
  summarize(n_points = n()) %>%
  arrange(n_points)
all_geos_forecasts <- map(all_geos, \(geo) forecast_aheads(\(x, ahead) scaled_pop(x, "hhs", ahead = ahead, pop_scaling = FALSE), epi_data = hhs_forecast %>% filter(geo_value == geo))) %>% list_rbind()
all_geos_forecasts %>% plot_dec_forecasts(default_geos)

geos_gr <- get_growth_rates(all_geos_forecasts, quantiles = 0.5, method = "smooth_spline")
geos_gr %>% arrange(desc(growth))

This is at least more of a mixed bag, with plenty of states with positive growth.

geos_gr %>% ggplot(aes(x = growth)) +
  geom_histogram(bins = 300)

But most have a negative growth.

How different is not geo pooling anyways?

Well it is at least different; how exactly is hard to parse:

all_geos_forecasts %>%
  left_join(all_forecasts, by = join_by(geo_value, forecast_date, target_end_date, quantile), suffix = c("_geo", "_joint")) %>%
  mutate(value = value_geo - value_joint) %>%
  select(-value_geo, -value_joint) %>%
  filter(geo_value %in% default_geos) %>%
  ggplot(aes(x = target_end_date, group = geo_value)) +
  geom_point(aes(y = value, color = quantile)) +
  facet_wrap(~geo_value, scale = "free")

Direct vs iterative forecasting

Iterative will be monotonic in the horizon. Direct isn’t guaranteed.

Somewhat hard to express, there’s also something about the increasing staleness of data that’s relevant. Actually implementing an experiment just means including the median from the previous forecast as a data point and forecasting one day further into the future.

Forecasting on value differences and iteratively summing

Does adding an I (integrated) component to our model improve things?

hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
# Get the latest values so we know where to sum from later
latest_values <- hhs_forecast %>%
  drop_na(hhs) %>%
  filter(geo_value %in% default_geos) %>%
  slice_max(time_value, by=geo_value) %>%
  select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- hhs_forecast %>%
  drop_na() %>%
  as_epi_df(as_of = (hhs_forecast %@% metadata)$as_of) %>%
  group_by(geo_value) %>%
  mutate(value_diff = hhs - lag(hhs)) %>%
  ungroup() %>%
  select(geo_value, time_value, value_diff)
diffed_data %>% autoplot(value_diff, .facet_by = "geo_value")

# Forecast the diffs and appean the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop(x, "value_diff", ahead = ahead, pop_scaling = FALSE), epi_data = diffed_data)
diffed_forecast %<>%
  select(geo_value, time_value = target_end_date, value, quantile) %>%
  bind_rows(tidyr::expand_grid(quantile = covidhub_probs(), latest_values))
# Sum the diffs to get the final forecast
forecast <-
  diffed_forecast %>%
  group_by(geo_value, quantile) %>%
  arrange(time_value) %>%
  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
  ungroup() %>%
  arrange(geo_value, quantile, time_value) %>%
  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)

This is much better than the original forecasts (we’re forecasting on the same data as in the “problem setup” section).

Let’s combine this with seasonal training data.

hhs_forecast <- hhs_archive %>% epix_as_of(forecast_date)
# Get the latest values so we know where to sum from later
latest_values <- hhs_forecast %>%
  filter(geo_value %in% default_geos) %>%
  slice_max(time_value, by=geo_value) %>%
  select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- hhs_forecast %>%
  group_by(geo_value) %>%
  mutate(value_diff = hhs - lag(hhs)) %>%
  ungroup() %>%
  select(geo_value, time_value, value_diff)
# Forecast the diffs and appean the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
  select(geo_value, time_value = target_end_date, value, quantile) %>%
  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
# Sum the diffs to get the final forecast
forecast <- diffed_forecast %>%
  group_by(geo_value, quantile) %>%
  arrange(time_value) %>%
  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
  ungroup() %>%
  arrange(geo_value, quantile, time_value) %>%
  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)

Turns out, we get better coverage this way.

Note that this is a very naive implementation for the quantiles: each quantile is computed independently, while a more correct approach would convolve the one-ahead quantile distribution of the diffs repeatedly with itself.

Now let’s try training on the augmented data (fluview and ILI).

joined_archive_data_as_of <- tar_read(joined_archive_data) %>% epix_as_of(forecast_date) %>% filter(agg_level == "state")
# Get the latest values so we know where to sum from later
latest_values <- joined_archive_data_as_of %>%
  filter(geo_value %in% default_geos, source == "nhsn") %>%
  slice_max(time_value, by=geo_value) %>%
  select(geo_value, time_value, value = hhs)
# Take data diffs
diffed_data <- joined_archive_data_as_of %>%
  arrange(source, geo_value, time_value) %>%
  group_by(geo_value, source) %>%
  mutate(value_diff = hhs - lag(hhs)) %>%
  ungroup() %>%
  select(source, geo_value, time_value, value_diff) %>%
  drop_na(value_diff) %>%
  as_epi_df(other_keys = "source", as_of = forecast_date)

# Forecast the diffs and appean the starting values from the original data
diffed_forecast <- forecast_aheads(\(x, ahead) scaled_pop_seasonal(x, "value_diff", ahead = ahead, pop_scaling = FALSE, nonlin_method = "none", filter_agg_level = "state", clip_lower = FALSE, nonneg = FALSE, seasonal_method = "window"), epi_data = diffed_data) %>%
  select(geo_value, time_value = target_end_date, value, quantile) %>%
  bind_rows(tidyr::expand_grid(quantile = diffed_forecast$quantile %>% unique, latest_values))
# Sum the diffs to get the final forecast
forecast <- diffed_forecast %>%
  group_by(geo_value, quantile) %>%
  arrange(time_value) %>%
  mutate(value = cumsum(value), forecast_date = .env$forecast_date) %>%
  ungroup() %>%
  arrange(geo_value, quantile, time_value) %>%
  select(geo_value, forecast_date, target_end_date = time_value, quantile, value)
forecast %>% plot_dec_forecasts(default_geos)

It doesn’t look great. The median is weirdly flat. Likely need to do something about whitening and then undo the whitening.

Does filtering by growth rate change it?

Roughly, the idea is to filter the data to that within a range of the growth rate of the most recent data point. To actually execute on this, we’ll use a step_filter at the end with a condition based on the growth rate at the first lag. This should only apply during training; during testing, we just want the latest data, regardless of the growth rate. First we’ll define the function that will filter based on the growth rate. ### Growth rate geo based quantile

filter_hhs <- function(geo_value, time_value, lagged, quantile_lower = 0.25, quantile_upper = 0.75, ...) {
  growth_rates <- tibble(geo_value, time_value, lagged) %>%
    group_by(geo_value) %>%
    drop_na(lagged) %>%
    mutate(gr = growth_rate(lagged, ...))
  gr_res <- growth_rates %>% pull(gr)
  quantile_bounds <- gr_res %>% quantile(c(quantile_lower, quantile_upper), na.rm = TRUE)
  (quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])
}

This computes the growth rate for lagged with whatever extra parameters are handed in, and returns a boolean vector saying if it’s within the quantile range (say 25% to 75%)[^1].

Since we want this to be after everything else we compute, we can just add it after all the other steps in the recipe.

get_filtered_workflow <- function(ahead, in_data = hhs_forecast, trainer = quantile_reg(quantile_levels = covidhub_probs()), ...) {
  preproc <- linear_get_preproc(ahead, in_data = in_data) %>%
  step_filter(filter_hhs(geo_value, time_value, lag_7_hhs, ...), skip = TRUE)
  postproc <- frosting() %>%
    layer_predict() %>%
    layer_quantile_distn(quantile_levels = covidhub_probs()) %>%
    layer_point_from_distn() %>%
    layer_threshold() %>%
    layer_naomit() %>%
    layer_add_target_date() %>%
    layer_add_forecast_date()
  workflow <- epi_workflow(preproc, trainer) %>%
    fit(hhs_forecast) %>%
    add_frosting(postproc)
  workflow
}

hhs_forecast <- hhs_archive %>%
  epix_as_of(forecast_date) %>%
  filter(time_value > train_value_min)

And fitting:

fit_given_growth_rate_params <- function(...) {
  fit_workflows <- map(0:4 * 7, \(ahead) get_filtered_workflow(ahead, ...))
  predictions <- map(
    fit_workflows,
    \(wf) predict(wf, hhs_forecast %>% filter(time_value >as.Date("2023-11-29")- 7*28)) %>% pivot_quantiles_longer(.pred_distn) %>% rename(value = .pred_distn_value, quantile = .pred_distn_quantile_level) %>% filter(time_value == forecast_date)
  )
  predictions %>% bind_rows() %>% rename(target_end_date = target_date) %>% select(-.pred, -time_value) %>% arrange(geo_value, target_end_date, quantile)
  plot_dec_forecasts(predictions %>% bind_rows() %>% rename(target_end_date = target_date), default_geos)
}
fit_given_growth_rate_params(method = "smooth_spline")

using the "smooth_spline" method didn’t seem to help. let’s try "rel_change", with a window size of 5 weeks:

fit_given_growth_rate_params(h = 7 * 5)

if anything that is worse. And linear regression, also using 5 weeks:

fit_given_growth_rate_params(method = "linear_reg", h = 7 * 5)

So this doesn’t seem to be helping. Let’s take a closer look at the growth rates that we’re using to do this filtering.

growth_rates <- hhs_forecast %>%
  drop_na() %>%
  as_epi_df() %>%
  group_by(geo_value) %>%
  mutate(gr = growth_rate(hhs, method = "smooth_spline"))
quantile_bounds <-
  growth_rates %>%
  pull(gr) %>%
  quantile(c(0.25, 0.75), na.rm = TRUE)
growth_rates %>% ggplot(aes( x = gr)) + geom_histogram(bins = 300) + geom_vline(aes(xintercept = quantile_bounds[[1]])) + scale_x_log10()

# (quantile_bounds[[1]] < gr_res) & (gr_res < quantile_bounds[[2]])

Decreasing Forecasters

Delphi Forecast Team

Rendered: 2025-05-20 17:10:26