setwd("~/Github/bigdata/predictions/")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#Data from CDC trend page.
cases = read_csv("cdc_cases.csv",skip=2)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   Date = col_character(),
##   `New Cases` = col_double(),
##   `7-Day Moving Avg` = col_double()
## )

Most recent data is from Thurs May 6th.

I’m going to start with a very basic model. Lets call it the log diff-in-diff model. Basically, this sunday/last sunday = this thursday/last thursay. I know the three inputs.

  • Basic intuition: This naturally builds in a Sunday FE. It uses 1 week of changes in most recent data to predict.
  • Natural Complications:
    • Use percent change in 7day moving AVG for forecast

1 Cleaning

length(unique(cases$State))
## [1] 1
# only US. Drop the col.
cases = cases %>% select(-State)
# Check backend rolling avg
tail(cases)
## # A tibble: 6 x 3
##   Date        `New Cases` `7-Day Moving Avg`
##   <chr>             <dbl>              <dbl>
## 1 Jan 27 2020           1                  7
## 2 Jan 26 2020           3                  7
## 3 Jan 25 2020           3                  7
## 4 Jan 24 2020           1                  6
## 5 Jan 23 2020           1                  6
## 6 Jan 22 2020          46                  6
#Interesting.

# Need to make Date more usable.
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
cases = cases %>% 
  mutate(Date=mdy(Date)) %>% #Convert to Date from Chr
  mutate(dow = wday(Date)) #Day of week.
#Fix names
cases = cases %>% rename(cases = `New Cases`, avg = `7-Day Moving Avg`,date=Date)

small = cases  %>% 
  filter(dow == 5 | dow == 1) #Grab thurs and Sun

#Building pred vars -- one-week old cases and avgs
small = small %>% 
  mutate(week_old_cases = lead(cases,n=2),
         week_old_avg   = lead(avg,n=2))

# Basic model input
small = small %>%
  mutate(ratio.cases = cases/week_old_cases,
         ratio.avg   = avg/week_old_avg)
#Compare the ratios
ggplot(small,aes(x=ratio.cases,y=ratio.avg))+geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).

Extreme outliers (ratios > 20) – probably very early in pandemic.

small %>% filter(ratio.cases > 20)
## # A tibble: 3 x 8
##   date       cases   avg   dow week_old_cases week_old_avg ratio.cases ratio.avg
##   <date>     <dbl> <dbl> <dbl>          <dbl>        <dbl>       <dbl>     <dbl>
## 1 2020-03-01    24    10     1              1            3          24      3.33
## 2 2020-02-23     1     3     1              0            1         Inf      3   
## 3 2020-02-13     2     1     5              0            2         Inf      0.5
# Yup. Those are like.. Before April 2020. Going to just drop dates before May 2020.
small = small %>% filter(date > ymd("2020-04-29"))
#plot again
ggplot(small,aes(x=ratio.cases,y=ratio.avg,col=date))+
  geom_point() +
  geom_smooth(method="lm",col=3)+
  geom_abline(slope=1,intercept=0,col=1,lty=2)
## `geom_smooth()` using formula 'y ~ x'

#Really expect it to lie on that 45 degree line.
#Not a huge problem. 
#But probably a good reason to think about the avg
# not the daily. Will be comparing shortly.

2 Modelling

small = small %>%
  mutate(pred.avg = ratio.avg*lead(cases,n=1),
         pred.cases = ratio.cases*lead(cases,n=1))

truth = small %>% 
  filter(dow == 1) %>% 
  select(cases) %>% 
  as.matrix()
preds = small %>% 
  filter(dow == 5) %>%
  select(pred.avg,pred.cases) %>%
  as.matrix()

#Differing number of dimensions. Need to... make sure I line up dates correctly.
tail(small)
## # A tibble: 6 x 10
##   date       cases   avg   dow week_old_cases week_old_avg ratio.cases ratio.avg
##   <date>     <dbl> <dbl> <dbl>          <dbl>        <dbl>       <dbl>     <dbl>
## 1 2020-05-17 13366 22392     1          23871        24566       0.560     0.912
## 2 2020-05-14 27125 23471     5          30752        26319       0.882     0.892
## 3 2020-05-10 23871 24566     1          29499        27771       0.809     0.885
## 4 2020-05-07 30752 26319     5          31694        28259       0.970     0.931
## 5 2020-05-03 29499 27771     1          27950        30196       1.06      0.920
## 6 2020-04-30 31694 28259     5          37314        29142       0.849     0.970
## # … with 2 more variables: pred.avg <dbl>, pred.cases <dbl>
tail(small %>% select(date,dow,cases,pred.avg))
## # A tibble: 6 x 4
##   date         dow cases pred.avg
##   <date>     <dbl> <dbl>    <dbl>
## 1 2020-05-17     1 13366   24725.
## 2 2020-05-14     5 27125   21288.
## 3 2020-05-10     1 23871   27203.
## 4 2020-05-07     5 30752   27474.
## 5 2020-05-03     1 29499   29149.
## 6 2020-04-30     5 31694      NA
#Looks like I drop first and last in preds and last in truth.

preds = preds[-c(1,nrow(preds)),]
truth = truth[-length(truth)]

truth2 = cbind(truth,truth)
errs = preds/truth-1 #multiplicative model.mean=1.
colMeans(errs^2)
##   pred.avg pred.cases 
## 0.03127217 0.02723981
#looks like last day outperforms avg.
#closer to 'trend' i guess?

dates = small %>% 
  filter(dow==1) %>% 
  select(date) %>%
  as.matrix()
dates = dates[-nrow(dates),]
dates = ymd(dates)

df = data.frame(preds = preds[,2],
                truth=truth,
                errs=errs[,2],dates=dates)
ggplot(df,aes(x=log(preds),y=log(truth),col=errs))+
  geom_point()+
  geom_abline(slope=1,intercept=0,col=2)+
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

#Is there any patterning over time?
ggplot(df,aes(x=log(preds),y=log(truth),col=dates))+
  geom_point()

#Maybe percentage errors are getting smaller?
ggplot(df,aes(x=dates,y=errs^2^0.5))+
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
## Warning: Removed 25 rows containing missing values (geom_point).

#They got smaller... but like... growing again?

Notice:

  • I came to the data with two full models.
  • No coefficients to estimate. Nothing.
  • It was “ratiolast sunday" and "alt ratiolast sunday”
  • That means every one of these predictions is out of sample.
  • And wasn’t used for fitting the model.
    • (ish – i’ve… obviously seen the gist of this data before)

Couple of further notes.

  • Looks like In August there was some kind of fundamental shift. should probably subset data to the post-august period.
  • I should also in principle consider models that are midpoints between the two here.
    • Like.. Instead of All weight on Thurs ratio, or 1/7 weight on Thurs and 1/7 on last friday,
    • Should eyeball some other weights.
  • Right now my mean absolute percentage errors are approximately 12%. That leaves… a lot of room to improve
  • My root mean squared percent errors are closer to 17%.
  • And my 95% CI is [-30%,+40%] which is… huge.
quantile(errs[1:41,2],c(0.025,0.975))
##       2.5%      97.5% 
## -0.1847040  0.1710401
  • Dropping pre-august (obs 42:52) makes it more like plus-minus 18%.
  • my rmspe = 10%, and my mape = 9%. Much more reasonable.
  • Little risky though – unclear if that variation has gone away over time, or went away as cases rose. If it is as cases rose, we should expect it back as they fall.

3 Base Forecast

But my ballpark first forecast is…..

small$pred.cases[1]
## [1] 28173.96
# (drumroll):  28173.96

My price-is-right forecast is at the 20th percentile

quantile(errs[1:41,2],c(0.2))
##         20% 
## -0.08484543
#8.5% less than that, for:
(1+quantile(errs[1:41,2],0.2))*small$pred.cases[1]
##      20% 
## 25783.53
# 25783.53

And the implied chance of coming in below 25k?

25000/28173-1
## [1] -0.1126256
# 11.3% below prediction. which occurs
mean(errs[1:41,2] < -0.1126)
## [1] 0.09756098

Just shy of 10% of the time. (9.75% exactly) This is a dumb model, and there is a high chance I screwed something up. So I’ll call it a 1/6 chance (16.66%)

This is the exact difference between in-model uncertainty, and my uncertainty after considering model uncertainty. The model says “less than a 10% chance”. I say “but theres also an 8% chance you’re garbage, so lets call it 16% overall”