setwd("~/Github/bigdata/predictions/")
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Data from CDC trend page.
cases = read_csv("cdc_cases.csv",skip=2)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   State = col_character(),
##   Date = col_character(),
##   `New Cases` = col_double(),
##   `7-Day Moving Avg` = col_double()
## )

Most recent data is from Thurs May 6th.

I’m going to start with a very basic model. Lets call it the log diff-in-diff model. Basically, this sunday/last sunday = this thursday/last thursay. I know the three inputs.

Basic intuition: This naturally builds in a Sunday FE. It uses 1 week of changes in most recent data to predict.
Natural Complications:
- Use percent change in 7day moving AVG for forecast

1 Cleaning

length(unique(cases$State))

## [1] 1

# only US. Drop the col.
cases = cases %>% select(-State)
# Check backend rolling avg
tail(cases)

## # A tibble: 6 x 3
##   Date        `New Cases` `7-Day Moving Avg`
##   <chr>             <dbl>              <dbl>
## 1 Jan 27 2020           1                  7
## 2 Jan 26 2020           3                  7
## 3 Jan 25 2020           3                  7
## 4 Jan 24 2020           1                  6
## 5 Jan 23 2020           1                  6
## 6 Jan 22 2020          46                  6

#Interesting.

# Need to make Date more usable.
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

cases = cases %>% 
  mutate(Date=mdy(Date)) %>% #Convert to Date from Chr
  mutate(dow = wday(Date)) #Day of week.
#Fix names
cases = cases %>% rename(cases = `New Cases`, avg = `7-Day Moving Avg`,date=Date)

small = cases  %>% 
  filter(dow == 5 | dow == 1) #Grab thurs and Sun

#Building pred vars -- one-week old cases and avgs
small = small %>% 
  mutate(week_old_cases = lead(cases,n=2),
         week_old_avg   = lead(avg,n=2))

# Basic model input
small = small %>%
  mutate(ratio.cases = cases/week_old_cases,
         ratio.avg   = avg/week_old_avg)
#Compare the ratios
ggplot(small,aes(x=ratio.cases,y=ratio.avg))+geom_point()

## Warning: Removed 2 rows containing missing values (geom_point).

Extreme outliers (ratios > 20) – probably very early in pandemic.

small %>% filter(ratio.cases > 20)

## # A tibble: 3 x 8
##   date       cases   avg   dow week_old_cases week_old_avg ratio.cases ratio.avg
##   <date>     <dbl> <dbl> <dbl>          <dbl>        <dbl>       <dbl>     <dbl>
## 1 2020-03-01    24    10     1              1            3          24      3.33
## 2 2020-02-23     1     3     1              0            1         Inf      3   
## 3 2020-02-13     2     1     5              0            2         Inf      0.5

# Yup. Those are like.. Before April 2020. Going to just drop dates before May 2020.
small = small %>% filter(date > ymd("2020-04-29"))
#plot again
ggplot(small,aes(x=ratio.cases,y=ratio.avg,col=date))+
  geom_point() +
  geom_smooth(method="lm",col=3)+
  geom_abline(slope=1,intercept=0,col=1,lty=2)

## `geom_smooth()` using formula 'y ~ x'

#Really expect it to lie on that 45 degree line.
#Not a huge problem. 
#But probably a good reason to think about the avg
# not the daily. Will be comparing shortly.

2 Modelling

small = small %>%
  mutate(pred.avg = ratio.avg*lead(cases,n=1),
         pred.cases = ratio.cases*lead(cases,n=1))

truth = small %>% 
  filter(dow == 1) %>% 
  select(cases) %>% 
  as.matrix()
preds = small %>% 
  filter(dow == 5) %>%
  select(pred.avg,pred.cases) %>%
  as.matrix()

#Differing number of dimensions. Need to... make sure I line up dates correctly.
tail(small)

## # A tibble: 6 x 10
##   date       cases   avg   dow week_old_cases week_old_avg ratio.cases ratio.avg
##   <date>     <dbl> <dbl> <dbl>          <dbl>        <dbl>       <dbl>     <dbl>
## 1 2020-05-17 13366 22392     1          23871        24566       0.560     0.912
## 2 2020-05-14 27125 23471     5          30752        26319       0.882     0.892
## 3 2020-05-10 23871 24566     1          29499        27771       0.809     0.885
## 4 2020-05-07 30752 26319     5          31694        28259       0.970     0.931
## 5 2020-05-03 29499 27771     1          27950        30196       1.06      0.920
## 6 2020-04-30 31694 28259     5          37314        29142       0.849     0.970
## # … with 2 more variables: pred.avg <dbl>, pred.cases <dbl>

tail(small %>% select(date,dow,cases,pred.avg))

## # A tibble: 6 x 4
##   date         dow cases pred.avg
##   <date>     <dbl> <dbl>    <dbl>
## 1 2020-05-17     1 13366   24725.
## 2 2020-05-14     5 27125   21288.
## 3 2020-05-10     1 23871   27203.
## 4 2020-05-07     5 30752   27474.
## 5 2020-05-03     1 29499   29149.
## 6 2020-04-30     5 31694      NA

#Looks like I drop first and last in preds and last in truth.

preds = preds[-c(1,nrow(preds)),]
truth = truth[-length(truth)]

truth2 = cbind(truth,truth)
errs = preds/truth-1 #multiplicative model.mean=1.
colMeans(errs^2)

##   pred.avg pred.cases 
## 0.03127217 0.02723981

#looks like last day outperforms avg.
#closer to 'trend' i guess?

dates = small %>% 
  filter(dow==1) %>% 
  select(date) %>%
  as.matrix()
dates = dates[-nrow(dates),]
dates = ymd(dates)

df = data.frame(preds = preds[,2],
                truth=truth,
                errs=errs[,2],dates=dates)
ggplot(df,aes(x=log(preds),y=log(truth),col=errs))+
  geom_point()+
  geom_abline(slope=1,intercept=0,col=2)+
  geom_smooth(method="lm")

## `geom_smooth()` using formula 'y ~ x'

#Is there any patterning over time?
ggplot(df,aes(x=log(preds),y=log(truth),col=dates))+
  geom_point()

#Maybe percentage errors are getting smaller?
ggplot(df,aes(x=dates,y=errs^2^0.5))+
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 25 rows containing non-finite values (stat_smooth).

## Warning: Removed 25 rows containing missing values (geom_point).

#They got smaller... but like... growing again?

Notice:

I came to the data with two full models.
No coefficients to estimate. Nothing.
It was “ratiolast sunday" and "alt ratiolast sunday”
That means every one of these predictions is out of sample.
And wasn’t used for fitting the model.
- (ish – i’ve… obviously seen the gist of this data before)

Couple of further notes.

Looks like In August there was some kind of fundamental shift. should probably subset data to the post-august period.
I should also in principle consider models that are midpoints between the two here.
- Like.. Instead of All weight on Thurs ratio, or 1/7 weight on Thurs and 1/7 on last friday,
- Should eyeball some other weights.
Right now my mean absolute percentage errors are approximately 12%. That leaves… a lot of room to improve
My root mean squared percent errors are closer to 17%.
And my 95% CI is [-30%,+40%] which is… huge.

quantile(errs[1:41,2],c(0.025,0.975))

##       2.5%      97.5% 
## -0.1847040  0.1710401

Dropping pre-august (obs 42:52) makes it more like plus-minus 18%.
my rmspe = 10%, and my mape = 9%. Much more reasonable.
Little risky though – unclear if that variation has gone away over time, or went away as cases rose. If it is as cases rose, we should expect it back as they fall.

3 Base Forecast

But my ballpark first forecast is…..

small$pred.cases[1]

## [1] 28173.96

# (drumroll):  28173.96

My price-is-right forecast is at the 20th percentile

quantile(errs[1:41,2],c(0.2))

##         20% 
## -0.08484543

#8.5% less than that, for:
(1+quantile(errs[1:41,2],0.2))*small$pred.cases[1]

##      20% 
## 25783.53

# 25783.53

And the implied chance of coming in below 25k?

25000/28173-1

## [1] -0.1126256

# 11.3% below prediction. which occurs
mean(errs[1:41,2] < -0.1126)

## [1] 0.09756098

Just shy of 10% of the time. (9.75% exactly) This is a dumb model, and there is a high chance I screwed something up. So I’ll call it a 1/6 chance (16.66%)

This is the exact difference between in-model uncertainty, and my uncertainty after considering model uncertainty. The model says “less than a 10% chance”. I say “but theres also an 8% chance you’re garbage, so lets call it 16% overall”

Predictions Round 2

US Covid Cases on Sunday May 9th

May 7th, 2020

1 Cleaning

2 Modelling

3 Base Forecast