setwd("~/Github/bigdata/predictions/")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Data from CDC trend page.
cases = read_csv("cdc_cases.csv",skip=2)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## State = col_character(),
## Date = col_character(),
## `New Cases` = col_double(),
## `7-Day Moving Avg` = col_double()
## )
The most recent data is from Thursday, May 6th.
I’m going to start with a very basic model. Let’s call it the log diff-in-diff model. Basically: this Sunday / last Sunday = this Thursday / last Thursday. I know three of those four quantities, so I can solve for this Sunday.
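In other words: predicted Sunday = last Sunday * (this Thursday / last Thursday). A toy sketch with made-up numbers (not the CDC data), just to pin down the arithmetic:
last_sunday   = 20000 # made-up
this_thursday = 22000 # made-up
last_thursday = 25000 # made-up
last_sunday * (this_thursday/last_thursday) # predicted Sunday = 17600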
length(unique(cases$State))
## [1] 1
# only US. Drop the col.
cases = cases %>% select(-State)
# Check how the rolling avg behaves at the back end (the earliest dates)
tail(cases)
## # A tibble: 6 x 3
## Date `New Cases` `7-Day Moving Avg`
## <chr> <dbl> <dbl>
## 1 Jan 27 2020 1 7
## 2 Jan 26 2020 3 7
## 3 Jan 25 2020 3 7
## 4 Jan 24 2020 1 6
## 5 Jan 23 2020 1 6
## 6 Jan 22 2020 46 6
#Interesting.
# Need to make Date more usable.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
cases = cases %>%
mutate(Date=mdy(Date)) %>% #Convert to Date from Chr
mutate(dow = wday(Date)) #Day of week.
#Fix names
cases = cases %>% rename(cases = `New Cases`, avg = `7-Day Moving Avg`,date=Date)
small = cases %>%
filter(dow == 5 | dow == 1) #Grab thurs and Sun
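Those codes lean on lubridate’s default week start (Sunday = 1), which makes Thursday 5. A quick check on two known dates if you want to convince yourself:
wday(ymd("2021-05-02")) # a Sunday -> 1
wday(ymd("2021-05-06")) # a Thursday -> 5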
#Building pred vars -- one-week old cases and avgs
small = small %>%
mutate(week_old_cases = lead(cases,n=2),
week_old_avg = lead(avg,n=2))
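Because the CDC file runs newest-first and only Sundays and Thursdays survive the filter, lead(n=2) lands on the same weekday one week earlier. A quick spot check of that alignment, worth running:
small %>%
  select(date, dow, cases, week_old_cases) %>%
  head(4) # each week_old_cases should match cases two rows further down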
# Basic model input
small = small %>%
mutate(ratio.cases = cases/week_old_cases,
ratio.avg = avg/week_old_avg)
#Compare the ratios
ggplot(small,aes(x=ratio.cases,y=ratio.avg))+geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).
Extreme outliers (ratios > 20) – probably from very early in the pandemic.
small %>% filter(ratio.cases > 20)
## # A tibble: 3 x 8
## date cases avg dow week_old_cases week_old_avg ratio.cases ratio.avg
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-03-01 24 10 1 1 3 24 3.33
## 2 2020-02-23 1 3 1 0 1 Inf 3
## 3 2020-02-13 2 1 5 0 2 Inf 0.5
# Yup. Those are all from very early in the pandemic (before April 2020). Drop everything before 2020-04-30.
small = small %>% filter(date > ymd("2020-04-29"))
#plot again
ggplot(small,aes(x=ratio.cases,y=ratio.avg,col=date))+
geom_point() +
geom_smooth(method="lm",col=3)+
geom_abline(slope=1,intercept=0,col=1,lty=2)
## `geom_smooth()` using formula 'y ~ x'
#Really expect it to lie on that 45 degree line.
#Not a huge problem.
#But probably a good reason to think about the avg
# not the daily. Will be comparing shortly.
small = small %>%
mutate(pred.avg = ratio.avg*lead(cases,n=1),
pred.cases = ratio.cases*lead(cases,n=1))
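On a Thursday row, lead(cases, n=1) is the most recent Sunday’s count, so the pred columns there are forecasts for the upcoming Sunday -- that’s the pairing I need to line up below. A quick look to see which rows carry what:
small %>%
  select(date, dow, cases, ratio.cases, pred.cases) %>%
  head(3) # Thursday rows pair this week's ratio with last Sunday's count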
truth = small %>%
filter(dow == 1) %>%
select(cases) %>%
as.matrix()
preds = small %>%
filter(dow == 5) %>%
select(pred.avg,pred.cases) %>%
as.matrix()
#Differing numbers of rows. Need to make sure I line up the dates correctly.
tail(small)
## # A tibble: 6 x 10
## date cases avg dow week_old_cases week_old_avg ratio.cases ratio.avg
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-05-17 13366 22392 1 23871 24566 0.560 0.912
## 2 2020-05-14 27125 23471 5 30752 26319 0.882 0.892
## 3 2020-05-10 23871 24566 1 29499 27771 0.809 0.885
## 4 2020-05-07 30752 26319 5 31694 28259 0.970 0.931
## 5 2020-05-03 29499 27771 1 27950 30196 1.06 0.920
## 6 2020-04-30 31694 28259 5 37314 29142 0.849 0.970
## # … with 2 more variables: pred.avg <dbl>, pred.cases <dbl>
tail(small %>% select(date,dow,cases,pred.avg))
## # A tibble: 6 x 4
## date dow cases pred.avg
## <date> <dbl> <dbl> <dbl>
## 1 2020-05-17 1 13366 24725.
## 2 2020-05-14 5 27125 21288.
## 3 2020-05-10 1 23871 27203.
## 4 2020-05-07 5 30752 27474.
## 5 2020-05-03 1 29499 29149.
## 6 2020-04-30 5 31694 NA
#Looks like I drop first and last in preds and last in truth.
preds = preds[-c(1,nrow(preds)),]
truth = truth[-length(truth)]
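Quick sanity check that the trimmed objects line up one-to-one (row i of preds should be the forecast for the Sunday in row i of truth):
nrow(preds) == length(truth) # should be TRUE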
truth2 = cbind(truth,truth) #duplicate truth to match the two prediction columns
errs = preds/truth2-1 #multiplicative errors: pred/truth = 1 is a perfect forecast
colMeans(errs^2)
## pred.avg pred.cases
## 0.03127217 0.02723981
#looks like last day outperforms avg.
#closer to 'trend' i guess?
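For a sense of scale, the square roots of those mean squared errors put the typical multiplicative miss at roughly 18% for the avg-based ratio and 16-17% for the cases-based one:
sqrt(colMeans(errs^2)) # roughly 0.177 and 0.165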
dates = small %>%
filter(dow==1) %>%
select(date) %>%
as.matrix()
dates = dates[-nrow(dates),]
dates = ymd(dates)
df = data.frame(preds = preds[,2],
truth=truth,
errs=errs[,2],dates=dates)
ggplot(df,aes(x=log(preds),y=log(truth),col=errs))+
geom_point()+
geom_abline(slope=1,intercept=0,col=2)+
geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
#Is there any patterning over time?
ggplot(df,aes(x=log(preds),y=log(truth),col=dates))+
geom_point()
#Maybe percentage errors are getting smaller?
ggplot(df,aes(x=dates,y=abs(errs)))+ #absolute error (note: errs^2^0.5 parses as errs^sqrt(2) and drops negatives)
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#They got smaller... but like... growing again?
A couple of further notes. First, the central 95% of the errors over the most recent 41 weeks:
quantile(errs[1:41,2],c(0.025,0.975))
## 2.5% 97.5%
## -0.1847040 0.1710401
But my ballpark first forecast is…
small$pred.cases[1]
## [1] 28173.96
# (drumroll): 28173.96
My price-is-right forecast sits at the 20th percentile of those errors:
quantile(errs[1:41,2],c(0.2))
## 20%
## -0.08484543
#8.5% less than that, for:
(1+quantile(errs[1:41,2],0.2))*small$pred.cases[1]
## 20%
## 25783.53
# 25783.53
And the implied chance of coming in below 25k?
25000/28173-1
## [1] -0.1126256
# 25k is 11.3% below the prediction. How often do the errors come in that far below?
mean(errs[1:41,2] < -0.1126)
## [1] 0.09756098
Just shy of 10% of the time (9.76%, to be precise). This is a dumb model, and there is a high chance I screwed something up, so I’ll call it a 1/6 chance (16.7%).
This is exactly the difference between in-model uncertainty and my overall uncertainty once I account for the model itself possibly being wrong. The model says “less than a 10% chance”. I say “but there’s also an 8% chance you’re garbage, so let’s call it 16% overall.”
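One way to make that blend explicit is a simple mixture. The 15% “model is garbage” prior and the 50/50 fallback below are illustrative placeholders, not numbers from the analysis above:
p_in_model = 0.0976 # the model's own estimate from the error history
p_garbage  = 0.15   # illustrative prior that the model/setup is just wrong
p_fallback = 0.5    # if it is wrong, treat below-25k as a coin flip
(1 - p_garbage)*p_in_model + p_garbage*p_fallback # roughly 0.16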