# Simulation Params
K = 20
n = 200
q = 0.2
X = matrix(rnorm(n*K),n,K)

# Scenario 1
# 20 signals and 80 noise coefficients
n_1 = 5

# Scenario 2
# 100 noise coefficients
# n_1 = 0

# Generate Data
beta = c(rep(3,n_1),rep(0,K-n_1))
Y = X%*%beta+rnorm(n,0,1)

# Find pvals
pvals = numeric(K)
for(i in (1:K)){
	model = lm(Y~X[,i])
	pvals[i] = summary(model)$coefficients[2,4]
}

#FDR fn
fdr_cut = function(pvals,q) {
  pvals = pvals[!is.na(pvals)]
  K = length(pvals)
  k = rank(pvals,ties="min")
  alpha = max(pvals[pvals <= q*k/K ])
  alpha
}
#Find cutoff / run fn
cutoff = fdr_cut(pvals, q)
cutoff


# Plot
pvals_ordered = pvals[order(pvals,decreasing=F)]
sig.col = (pvals_ordered <= cutoff)+1 
setwd("~/Github/bigdata/lectures/l2/")
png("BH.png")
plot(pvals_ordered,pch=19,col=sig.col,xlab="k")
title(main="BH - p=20, q=0.2")
abline(0,1/K,col="lightgrey",lty=2,lwd=2)
abline(0,q/K,col=3,lwd=2)
dev.off()

if (F) {
# Examine the actual outcomes. 
table(pvals<=cutoff) # number of discoveries and non-discoveries

(1:K)[pvals<=cutoff]

tab = table(beta,pvals<=cutoff)

tab

# what is FDP (false discovery proportion)

tab[1,2]/(sum(tab[,2]))
}