# Simulation Params K = 20 n = 200 q = 0.2 X = matrix(rnorm(n*K),n,K) # Scenario 1 # 20 signals and 80 noise coefficients n_1 = 5 # Scenario 2 # 100 noise coefficients # n_1 = 0 # Generate Data beta = c(rep(3,n_1),rep(0,K-n_1)) Y = X%*%beta+rnorm(n,0,1) # Find pvals pvals = numeric(K) for(i in (1:K)){ model = lm(Y~X[,i]) pvals[i] = summary(model)$coefficients[2,4] } #FDR fn fdr_cut = function(pvals,q) { pvals = pvals[!is.na(pvals)] K = length(pvals) k = rank(pvals,ties="min") alpha = max(pvals[pvals <= q*k/K ]) alpha } #Find cutoff / run fn cutoff = fdr_cut(pvals, q) cutoff # Plot pvals_ordered = pvals[order(pvals,decreasing=F)] sig.col = (pvals_ordered <= cutoff)+1 setwd("~/Github/bigdata/lectures/l2/") png("BH.png") plot(pvals_ordered,pch=19,col=sig.col,xlab="k") title(main="BH - p=20, q=0.2") abline(0,1/K,col="lightgrey",lty=2,lwd=2) abline(0,q/K,col=3,lwd=2) dev.off() if (F) { # Examine the actual outcomes. table(pvals<=cutoff) # number of discoveries and non-discoveries (1:K)[pvals<=cutoff] tab = table(beta,pvals<=cutoff) tab # what is FDP (false discovery proportion) tab[1,2]/(sum(tab[,2])) }