#Random samples do not affect type-1 error rates
#
#Written by Uri Simonsohn (uws@wharton.upenn.edu)
#This version: 2016 08 20
#
###############################################################
#Notes:
#These are modified simulations of those originally written by Jake Westfall
#http://web.archive.org/save/http://jakewestfall.org/misc/sampling_plan_sim.html
#
#see also twitter discussion
#The original simulations purpotedly show an inflated type-1 error rate if
#researchers choose a random sample size without looking at the results. In fact the rate is 5%
#The error in the simulation is that they used the average sample size to compute the
#p-value of each study, instead of the actual sample size.
#
#To avoid problem these simulations keep track of p-values rather than t-stats in every study.
#As expected due to law of iterated expectation: prob(p<.05|null & random n)=5%
#
#Law of iterated expectations applies because for any given n prob(p<.05|null)=5% and researchers are setting sample blind to results, so r(n,p)=0
#
###########################################################################
N <- 30 #Average full study sample size used by Jake Westfall
simtot=100000 #Number of simulations to run
#Function created by Westfall, keeps track of results from set of studies
# running a two-sample t-test with a random sample size.
random_n <- function(mean){
n <- round(runif(1, min=4, max=2*mean-4))
#Jake kept track of t-value, but this requires keeping track of the d.f. of each study
#t.test(rnorm(floor(n/2)), rnorm(ceiling(n/2)), var.equal=TRUE)$statistic
#Uri keeps track of p-value, so that d.f. are already considered.
t.test(rnorm(floor(n/2)), rnorm(ceiling(n/2)), var.equal=TRUE)$p.value
}
#Same as above with a fixed sample size
fixed_n <- function(n){
#t.test(rnorm(floor(n/2)), rnorm(ceiling(n/2)), var.equal=TRUE)$statistic
t.test(rnorm(floor(n/2)), rnorm(ceiling(n/2)), var.equal=TRUE)$p.value
}
#Run simtot simulations where the average sample size is N set above
random_dist <- replicate(simtot, random_n(N))
fixed_dist <- replicate(simtot, fixed_n(N))
#The p-value is uniform for both
hist(random_dist,main='uniform p-values with random sample')
hist(fixed_dist,main='uniform p-values with fixed sample')
#5% are p<.05 with both
sum(random_n<.05)/simtot
sum(fixed_n<.05)/simtot