#R Code Behind DataColada[21]
#Written by Uri Simonsohn, based loosely on code included in original whistleblower report (especially the section computing dF)
#May, 2014
#-----> PLEASE EMAIL URI SIMONSOHN UWS@WHARTON.UPENN.EDU IF YOU SEE AN ERROR
#########################################################################
#Overview
#1.Drawing data
# The code starts entering in the means, SDs and sample sizes for the 12 studies in the paper (see sources below)
# It then creates a loop that iterates 100,000 times, each loop simulates an entire paper of 12 studies
# For each study I run a second loop that iterates 12 times, one per study.
# For each study three samples are drawn with the size of the original, n=15 or n=20 per cell.
# The low and high conditions are drawn from populations with the means & SD for the corresponding cells
# After drawing I round and bound the variable so that it has the properties of the reported data (1-7 or 0-9 integers)
# For the medium/control condition I draw from a population with mean equal to the midpoint between High and Low
# This draws under the null such that Medium=mean(High,Low), the most conservative null. I do use the SD reported for that cell
#
#2, Computations before running analyses
# For each simulated study I compute the following:
# i) sample means of the three conditions: Mh, Mm, Ml (High, medium, low) respectively.
# ii) Run regression of d.v. on a variable coded -1,0,1, get the F-test for that model (linear trend ) : Fr
# iii) Run Omnibus test of d.v. on two dummy variables for the low and high get the F omnibus F(2,df) : Fc
# iv) Compare both F-tests, this comparison is itself distributed F and is the key test in the original report : dF
# "deltaF"
# v) Compute how far from the the midpoint of H vs L did the control mean land: : MRatio
#
#3. Analyzing each simulated paper of 12 studies
#3.1 Similarity of F-tests
# The original paper assesses the statistical significance of the predicted effects via an F(2,df2) omnibus test.
# That corresponds to Fc in section 2. I compute the simple standard deviation of the 12 Fc in each of 100k paper loops.
# sd(F)
#
#3.2 Excessive linearity
#
#3.2.1 dF
# The original analysis by the anonymous whistleblower computed the dF (comparing Fc with Fr) statistic based
# on reported results and relied on the F() distribution to assign a probability to it.
# The 12 different probabilities were then aggregated using Fisher's method.
# Here, to avoid relying on normality assumptions, for each simulation I computed the "average" dF across
# the 12 simulations and then compared the observed average dF in the paper to that set of 100,000,
#
# In the blog I ended up not reporting this, and reporting only the ratio of means so this is not discussed below.
# I wanted to assess robustness of conclusions to non-normality and they were completely robust, so not needed.
#
#3.2.2 MRatio
# In the blogpost I describe this more intuitive measure of linearity.
# I compute the average MRatio (defined above) across the 12 simulated studies for each simulated paper.
# My approach bypasses the need to use Fisher's method (which assumes a continuous distribution) and the F-test, which
# assumes normality. The downside is that I am bounded by how unlikely something may be estimated to be by the # of
# simulations I run. I observe the pattern 0/100,000 times, but cannot tell if it is 1 in 5 million or 1 in 100,000 million.
#
#4 Final analysis
# At the end of about 2 hours of running (R is incredibly bad for loops!)
# We have 100,000 rows of results, one per simulated study
# Each row has:
# i) an index of similarity of Omnibus F tests, sdF (sdF=0 implies all results are identical across studies)
# ii) index1 of similarity of data to linear pattern, meanF (meanF=0 is perfect linearity within study)
# iii) index2 of similarity of data to linear pattern, MRatio (Mratio=0 is perfect linearity within study)
#
############################################################################################################
# PARAMETERS FOR THE SIMULATIONS
#Enter POPULATION means and SD
#Source: Table 3 of whistleblower report (see http://retractionwatch.files.wordpress.com/2014/04/report_foerster.pdf)
M.low  <- c(2.47, 2.51, 2.40, 2.41, 2.14, 3.19, 2.63, 2.87, 2.35, 2.55, 2.66, 2.42)
SD.low <- c(1.21, 0.71, 0.86, 1.07, 1.20, 1.07, 1.49, 1.24, 1.01, 1.16, 1.21, 0.82)
M.high  <- c(3.68, 3.35, 3.45, 3.64, 3.41, 4.79, 4.73, 4.79, 4.76, 4.78, 4.81, 5.02)
SD.high <- c(0.68, 0.64, 0.80, 0.95, 0.71, 0.82, 1.55, 1.53, 1.71, 1.47, 1.54, 1.45)
# Medium-condition population mean is the midpoint of High and Low, so draws are
# made under the most conservative null (Medium = mean(High, Low))
M.med <- (M.high + M.low) / 2
# For the medium-condition SD, use the values reported in the paper
SD.med <- c(0.72, 0.49, 0.51, 0.51, 0.78, 1.21, 1.21, 1.09, 1.19, 1.00, 1.30, 1.28)
# Per-cell sample size of each of the 12 original studies
n <- c(20, 20, 20, 20, 20, 20, 20, 20, 20, 15, 20, 15)
# Bounds of the d.v.: data are drawn from the normal, then bounded and rounded
# to match the reported scales. Note: this introduces minor discrepancies between
# the sample Ms/SDs and the population parameters above.
# Studies 1-5 use a 0-9 Likert scale; studies 6-12 average four 1-7 Likert scales.
bound.lower <- c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1)
bound.upper <- c(9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7)
#note: in the paper they appear to report averages of these variables, which means I am simulating data that is further from the normality assumption than needed.
# PREDICTORS
# Dummy codes for an F(2, df) omnibus test like the one reported in the paper.
# Not the most compact coding, but very intuitive. Observation order within a
# study is always: low condition, then medium, then high.
# n = 15 per cell
dummy.low15  <- rep(c(1, 0, 0), each = 15)  # first 15 obs are the low condition
dummy.high15 <- rep(c(0, 0, 1), each = 15)  # last 15 obs are the high condition
# n = 20 per cell
dummy.low20  <- rep(c(1, 0, 0), each = 20)  # first 20 obs are the low condition
dummy.high20 <- rep(c(0, 0, 1), each = 20)  # last 20 obs are the high condition
# Linear contrast coding (low = -1, medium = 0, high = 1)
regr.15 <- rep(c(-1, 0, 1), each = 15)
regr.20 <- rep(c(-1, 0, 1), each = 20)
############################################################################################################
# Accumulators for the simulated-paper results (one entry per paper unless noted)
meanF  <- c()  # average (across 12 studies) dF statistic for linear-vs-contrast comparison
minF   <- c()  # lowest omnibus F(2, df) in the paper, to later subset all-p<.05 papers
sdF    <- c()  # SD of the 12 omnibus F(2, df) statistics (similarity of key tests)
MRatio <- c()  # average % difference between the high/low midpoint and the control mean
alldF  <- c()  # every dF statistic (12 per simulation) [used to assess the report's p-values]
allM   <- c()  # every condition mean (36 per simulation)
time0 <- Sys.time()  # start time, used to report progress
set.seed(1)          # fixed seed so results are reproducible
simtot <- 100000     # number of simulated papers
#Start the 100k loops
# MAIN LOOP: each iteration simulates one complete 12-study paper.
# Result vectors are preallocated here: growing them with c() on every one of
# the 100k iterations copies the whole vector each time (quadratic cost) and is
# a large part of why the original run took ~2 hours.
meanF  <- numeric(simtot)
minF   <- numeric(simtot)
sdF    <- numeric(simtot)
MRatio <- numeric(simtot)
alldF  <- numeric(simtot * 12)  # 12 dF statistics per simulated paper
allM   <- numeric(simtot * 36)  # 36 means per paper (12 high, then 12 medium, then 12 low)
for (simk in 1:simtot)
{
  # Per-study storage for this simulated paper (all 12 slots written below)
  dF   <- numeric(12)  # F comparing linear vs. contrast model, per study
  Fobs <- numeric(12)  # omnibus F(2, df) statistic, per study
  Mh   <- numeric(12)  # sample means of the high, medium, low conditions
  Mm   <- numeric(12)
  Ml   <- numeric(12)
  Mr   <- numeric(12)  # |midpoint(high, low) - medium| / (high - low)
  for (k in 1:12)
  {
    # GENERATE THE D.V.
    # Draw normal data with the population parameters for this study
    # (draw order high, low, medium is kept so the RNG stream matches earlier runs)
    y.high <- rnorm(n[k], M.high[k], SD.high[k])
    y.low  <- rnorm(n[k], M.low[k],  SD.low[k])
    y.med  <- rnorm(n[k], M.med[k],  SD.med[k])
    # Bound the draws to the scale of the original study...
    y.high <- pmax(pmin(y.high, bound.upper[k]), bound.lower[k])
    y.med  <- pmax(pmin(y.med,  bound.upper[k]), bound.lower[k])
    y.low  <- pmax(pmin(y.low,  bound.upper[k]), bound.lower[k])
    # ...and round to integers, as in the reported data
    y.high <- round(y.high, 0)
    y.med  <- round(y.med, 0)
    y.low  <- round(y.low, 0)
    # Stack the d.v. for the three conditions (low, medium, high)
    y <- c(y.low, y.med, y.high)
    # RUN THE ANOVAS with the predictors matching this study's cell size
    # (if/else instead of two independent ifs, so a bad n[k] cannot silently
    # reuse the previous study's models)
    if (n[k] == 20) {
      Fc <- lm(y ~ dummy.low20 + dummy.high20)  # ANOVA with contrast coding (non linear)
      Fr <- lm(y ~ regr.20)                     # ANOVA with linear trend
    } else {
      Fc <- lm(y ~ dummy.low15 + dummy.high15)  # ANOVA with contrast coding (non linear)
      Fr <- lm(y ~ regr.15)                     # ANOVA with linear trend
    }
    # LINEARITY OF EFFECTS WITHIN STUDY
    # dF statistic comparing the linear with the contrast-coded model
    dF[k] <- anova(Fr, Fc, test = "F")[2, 5]
    # anova() reports NA when the two models fit identically (F = 0); fix that
    if (is.na(dF[k])) dF[k] <- 0
    # Condition means, rounded to 2 decimals as in the reported data
    Mh[k] <- round(mean(y.high), 2)  # HIGH MEAN
    Mm[k] <- round(mean(y.med),  2)  # CONTROL MEAN
    Ml[k] <- round(mean(y.low),  2)  # LOW MEAN
    # Linearity measure based on how far the control lands from the high/low midpoint
    Mr[k] <- abs(((Mh[k] + Ml[k]) / 2 - Mm[k]) / (Mh[k] - Ml[k]))
    # SIMILARITY OF KEY RESULT ACROSS STUDIES: the reported omnibus F(2, df)
    Fobs[k] <- summary(Fc)$fstatistic[1]
  }
  # All 12 studies done: paper-level summaries, stored by index (no copying)
  meanF[simk]  <- mean(dF)   # average dF (ended up not used in the blog post)
  MRatio[simk] <- mean(Mr)   # average distance between control and midpoint
  sdF[simk]    <- sd(Fobs)   # similarity of the key F(2, df) tests
  minF[simk]   <- min(Fobs)  # smallest omnibus F, used later to keep all-p<.05 papers
  alldF[(simk - 1) * 12 + 1:12] <- dF
  allM[(simk - 1) * 36 + 1:36]  <- c(Mh, Mm, Ml)
  # Every 100 simulations output the time elapsed so far
  if (simk %% 100 ==0) cat("\n",simk, "simulations so far. Time since I started: ",Sys.time()-time0)
}
#############################################################################################
### ANALYSIS
#############################################################################################
# 1 VERIFY THE SIMULATIONS RECOVERED THE INTENDED MEANS
# One row per simulated paper; 36 columns (12 high means, then 12 medium, then 12 low)
allMeans <- matrix(allM, nrow = simtot, ncol = 36, byrow = TRUE)
# Compare to the parameters entered at the top, e.g. the 1st column mean should be ~3.68
m <- matrix(colMeans(allMeans))
# 2 VERIFY THE VALIDITY OF THE P-VALUES IN THE WHISTLEBLOWER'S CALCULATIONS
# [not discussed in the blog post]
# One row per simulated paper; column j holds the dF statistic of Study j,
# e.g. column 1 has the 100,000 simulated dF values for Study 1.
alldFs <- matrix(alldF, nrow = simtot, ncol = 12, byrow = TRUE)
# Observed dF for each study (Table 4 of the Report) and the p-values reported there.
# If relying on the F distribution is valid, the share of simulated dF values
# exceeding each observed dF should match the reported p(dF); e.g. for Study 1
# the F distribution says 88.792% of dF values should be >= .02004.
dF.obs <- c(.02004, .01387, .01525, .03510, .03173, .00382,
            .01209, .00000, .05897, .01457, .01127, .00037)
p.obs  <- c(.88792, .90667, .90216, .85205, .85925, .95094,
            .91283, 1, .80901, .9045, .91581, .98483)  # last column in Table 4
# Empirical upper-tail probability of each study's observed dF
# (replaces 12 copy-pasted statements with one vectorized computation)
p <- vapply(1:12, function(j) 1 - sum(alldFs[, j] <= dF.obs[j]) / simtot, numeric(1))
# On average, the p-value computed using the F distribution is off by ~1.5 percentage points
mean(abs(p.obs - p))
# Aggregate the empirical p-values using Fisher's method.
# NOTE(review): the LOWER chi-square tail is used deliberately here — excessive
# linearity shows up as p-values clustered near 1, i.e. -2*sum(log(p)) too SMALL.
fisher <- pchisq(sum(-2 * log(p)), df = 24)
1 / fisher  # 1 in ~89.9 million (vs. 1 in 158 million in the Report; still trivially small)
# Note: Fisher's method also assumes a continuous distribution, so this is more of a
# ball-park calculation than an actual probability estimate
##############################
#3 MIDPOINT ANALYSIS
##############################
#Recall: Mratio is the average distance between the Control condition and the midpoint between the high and the low
# FIGURE 1 IN COLADA[21] - ALL SIMULATIONS
# Open the output file
png(filename = "Colada21Fig1.png", width = 1400, height = 1200, res = 150)
# Histogram of the midpoint measure across all 100k simulated papers
hist(MRatio, breaks = 500,
     main = "How Similar is Control Condition to Midpoint of High/Low in the 100k Simulations?",
     xlab = "% Difference between Control and Midpoint High/Low",
     ylab = "# of simulations", col = "orange", xlim = c(0, .35))
# Arrow marking the value observed in the suspicious paper (MRatio = .0232)
arrows(x0 = .0232, x1 = .0232, y0 = 1500, y1 = 3)
# Label over the arrow (FIX: label previously read ".2.3%", a typo for "2.3%")
text(x = .0232, y = 2000, "Suspicious paper\n2.3%")
# Close the figure file (save it)
dev.off()
# No simulation is as small as the reported result
sum(MRatio < .0232)
# Smallest simulated result is about twice as big
min(MRatio)
# ONLY SIMULATIONS WITH ALL STUDIES P<.05
# Subset of papers in which every omnibus F exceeds 4
# (NOTE(review): F > 4 is a stricter cutoff than the p<.05 critical value for
# these dfs, so the subset is conservative — confirm against the blog post)
MRatio.sig <- subset(MRatio, minF > 4)
# Open the output file
png(filename = "Colada21Fig1b.png", width = 1400, height = 1200, res = 150)
hist(MRatio.sig, breaks = 250,
     main = "How Similar is Control Condition to Midpoint of High/Low\nSUBSET OF 45% OF SIMULATIONS WITH ALL STUDIES P<.05",
     xlab = "% Difference between Control and Midpoint High/Low",
     ylab = "# of simulations", col = "orange", xlim = c(0, .35))
arrows(x0 = .0232, x1 = .0232, y0 = 750, y1 = 3)
# Same label fix as above (".2.3%" -> "2.3%")
text(x = .0232, y = 1000, "Suspicious paper\n2.3%")
dev.off()  # close the figure file (i.e. save it)
# FIGURE 3 - HISTOGRAM OF THE SIMILARITY OF F-VALUES
# All 100k simulations
png(filename = "Colada21Fig3.png", width = 1400, height = 1200, res = 150)
hist(sdF, breaks = 200,
     main = "HOW SIMILAR ARE THE F-VALUES ACROSS THE 12 STUDIES IN THE 100K SIMULATIONS?",
     xlab = "SD(F): Standard Deviation of 12 F-values",
     ylab = "# of simulations", col = "orange", xlim = c(0, 15))
# Arrow and label marking the suspicious paper's SD(F) of .866
arrows(x0 = .866, x1 = .866, y0 = 600, y1 = 3)
text(x = .866, y = 750, "Suspicious paper\nSD(F)=.866")
dev.off()  # close the figure file (save it)
# Same histogram, only papers in which all 12 omnibus tests are significant
sdF.sig <- subset(sdF, minF > 4)
png(filename = "Colada21Fig3B.png", width = 1400, height = 1200, res = 150)
hist(sdF.sig, breaks = 100,
     main = "HOW SIMILAR ARE THE F-VALUES ACROSS THE 12 STUDIES \nIN 45% OF SIMULATIONS WITH ALL RESULTS P<.05?",
     xlab = "SD(F): Standard Deviation of 12 F-values",
     ylab = "# of simulations", col = "orange", xlim = c(0, 15))
arrows(x0 = .866, x1 = .866, y0 = 600, y1 = 3)
text(x = .866, y = 750, "Suspicious paper\nSD(F)=.866")
dev.off()  # close the figure file (save it)
# CORRELATIONS BETWEEN SD(F) AND THE MIDPOINT METRIC
# Drop papers whose MRatio is infinite/undefined before correlating;
# which() keeps exactly the rows subset() would (NA conditions excluded)
keep <- which(MRatio < 10)
MRatio.1 <- MRatio[keep]
sdF.1 <- sdF[keep]
# All simulations
cor(MRatio.1, sdF.1, method = "spearman")
# p<.05 simulations only
cor(MRatio.sig, sdF.sig, method = "spearman")