* THIS PROGRAM BOOTSTRAPS WITHOUT REPLACEMENT FROM SMEESTERS ET AL JESP 2011.
* INTUITION;
* Intuitively, here is what is happening:
* Each observation from 6 HIGH conditions is made into a card and put into a single HIGH deck.
* Each observation from 6 LOW conditions is made into a card and put into a single LOW deck.
* In each bootstrap (of 100,000) the decks are fully dealt into samples of the original sizes. Each card is dealt once, into a single condition.
* This means the set of cards is kept constant, and the only source of sampling error is which of
* the six high conditions gets which high card, and the same for the lows.
*/ OPERATIONALIZATION OF CARD DEALING
1) The text file contains one row per observation: participant id, condition number (1-12) and score (this is what the INPUT statement reads).
Each observation is the score of one participant.
2) That text file is copied vertically 100,000 times.
Because loops are slow in SAS I do that in batches.
I use the set command and basically input the data 10 times. And then that dataset is copied 10 times (so in 20 steps I get 100 loops),
and then again (so in 30 steps I get 1000 loops), and so on until we get to 100,000
3) I then add counters of observation and replication.
i: identifies the observation number within a replication (1 is the first observation, 2 the second, and so on up to 169. Most conditions have n=14, one n=13, one n=16).
replicate: is the number of replication (1-100,000)
4) I then keep only the replicate and cond numbers in a separate file.
The idea is to then shuffle those, patch it back to the data, and we have rescrambled all the 100,000 dealt cards.
*/ OTHER CALCULATIONS ARE ANALOGOUS TO THE BOOTSTRAPPING SAS CODE;
* 1) Read the raw data - each record supplies a participant id, a condition number and a score.
* File provided by Dirk Smeesters;
data raw;
    infile "C:\uri\research\Fake data\Smeesters\Red Blue White\envelopes_data.txt" DSD delimiter='09'x;
    input id cond score;
    * high is 1 for conditions predicted to score high and 0 for the low conditions 1, 4, 6, 8, 9 and 11;
    high = not (cond in (1, 4, 6, 8, 9, 11));
run;
/* Verify means: summarise score per condition and list the condition
   means from lowest to highest. RAW is left sorted by cond. */
proc sort data=raw;
    by cond;
run;

proc means data=raw noprint;
    by cond;
    var score;
    output out=verify;
run;

* Order the summary rows by value so the printed means come out ascending;
proc sort data=verify;
    by score;
run;

proc print data=verify;
    where _STAT_ = "MEAN";
run;
/* 2) Stack the raw data on top of itself 100,000 times.
   Every DATA step below concatenates ten copies of its input, so five
   consecutive passes yield 10 x 10 x 10 x 10 x 10 = 100,000 copies,
   each copy keeping the row order of RAW. */

* Pass 1 - 10 copies of RAW;
data sim;
    set raw raw raw raw raw
        raw raw raw raw raw;
run;

* Pass 2 - 100 copies;
data sim;
    set sim sim sim sim sim
        sim sim sim sim sim;
run;

* Pass 3 - 1,000 copies;
data sim;
    set sim sim sim sim sim
        sim sim sim sim sim;
run;

* Pass 4 - 10,000 copies;
data sim;
    set sim sim sim sim sim
        sim sim sim sim sim;
run;

* Pass 5 - 100,000 copies;
data sim;
    set sim sim sim sim sim
        sim sim sim sim sim;
run;
/* 3) Add counters of observation and replication.
   i         = position of the row inside its copy of RAW (1 .. deck_size)
   replicate = which stacked copy of RAW the row belongs to (1, 2, 3, ...)
   deck_size is the number of rows in RAW, fetched through NOBS= instead of
   being hard-coded as 169, so the counters stay correct if the input file
   ever changes. With the 169-row file the values are the same as before. */
data sim;
    * This SET never executes - it is here only so that NOBS= delivers the row count of RAW;
    if 0 then set raw nobs=deck_size;
    set sim;
    i = mod(_N_ - 1, deck_size) + 1;
    replicate = int((_N_ - 1) / deck_size) + 1;
run;
/* 4) Deal the cards.
   The (replicate, cond) labels are saved in their sorted order in COND.
   The scores are then put in random order inside every replicate-by-high
   deck, and the two files are glued back together row by row, so each
   score lands in a random condition of its own (high or low) deck. */

* Fix the order of the condition slots: low deck first, then high deck, conditions ascending;
proc sort data=sim;
    by replicate high cond;
run;

* One pass over SIM: save the slots in COND and give every card a uniform random draw u.
* The seed 5001 has no special meaning, it is just a number;
data cond (keep=replicate cond)
     sim;
    set sim;
    u = ranuni(5001);
run;

* Shuffle within each deck by sorting on u and let go of the old condition number,
* because the condition now comes from COND;
proc sort data=sim out=sim (drop=cond);
    by replicate high u;
run;

* V holds all 100k simulations in their new random order, ready for the calculations.
* The merge has no BY statement on purpose - rows are paired by position;
data v;
    merge sim cond;
run;
/* ************************
   STREAKS
   ************************
   For every condition within every replicate, find how often the most
   frequent score occurs (the streak), then add the twelve streaks up
   into streak_sum (called F in the paper). */

* 1) Count each score per condition per replication. V is left sorted by replicate cond;
proc sort data=v;
    by replicate cond;
run;

proc freq data=v noprint;
    by replicate cond;
    tables score / out=f;
run;

* 2) Longest streak per condition per replication = the largest of those counts;
proc means data=f noprint;
    by replicate cond;
    var count;
    output out=mstreaks (drop=_TYPE_ _FREQ_) max(count)=streak;
run;

* One row per replicate with the twelve streaks side by side as cond1-cond12;
proc transpose data=mstreaks out=streaksum prefix=cond;
    by replicate;
    id cond;
    var streak;
run;

* 3) Sum the streaks to create the streak index, and also keep the single largest streak;
data streaksum;
    set streaksum;
    streak_sum = cond1 + cond2 + cond3 + cond4  + cond5  + cond6
               + cond7 + cond8 + cond9 + cond10 + cond11 + cond12;
    streak_max = max(of cond1-cond12);
run;

* Tabulate F (streak_sum) over the 100k simulations;
proc freq data=streaksum;
    tables streak_sum;
run;
/* ************************
   EXCESSIVE SIMILARITY
   ************************
   For every replicate, compute psi: the standard deviation of the six
   condition means divided by their average standard error, taken
   separately for the low and the high conditions and then averaged.

   Fix in this version: se_high used to be computed from sdp_low, which
   left sdp_high unused. It is now computed from sdp_high.

   V must already be sorted by replicate cond (the STREAKS section sorts it). */

* 1) Summary statistics of score for every condition within every replicate;
proc means data=v noprint;
    by replicate cond;
    var score;
    output out=m;
run;

proc print data=m (obs=8);
run;

* Split the summary rows: the means go to MAV (as av) and the standard deviations to MSD (as sd);
data mav (rename=(score=av))
     msd (rename=(score=sd));
    set m;
    drop _TYPE_ _FREQ_ _STAT_;
    if _STAT_ = "MEAN" then output mav;
    else if _STAT_ = "STD" then output msd;
run;

* Transpose to one row per replicate: av1-av12 and sd1-sd12;
proc transpose data=mav out=tmav prefix=av;
    by replicate;
    id cond;
    var av;
run;

proc transpose data=msd out=tmsd prefix=sd;
    by replicate;
    id cond;
    var sd;
run;

* Compute psi;
data m3;
    * Both inputs hold one row per replicate in ascending order, so matching on replicate
    * pairs the same rows as a positional merge would;
    merge tmav tmsd;
    by replicate;

    /* Pooled SD within the low and within the high conditions.
       NOTE(review): the low weights add up to 82 while the divisor is
       80 (= 86 - 6). The high weights add up to 77 = 83 - 6 as expected.
       Check which of conditions 4 and 8 really has n=16 - left unchanged here. */
    sdp_low  = ((13*(sd1**2)+15*(sd4**2)+13*(sd6**2)+15*(sd8**2) +13*(sd9**2) +13*(sd11**2))/80)**.5;
    sdp_high = ((13*(sd2**2)+13*(sd3**2)+13*(sd5**2)+12*(sd7**2)+13*(sd10**2)+13*(sd12**2))/77)**.5;

    * Average SE based on the pooled SD and the average cell size (86/6 low, 83/6 high);
    se_low  = sdp_low  / sqrt(86/6);
    se_high = sdp_high / sqrt(83/6);

    * SD of the six condition means;
    sdM_low  = std(av1, av4, av6, av8, av9,  av11);
    sdM_high = std(av2, av3, av5, av7, av10, av12);

    * SD of the means relative to the average SE;
    psi_low  = sdM_low  / se_low;
    psi_high = sdM_high / se_high;

    * psi is the average of the low and high ratios, rounded to 4 decimals. psi01 is the 2-decimal version;
    psi   = round(mean(psi_low, psi_high), .0001);
    psi01 = round(psi, .01);
run;

proc print data=m3 (obs=1);
run;

* pvalue is a macro defined outside this section - presumably it compares the simulated psi
* with the value .3081 used as the cut-off further down. Confirm against its definition;
%pvalue(m3,psi,.3081);
/* ************************
   MERGE BOTH
   ************************
   Put psi and streak_sum side by side per replicate and count how often
   a replicate is at least as extreme on both (streak_sum <= 33 and psi <= .3081).

   Fixes in this version:
   - the psi filter of the streak_sum table used .3085 while every other
     place uses .3081, so it now uses .3081
   - the last frequency table asked for a variable named sdm, which is not
     in SE_STREAK (it only holds replicate, psi, streak_sum and hit), so it
     now tabulates psi
   - every procedure names its input data set instead of relying on _LAST_ */
data se;
    set m3;
    keep replicate psi;
run;

data streaks;
    set streaksum;
    keep replicate streak_sum;
run;

* data_merge is a macro defined outside this section - presumably it merges SE and STREAKS
* by replicate into SE_STREAK. Confirm against its definition;
%data_merge(se,streaks,se_streak,replicate);

* hit = 1 when the replicate is at least as extreme as the cut-offs on both measures;
data se_streak;
    set se_streak;
    if streak_sum<=33 & psi<=.3081 then hit=1; else hit=0;
run;

proc freq data=se_streak;
    tables hit;
run;

* Distribution of streak_sum among replicates with psi at or below the cut-off;
proc freq data=se_streak;
    tables streak_sum;
    where psi<=.3081;
run;

* Distribution of psi among replicates with streak_sum at or below the cut-off;
proc freq data=se_streak;
    tables psi;
    where streak_sum<=33;
run;

proc corr data=se_streak;
    var psi streak_sum;
run;