420 likes | 966 Views
Simulating Data for Basic Regression Models. Components of Linear Regression Models. Explanatory (independent) variables. Error term. Response (dependent) variable. Parameters. Generating the explanatory variables. Random draws for an observational study
E N D
Components of Linear Regression Models Explanatory (independent) variables Error term Response (dependent) variable Parameters
Generating the explanatory variables. Random draws for an observational study Fixed values for a designed experiment
Decisions to be made in any simulation Size of simulated sample N=75 How we will select the xi Random sample of 75 from the U(0,1) β0=1 and β1=-2 Population values of β0 and β1 The distribution of εi N(0,.5) How will we obtain estimates of β0 and β1 PROC REG
Generate a single sample using a simple linear model %letobs = 75; %let beta0=1; %let beta1=-2; %let RMSE=.5; %let seed=54321; data Reg1(keep=x y); callstreaminit(&seed); do i = 1to &obs; x = rand("Uniform"); y = &beta0 +(&beta1)*x + rand("Normal", 0, &rmse); output; end; run;
Check results with PROC REG procregdata=Reg1 outest=betas; model y = x; quit; procprintdata=betas;run;
Put explanatory variables into arrays %letobs = 75; /* size of each sample */ %let reps = 1000;/* number of samples */ %let beta0=1; %let beta1=-1; %letrmse=.5; data RegSim1(keep= rep x y); array xx{&obs} _temporary_; callstreaminit(1); do i = 1to &obs; xx{i} = rand("Uniform"); end; do rep = 1to &reps; do i = 1to &obs; x = xx{i}; y = &beta0 +(&beta1)*x + rand("Normal", 0, 0.5); output; end; end; run;
A different way %letobs = 75; %let reps = 1000; %let beta0=1; %let beta1=-2; %letrmse=.5; %let seed=54321; data RegSim2(keep= rep i x y); callstreaminit(&seed); do i = 1to &obs; x = rand("Uniform"); do rep = 1to &reps; y = &beta0+&beta1*x + rand("Normal", 0, &rmse); output; end; end; run;
procsortdata=RegSim2; by rep; run; procregdata=RegSim2 outest=betas NOPRINT; by rep; model y = x; quit; procmeansdata=betas; var intercept x; run; proccorrdata=betas noprob plots=scatter(alpha=.05.1.25.50); label x="Estimated Coefficient of x" Intercept="Estimated Intercept"; var Intercept x; odsexcludeVarInformation; run;
procregdata=Sashelp.Class; model Weight = Height; odsselectFitStatisticsParameterEstimates; quit;
Generating a single sample from the class data %let seed=54321; %let beta0=-143; %let beta1=3.9; %letrmse=11.23; dataStudentData(keep= Height Weight); callstreaminit(&seed); setSashelp.Class; /* implicit loop over observations */ Weight = &beta0 + &beta1*Height + rand("Normal", 0, &rmse); run; procregdata=studentdata; odsselectparameterestimatesfitstatistics; model weight=height; run;
Another way – add error to predicted value %letrmse=11.23; %let seed=54321; procregdata=Sashelp.Class; model Weight = Height; outputout=ClassPredp=Predicted; quit; data StudentData2(keep= Height Weight); callstreaminit(&seed); setClassPred; Weight = Predicted + rand("Normal", 0, &rmse); run;
Create macro variables with values of parameters procregdata=Sashelp.Classoutest=betas plots=none; model Weight = Height; outputout=ClassPredp=Pred; quit; procprintdata=betas;run; data_null_; set betas; callsymputx("rmse",_rmse_); callsymputx("beta0",intercept); callsymputx("beta1",height); run; %put _user_;
data StudentData2(keep= Height Weight pred); • callstreaminit(&seed); • setClassPred; • Weight = Pred + rand("Normal", 0, &rmse); • run; • procprintdata=studentdata2;run;
Using random access and no sort %let seed=54321; %let reps=1000; data StudentSim2; callstreaminit(&seed); do rep = 1to &reps; doi = 1toNObs; setSashelp.Classpoint=inobs=NObs; Weight = &beta0 + &beta1*Height + rand("Normal", 0, &rmse); output; end; end; STOP; run;
Get values and predicted again • procregdata=Sashelp.Classoutest=betas plots=none noprint; • model Weight = Height; • outputout=ClassPredp=Pred; • quit; • data_null_; • set betas; • callsymputx("rmse",_rmse_); • callsymputx("beta0",intercept); • callsymputx("beta1",height); • run;
Another version -- read data into data step array %let reps=1000; %let seed=54321; datastudentsim (drop=ht: i); callstreaminit(&seed); arrayht{19}; retainht:; doi=1to19; setsashelp.class; ht{i}=height; end; do rep = 1to &reps; doi=1to19; height=ht{i}; predicted = &beta0 + &beta1*height; Weight = predicted + rand("Normal", 0, &rmse); output; end; end; run;
procregdata=studentsimnoprintoutest=simbetas; by rep; model weight=height; run; procprintdata=simbetas (obs=21); run; procmeansdata=simbetas; var _RMSE_ Intercept Height; title"Simulated values were: rmse=&rmse beta0=&beta0 beta1=&beta1"; run; title;
Simulating the height values from summary statistics procmeansdata=sashelp.classmeanstd; var height; run;
data simstudent3; callstreaminit(&seed); do i = 1to19; Height = rand("Normal", 62.3, 5.13); /* Height is random normal */ Weight = &beta0 + &beta1*Height + rand("Normal", 0, &rmse); output; end; run; procprintdata=simstudent3;run;
A designed experiment with a categorical variable with six levels and 5 observations at each level
Simulate a single result. dataAnovaData(keep=Treatment Y); callstreaminit(1); grandmean = 20; array effect{6} _temporary_ (9 -6 -6400); arraystd{6} _temporary_ (624412); do treatment = 1to dim(effect);/* number of treatment levels*/ do j = 1to5; /* number of obs per treatment level */ Y = grandmean + effect{i} + rand("Normal", 0, std{i}); output; end; end; run;
Analyze single result. procANOVAdata=AnovaData; class Treatment; model Y = Treatment; odsexcludeClassLevelsNObs; quit;
A linear model with 2 classification and 4 continuous variables
%letnCont = 4;/* number of continuous vars*/ %letnClas = 2;/* number of categorical vars*/ %letnLevels = 3;/* number of levels for each class var */ %letobs = 100; %let seed=54321; dataGLMData(drop=i j); array x{&nCont} x1-x&nCont; array c{&nClas} c1-c&nClas; callstreaminit(&seed); do i = 1to &obs; do j = 1to &nCont;/* continuous vars for i_thobs */ x{j} = rand("Uniform"); /* uncorrelated uniform*/ end; do j = 1to &nClas;/* class vars for i_thobs*/ c{j} = ceil(&nLevels*rand("Uniform"));/* discrete uniform */ end; y = 2*x{1} - 3*x{&nCont} + c{1} + rand("Normal"); output; end; run;
To examine effect of departures. Usual Assumption -- The error term is normally distributed. Departure --The error term has a t distribution with 5 degrees of freedom Departure -- The error term is heteroscadastic with a normal distribution depending on x. The errors are assumed to be normal with standard deviation exp(.2x)
The test statement in PROC REG %letobs=25; %let seed=54321; data Regress(drop=i); callstreaminit(&seed); do i = 1to &obs; x = rand("Normal"); z = rand("Normal"); y = 1 + 1*x + 0*z + rand("Normal"); output; end; run;
procregdata=Regress; model y = x z; z0: test z=0; odsoutputTestANOVA=testresult; quit; datatestresult; settestresult; where source="Numerator"; run; procprintdata=testresult;run;
Simulation using Greene's scenario, generate ASD %letobs = 25; %let reps = 100; %let seed=54321; data Regress(drop= iyhatstdx); callstreaminit(&seed); do i = 1to &obs; x = rand("Normal"); z = rand("Normal"); stdx = exp(x/5); do betaz = 0to1by0.1; yhat = 1 + 1*x + betaz*z; /* linear predictor*/ do rep = 1to &reps; depart = 1; y = yhat + rand("Normal"); output; depart = 2; y = yhat + rand("T", 5); output; depart = 3; y = yhat + rand("Normal", 0, stdx); output; end; end; end; run;
Sort and do regression separately for each sample procsortdata=Regress out=RegSim; by depart betaz rep; run; %ODSOff procregdata=RegSim; by depart betaz rep; model y = x z; test z=0; odsoutputTestANOVA=TestAnova; quit; %ODSOn proc print data=testanova(obs=10);run;
Do some data management on results /* Construct an indicator variable for observations that reject H0 */ data Results; setTestANOVA(where=(Source="Numerator"));/*get rid of extraneous info*/ Reject = (ProbF <= 0.05); /* indicator variable */ run; /* count number of times H0 was rejected */ procfreqdata=Results noprint; by depart betaz; tables Reject / nocumout=Signif(where=(reject=1)); run; dataSignif; setSignif; proportion = percent / 100; /* convert percent to proportion */ run;
Graph Results procformat; value ErrType 1="e ~ N(0,1)"2="e ~ t(5)"3="e ~ Hetero"; run; title"Power of F Test for betaz=0"; title2"N = &obs"; procsgplotdata=Signif; format depart ErrType.; seriesx=betazy=proportion / group=depart; yaxismin=0max=1label="Power (1 - P[Type II Error])"grid; xaxislabel="betaz"grid; keylegend / across=1location=inside position=topleft; run;
Simulating Outliers %letobs = 100; /* size of each sample */ %let seed=54321; %let p = 0.1; /* prob of contamination */ %let beta0=1; %let beta1=-2; dataRegOutliers(keep=x y Contaminated); array xx{&obs} _temporary_; callstreaminit(1); do i = 1to &obs; xx{i} = rand("Uniform"); end; do i = 1to &obs; x = xx{i}; Contaminated = rand("Bernoulli",&p); if Contaminated then e = rand("Normal", 0, 10); else e = rand("Normal", 0, 1); y = &beta0+&beta1*x + e; output; end; run;
Examine Simulated Outlier Data procformat; value Contam 0="N(0, 1)"1="N(0, 10)"; run; procsgplotdata=RegOutliers(rename=(Contaminated=Distribution)); format Distribution Contam.; scatterx=x y=y / group=Distribution; lineparmx=0y=1slope=-2; run; odsselectparameterestimates; procregdata=regoutliers; model y=x; quit;
PROC ROBUSTREG to get robust estimators procregdata=regoutliers; model y=x; odsselectparameterestimates; run; procrobustregdata=RegOutliersmethod=lts FWLS; model y = x; odsselectLTSEstimatesParameterEstimatesF; run;