# illustration of central limit theorem using a UNIFORM random distribution # To execute this script in R: # 1) open R and set the working directory to the folder that contains this file. # 2) at the command prompt, enter source("cltunif.R") # The script will plot the log-normal distribution and then use 3 plots to examine the distribution of sample means: # - the expected normal distribution drawn on top of a histogram of the sample means # - a quantile-quantile QQ plot of the sample means: the points should fall along a straight lines if the means are distributed normally # - a boxplot of the sample means: should be symmetrical and about 5% of the points should be "outliers" # Note: Only a very small sample size is required for the sample means to follow a normal distribution. # library(car) # need this package to call n.bins # library(MASS) print("central limit theorem using uniform random distribution") N <- 5000; # number of samples drawn from the population # sample size: minsamplesize<-2; maxsamplesize<-20; m<-2; # distribution mean sd<-3; # distribution standard deviation mu<-m # usually, a uniform random distribution has lower and upper limits of zero & one, # a mean of 0.5, and a variance of 1/12... # to get the correct mean and sd, we need to adjust limits of uniform distribution d<-(sqrt(12)*sd)/2 rmin<-m-d # lower limit rmax<-m+d # upper limit # print mean & sd to make sure that they are correct... tmp<-runif(10000,rmin,rmax) print(paste("expected mean = ",m,"; obtained mean = ",mean(tmp))) print(paste("expected sd = ",sd,"; obtained sd = ",sd(tmp))) if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # plot the distribution... if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create window x<-seq((rmin-1),(rmax+1),len=500) # sequence of 500 x values ranging from rmin to rmax y<-dunif(x,rmin,rmax) # calculate the uniform density at those x values plot(x,y,main="uniform density") # scatter plot lines(x,y) # add lines # now plot distribution of sample means: if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create another window par(mfrow=c(1,3)) # tell R to divide window into 1x3 plots for (k in minsamplesize:maxsamplesize) { sampsize <- k; # current sample size thenumbers<-runif(N* sampsize,min=rmin,max=rmax); # draw random numbers from population thedata<-matrix(thenumbers,N,sampsize) # reformat numbers into N samples of k values sample.means<-rowMeans(thedata); # clac the mean of each row # now draw a histogram hist(sample.means,breaks="FD",ylim=c(0,1.0), main=paste("sample size = ",k),prob=TRUE,col="lemonchiffon") # draw histogram of means # truehist(sample.means,ylim=c(0,1),col="lemonchiffon") # draw histogram of means # to the histogram, add the normal distribution of means # in other words, we will draw what we expect to get if the original numbers are # drawn from a normal distribution pu <-par("usr")[1:2] # fancy-pants R language for getting the range of histogram's x-axis x <- seq(pu[1],pu[2],len=500) # make a sequence of x values y <- dnorm(x,mean=m,sd=(sd/sqrt(sampsize)) ) # calc normal density at those x values lines(x,y,col="red") # draw it on top of the histogram # next, draw a qqplot to visualize departures from normality qqnorm(sample.means,main="QQ Plot (should be straight line)") qqline(sample.means) # finally, draw a boxplot to look for asymmetry and outliers boxplot(sample.means,main="should be symmetrical") Sys.sleep(1) # now calculate t-distribution when sample size = k sample.sd<-apply(thedata,MARGIN=1,sd) # calculate the sd for each row/sample sample.se<-sample.sd/sqrt(sampsize) # the STANDARD ERROR is sd/sqrt(sample size) # the t-value for each sample is defined as the sample mean minus the population mean (i.e., mu) divided by the standard error sample.t<-(sample.means-mu)/sample.se type1.error.rate=0.05 critical.t<-qt(type1.error.rate,df=(sampsize-1)); # critical value of t for a 1-tailed test that the sample mean is less than mu; # how many of our samples actually exceeded the critical t value? num.type1.errors<-length(sample.t[sample.t<=critical.t]) p.type1.error<-num.type1.errors/length(sample.means) print(" ") print("---------- TYPE I ERROR RATE ----------------") print(paste("sample size = ", sampsize)) print(paste("expected type I error rate = ",type1.error.rate,"; observed type I error rate = ",p.type1.error)) }