# illustration of central limit theorem using a lognormal random distribution # To execute this script in R: # 1) open R and set the working directory to the folder that contains this file. # 2) at the command prompt, enter source("cltlognorm.R") # The script will plot the log-normal distribution and then use 3 plots to examine the distribution of sample means: # - the expected normal distribution drawn on top of a histogram of the sample means # - a quantile-quantile QQ plot of the sample means: the points should fall along a straight lines if the means are distributed normally # - a boxplot of the sample means: should be symmetrical and about 5% of the points should be "outliers" # Note: This example is very different from the one in which scores were drawn from a uniform distribution. Here, when we draw scores from a log-normal distribution, the means do not closely follow a normal distribution even with a sample size n=30: the sampling distibution is skewed, so that there are too many large sample means. With a log-normal distribution, you need a sample size of approximately 120 before the means are distributed normally. # library(car) # need this package to call n.bins # library(MASS) N <- 5000; # number of samples drawn from population # sample size: minsamplesize<-15; maxsamplesize<-30; m<-2; # distribution "location" parameter sd<-2; # distribution standard deviation mu<-exp(log(m)+0.5*log(sd)*log(sd)) # mu is the expected value of the lognormal distribution defined by m & sd # print mean & sd to make sure that they are correct... tmp<-rlnorm(10000,meanlog=log(m),sdlog=log(sd)) print(paste("expected mean = ",mu,"; obtained mean = ",mean(tmp))) print(paste("expected sd = ",sd,"; obtained sd = ",sd(tmp))) graphics.off() # close all graphics devices # plot the distribution... if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create window x<-seq(0.1,10,len=500) # sequence of 500 x values ranging from 0.1 to 10 y<-dlnorm(x,meanlog=log(m),sdlog=log(sd)) # calculate the probability density at those x values plot(x,y,main="log normal density") # scatter plot lines(x,y) # add lines # now plot distribution of sample means: if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) par(mfrow=c(1,3)) # tell R to divide window into 1x3 plots for (k in minsamplesize:maxsamplesize) { sampsize<-k; # current sample size thenumbers<-rlnorm(N* sampsize,meanlog=log(m),sdlog=log(sd)); # draw random numbers from population thedata<-matrix(thenumbers,N,sampsize) # reformat numbers into N samples of k values sample.means<-rowMeans(thedata); # clac the mean of each row hist(sample.means,breaks="FD",ylim=c(0,1.5), main=paste("sample size = ",k),prob=TRUE,col="lemonchiffon") # draw histogram of means # truehist(sample.means,ylim=c(0,1),col="lemonchiffon") # draw histogram of means # now we will draw the normal distribution of means # in other words, we will draw what we expect to get if the original numbers are # drawn from a normal distribution pu <-par("usr")[1:2] # fancy-pants R language for getting the range of histogram's x-axis x <- seq(pu[1],pu[2],len=500) # make a sequence of x values y <- dnorm(x,mean=mu,sd=(sd/sqrt(sampsize)) ) lines(x,y,col="red") qqnorm(sample.means,main="QQ Plot (should be straight line)") qqline(sample.means) # finally, draw a boxplot to look for asymmetry and outliers boxplot(sample.means,main="should be symmetrical") Sys.sleep(1) # now calculate t-distribution when sample size = k sample.sd<-apply(thedata,MARGIN=1,sd) # calculate the sd for each row/sample sample.se<-sample.sd/sqrt(sampsize) # the STANDARD ERROR is sd/sqrt(sample size) # the t-value for each sample is defined as the sample mean minus the population mean (i.e., mu) divided by the standard error sample.t<-(sample.means-mu)/sample.se type1.error.rate=0.05 critical.t<-qt(type1.error.rate,df=(sampsize-1)); # critical value of t for a 1-tailed test that the sample mean is less than mu; # how many of our samples actually exceeded the critical t value? num.type1.errors<-length(sample.t[sample.t<=critical.t]) p.type1.error<-num.type1.errors/length(sample.means) print(" ") print("---------- TYPE I ERROR RATE ----------------") print(paste("sample size = ", sampsize)) print(paste("expected type I error rate = ",type1.error.rate,"; observed type I error rate = ",p.type1.error)) }