# illustration of central limit theorem using a lognormal random distribution

# To execute this script in R:
# 1) open R and set the working directory to the folder that contains this file.
# 2) at the command prompt, enter source("cltlognorm.R")
# The script will plot the log-normal distribution and then use 3 plots to examine the distribution of sample means:
# - the expected normal distribution drawn on top of a histogram of the sample means
# - a quantile-quantile QQ plot of the sample means: the points should fall along a straight lines if the means are distributed normally
# - a boxplot of the sample means: should be symmetrical and about 5% of the points should be "outliers"

# Note: This example is very different from the one in which scores were drawn from a uniform distribution. Here, when we draw scores from a log-normal distribution, the means do not closely follow a normal distribution even with a sample size n=30: the sampling distibution is skewed, so that there are too many large sample means. With a log-normal distribution, you need a sample size of approximately 120 before the means are distributed normally.


# library(car) # need this package to call n.bins
# library(MASS)

N <- 5000; # number of samples drawn from population
# sample size:
minsamplesize<-15;
maxsamplesize<-30;


m<-2; # distribution "location" parameter
sd<-2; # distribution standard deviation
mu<-exp(log(m)+0.5*log(sd)*log(sd)) # mu is the expected value of the lognormal distribution defined by m & sd

# print mean & sd to make sure that they are correct...
tmp<-rlnorm(10000,meanlog=log(m),sdlog=log(sd))
print(paste("expected mean = ",mu,"; obtained mean = ",mean(tmp)))
print(paste("expected sd = ",sd,"; obtained sd = ",sd(tmp))) 

graphics.off() # close all graphics devices


# plot the distribution...
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create window
x<-seq(0.1,10,len=500) # sequence of 500 x values ranging from 0.1 to 10
y<-dlnorm(x,meanlog=log(m),sdlog=log(sd)) # calculate the probability density at those x values
plot(x,y,main="log normal density") # scatter plot
lines(x,y) # add lines


# now plot distribution of sample means:
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8)
par(mfrow=c(1,3)) # tell R to divide window into 1x3 plots

for (k in minsamplesize:maxsamplesize)
{
	sampsize<-k; # current sample size
	thenumbers<-rlnorm(N* sampsize,meanlog=log(m),sdlog=log(sd)); # draw random numbers from population
	thedata<-matrix(thenumbers,N,sampsize) # reformat numbers into N samples of k values
	sample.means<-rowMeans(thedata); # clac the mean of each row
	hist(sample.means,breaks="FD",ylim=c(0,1.5),
		main=paste("sample size = ",k),prob=TRUE,col="lemonchiffon") # draw histogram of means
	# truehist(sample.means,ylim=c(0,1),col="lemonchiffon") # draw histogram of means

	# now we will draw the normal distribution of means
	# in other words, we will draw what we expect to get if the original numbers are
	# drawn from a normal distribution

	pu <-par("usr")[1:2] # fancy-pants R language for getting the range of histogram's x-axis
	x <- seq(pu[1],pu[2],len=500) # make a sequence of x values
	y <- dnorm(x,mean=mu,sd=(sd/sqrt(sampsize)) )
	lines(x,y,col="red")
	qqnorm(sample.means,main="QQ Plot (should be straight line)")
	qqline(sample.means)

        # finally, draw a boxplot to look for asymmetry and outliers
	boxplot(sample.means,main="should be symmetrical")

	Sys.sleep(1)


       	# now calculate t-distribution when sample size = k
	sample.sd<-apply(thedata,MARGIN=1,sd) # calculate the sd for each row/sample
	sample.se<-sample.sd/sqrt(sampsize) # the STANDARD ERROR is sd/sqrt(sample size)

	# the t-value for each sample is defined as the sample mean minus the population mean (i.e., mu) divided by the standard error
	sample.t<-(sample.means-mu)/sample.se

	type1.error.rate=0.05
	critical.t<-qt(type1.error.rate,df=(sampsize-1)); # critical value of t for a 1-tailed test that the sample mean is less than mu;

	# how many of our samples actually exceeded the critical t value?
	num.type1.errors<-length(sample.t[sample.t<=critical.t])
	p.type1.error<-num.type1.errors/length(sample.means)

	print(" ")
	print("---------- TYPE I ERROR RATE ----------------")
	print(paste("sample size = ", sampsize))
	print(paste("expected type I error rate = ",type1.error.rate,"; observed type I error rate = ",p.type1.error))

}