# illustration of central limit theorem using a UNIFORM random distribution

# To execute this script in R:
# 1) open R and set the working directory to the folder that contains this file.
# 2) at the command prompt, enter source("cltunif.R")
# The script will plot the log-normal distribution and then use 3 plots to examine the distribution of sample means:
# - the expected normal distribution drawn on top of a histogram of the sample means
# - a quantile-quantile QQ plot of the sample means: the points should fall along a straight lines if the means are distributed normally
# - a boxplot of the sample means: should be symmetrical and about 5% of the points should be "outliers"

# Note: Only a very small sample size is required for the sample means to follow a normal distribution.


# library(car) # need this package to call n.bins
# library(MASS)
print("central limit theorem using uniform random distribution")

N <- 5000; # number of samples drawn from the population
# sample size:
minsamplesize<-2;
maxsamplesize<-20;


m<-2; # distribution mean
sd<-3; # distribution standard deviation
mu<-m

# usually, a uniform random distribution has lower and upper limits of zero & one,
# a mean of 0.5, and a variance of 1/12...
# to get the correct mean and sd, we need to adjust limits of uniform distribution

d<-(sqrt(12)*sd)/2
rmin<-m-d # lower limit
rmax<-m+d # upper limit

# print mean & sd to make sure that they are correct...
tmp<-runif(10000,rmin,rmax)
print(paste("expected mean = ",m,"; obtained mean = ",mean(tmp)))
print(paste("expected sd = ",sd,"; obtained sd = ",sd(tmp))) 


if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8)

# plot the distribution...
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create window
x<-seq((rmin-1),(rmax+1),len=500) # sequence of 500 x values ranging from rmin to rmax
y<-dunif(x,rmin,rmax) # calculate the uniform density at those x values
plot(x,y,main="uniform density") # scatter plot
lines(x,y) # add lines


# now plot distribution of sample means:
if (Sys.info()[1]=="Darwin") quartz(height=6,width=8) else windows(height=6,width=8) # create another window
par(mfrow=c(1,3)) # tell R to divide window into 1x3 plots

for (k in minsamplesize:maxsamplesize)
{
	sampsize <- k; # current sample size
	thenumbers<-runif(N* sampsize,min=rmin,max=rmax); # draw random numbers from population
	thedata<-matrix(thenumbers,N,sampsize) # reformat numbers into N samples of k values
	sample.means<-rowMeans(thedata); # clac the mean of each row
	
	# now draw a histogram
	hist(sample.means,breaks="FD",ylim=c(0,1.0),
		main=paste("sample size = ",k),prob=TRUE,col="lemonchiffon") # draw histogram of means
	# truehist(sample.means,ylim=c(0,1),col="lemonchiffon") # draw histogram of means

	# to the histogram, add the normal distribution of means
	# in other words, we will draw what we expect to get if the original numbers are
	# drawn from a normal distribution
	pu <-par("usr")[1:2] # fancy-pants R language for getting the range of histogram's x-axis
	x <- seq(pu[1],pu[2],len=500) # make a sequence of x values
	y <- dnorm(x,mean=m,sd=(sd/sqrt(sampsize)) ) # calc normal density at those x values
	lines(x,y,col="red") # draw it on top of the histogram
	
	# next, draw a qqplot to visualize departures from normality
	qqnorm(sample.means,main="QQ Plot (should be straight line)")
	qqline(sample.means)
	
	# finally, draw a boxplot to look for asymmetry and outliers
	boxplot(sample.means,main="should be symmetrical")
	
	Sys.sleep(1)
	
	
	# now calculate t-distribution when sample size = k
	sample.sd<-apply(thedata,MARGIN=1,sd) # calculate the sd for each row/sample
	sample.se<-sample.sd/sqrt(sampsize) # the STANDARD ERROR is sd/sqrt(sample size)

	# the t-value for each sample is defined as the sample mean minus the population mean (i.e., mu) divided by the standard error
	sample.t<-(sample.means-mu)/sample.se

	type1.error.rate=0.05
	critical.t<-qt(type1.error.rate,df=(sampsize-1)); # critical value of t for a 1-tailed test that the sample mean is less than mu;

	# how many of our samples actually exceeded the critical t value?
	num.type1.errors<-length(sample.t[sample.t<=critical.t])
	p.type1.error<-num.type1.errors/length(sample.means)

	print(" ")
	print("---------- TYPE I ERROR RATE ----------------")
	print(paste("sample size = ", sampsize))
	print(paste("expected type I error rate = ",type1.error.rate,"; observed type I error rate = ",p.type1.error))

}