Central Limit Theorem

Central Limit Theorem

We investigate the sampling distribution of the sample mean \(\bar{X}\) when the population is (1) normally distributed and (2) not normally distributed.

import warnings
warnings.filterwarnings("ignore")
#import required libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import ipywidgets as widgets
from IPython.display import display

Let’s start with an example where the true population distribution is standard normal.

#np.random.seed(1773) 

##an example for clt when the true pop is standard normal

def clt_norm_true(Sim_no=500, n=25):
    """Plot a standard-normal population beside the sampling distribution of its mean.

    A finite population of 10,000 values is generated from a normal
    distribution with mean 0 and standard deviation 1.  ``Sim_no`` samples of
    size ``n`` are drawn from that population, the mean of every sample is
    computed, and two density histograms are shown side by side: the
    population (left) and the sample means (right).  The right panel is
    annotated with the mean and the standard deviation of the sample means.

    Parameters
    ----------
    Sim_no : int
        Number of simulations (experiments), i.e. how many samples are drawn.
    n : int
        Number of data points in each sample.  Points are drawn *with*
        replacement, which is the default behaviour of ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, because an empty sample has no mean and
        the histogram of the resulting NaN values cannot be drawn.
    """
    if n < 1:
        raise ValueError("n must be at least 1 to compute a sample mean")

    ## Fixed values
    N = 10000   # true population size
    mu = 0      # true population mean
    sigma = 1   # true population sd

    # Generate the true population under the normal distribution
    data = stats.norm.rvs(loc=mu, scale=sigma, size=N)

    ## Simulation settings
    # Draw all Sim_no samples of size n in one call: each row of the
    # (Sim_no, n) array is one sample taken from the true population.
    samples = np.random.choice(data, size=(Sim_no, n))

    # Row means (axis=1): a vector holding one sample mean per simulation
    sample_means = samples.mean(axis=1)

    ## Figure settings
    # Create both panels up front so that no extra full-figure axes is left
    # underneath the two histograms.
    fig, (ax_pop, ax_means) = plt.subplots(1, 2, figsize=(7, 7))

    # Histogram of the true population
    ax_pop.hist(data, bins=25, density=True, alpha=0.5, color='b')
    ax_pop.set_title(r'True population with $\mu=%.1f,\sigma^2=%.1f$' % (mu, sigma * sigma))
    ax_pop.set_xlabel(r'$x$')

    # Histogram of the sample means
    ax_means.hist(sample_means, bins=25, density=True, alpha=0.5, color='b')
    ax_means.set_title(r"Sampling distribution of $\bar{X}$")
    ax_means.set_xlabel(r'$\bar{x}$')

    # Anchor the annotation in axes coordinates (0-1) so it stays inside the
    # panel whatever the data range of the histogram turns out to be.
    ax_means.text(0.05, 0.95,
                  "Mean = %.4f \n Standard error = %.4f" % (np.mean(sample_means), np.std(sample_means)),
                  transform=ax_means.transAxes, va='top')

    # side-by-side
    fig.tight_layout()
    plt.show()
# Sliders that drive clt_norm_true; interact() re-runs the function on every change.
# Both ranges start above zero: a sample of size 0 has no mean to plot and
# zero simulations would leave the right-hand histogram empty.
# description_width='initial' lets the long labels show in full instead of being cut off.
nsample_wid = widgets.IntSlider(min = 10, max = 1000, step=10, value=500,
                                description = "Number of samples drawn",
                                style = {'description_width': 'initial'})
sample_size_wid = widgets.IntSlider(min = 1, max = 100, step=1, value=10,
                                    description = "Sample size",
                                    style = {'description_width': 'initial'})
widgets.interact(clt_norm_true, Sim_no = nsample_wid, n = sample_size_wid);

Next, let’s look at an example where the true population follows an exponential distribution.

#np.random.seed(1773) 

##an example for clt when the true pop is exponential

def clt_exp_true(Sim_no=500, n=25):
    """Plot an exponential population beside the sampling distribution of its mean.

    A finite population of 10,000 values is generated from an exponential
    distribution with scale 1.  ``Sim_no`` samples of size ``n`` are drawn
    from that population, the mean of every sample is computed, and two
    density histograms are shown side by side: the population (left) and the
    sample means (right).  The right panel is annotated with the mean and the
    standard deviation of the sample means.

    Parameters
    ----------
    Sim_no : int
        Number of simulations (experiments), i.e. how many samples are drawn.
    n : int
        Number of data points in each sample.  Points are drawn *with*
        replacement, which is the default behaviour of ``np.random.choice``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, because an empty sample has no mean and
        the histogram of the resulting NaN values cannot be drawn.
    """
    if n < 1:
        raise ValueError("n must be at least 1 to compute a sample mean")

    ## Fixed values
    N = 10000   # true population size
    # Passed to scipy as the *scale* of the exponential distribution; the
    # plot title labels it lambda, which is the same number while it is 1.
    lam = 1

    # Generate the true population under the exponential distribution
    data = stats.expon.rvs(scale=lam, size=N)

    ## Simulation settings
    # Draw all Sim_no samples of size n in one call: each row of the
    # (Sim_no, n) array is one sample taken from the true population.
    samples = np.random.choice(data, size=(Sim_no, n))

    # Row means (axis=1): a vector holding one sample mean per simulation
    sample_means = samples.mean(axis=1)

    ## Figure settings
    # Create both panels up front so that no extra full-figure axes is left
    # underneath the two histograms.
    fig, (ax_pop, ax_means) = plt.subplots(1, 2, figsize=(7, 7))

    # Histogram of the true population
    ax_pop.hist(data, bins=25, density=True, alpha=0.5, color='b')
    ax_pop.set_title(r'True population with $\lambda=%.1f$' % (lam))
    ax_pop.set_xlabel(r'$x$')

    # Histogram of the sample means
    ax_means.hist(sample_means, bins=25, density=True, alpha=0.5, color='b')
    ax_means.set_title(r"Sampling distribution of $\bar{X}$")
    ax_means.set_xlabel(r'$\bar{x}$')

    # Anchor the annotation in axes coordinates (0-1) so it stays inside the
    # panel whatever the data range of the histogram turns out to be.
    ax_means.text(0.05, 0.95,
                  "Mean = %.4f \n Standard error = %.4f" % (np.mean(sample_means), np.std(sample_means)),
                  transform=ax_means.transAxes, va='top')

    # side-by-side
    fig.tight_layout()
    plt.show()
# Sliders that drive clt_exp_true; interact() re-runs the function on every change.
# Both ranges start above zero: a sample of size 0 has no mean to plot and
# zero simulations would leave the right-hand histogram empty.
# description_width='initial' lets the long labels show in full instead of being cut off.
nsample_wid = widgets.IntSlider(min = 10, max = 1000, step=10, value=500,
                                description = "Number of samples drawn",
                                style = {'description_width': 'initial'})
sample_size_wid = widgets.IntSlider(min = 1, max = 100, step=1, value=10,
                                    description = "Sample size",
                                    style = {'description_width': 'initial'})
widgets.interact(clt_exp_true, Sim_no = nsample_wid, n = sample_size_wid);
# TODO: turn these widgets into a dashboard (left as future work).