Normal Distributionยถ

Letโ€™s investigate how shape of the normal distribution curve look likes.

# import required libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from IPython.display import display

Draw the normal distribution curve for a given mean and standard deviation value.

#assume mean (mu) = 0 and sd (sigma) = 1.
#you can change the values as you wish

mu = 0
sigma = 1

fig, ax = plt.subplots(figsize = (6, 6))

#norm.rvs generates random normal dist. variables according
#to given loc and scale, where loc is the mean and scale is the sd.
#assume mean (loc) = 0 and sigma (scale) = 1.

x = stats.norm.rvs(loc = mu, scale = sigma, size = 10000)
x1 = np.sort(x)

#norm.pdf calculates the corresponding pdf values of x1
#i sorted x values otherwise plt.plot cannot produce a proper plot.
#replace x1 with x in the following command, and you will see that what i mean.

#data values are on the x-axis, their corresponding pdf values are on the y-axis.
#connectiong (x,y) values gives us the pdf of normal distribution for the specified
#mean (loc) and sigma (scale).

plt.plot(x1, stats.norm.pdf(x1, loc = mu, scale = sigma), 
         label = r'$\mu=%.1f,\sigma^2=%.1f$' % (mu, sigma*sigma))  #string formatting
        
plt.xlim(-15, 15)
plt.ylim(0, 0.5)
plt.xlabel('$x$')
plt.ylabel('$f(x|\mu,\sigma)$')
plt.title('Normal Distribution Curve')
plt.legend(loc='upper right')
plt.show()
../../_images/normal_dist_4_0.png

Letโ€™s now investigate how shape of the normal distribution curve changes with respect to mean (\(\mu\)) and standard deviation (\(\sigma\)) parameters. We will define a function first to simplify our calculations.

# define a Python function with four parameters

def normal_dist_curve(mu, sigma, ltype, col):
    
    fig, ax = plt.subplots(figsize = (6, 6))
    
    #use zip for parallel iteration
    for m, s, l, cl in zip(mu, sigma, ltype, col):
        #generate 10000 random variables for given mean and sd
        x = stats.norm.rvs(loc = m, scale = s, size = 10000)
        x1 = np.sort(x)
        plt.plot(x1, stats.norm.pdf(x1, loc = m, scale = s), 
                 ls = l, c = cl, label = r'$\mu=%.1f  , \sigma^2=%.1f$' % (m, s*s))  
        
    plt.xlim(-15, 15)
    plt.ylim(0, 0.5)
    plt.xlabel('$x$')
    plt.ylabel('$f(x|\mu,\sigma^2)$')
    plt.title('Normal Distribution Curve')
    plt.legend(loc='upper right')
    plt.show()

Now play with the function to see the changes on the normal distribution curve.

## change the following values as you wish
## define the input arguments (values of the parameters)
mu_values = [-5, 0, 5]
#fix sigma at 1
sigma_values = [1, 1, 1]
linestyles = ['-', '--', '-.']
colors = ['black', 'red', 'blue']

normal_curve = normal_dist_curve(mu = mu_values, sigma = sigma_values, ltype = linestyles, col = colors)

display(normal_curve)
../../_images/normal_dist_8_0.png
None
## change the following values as you wish
## define the input arguments (values of the parameters)
#fix mu at 0
mu_values = [0, 0, 0]
sigma_values = [1, 2, 3]
linestyles = ['-', '--', '-.']
colors = ['black', 'red', 'blue']

normal_curve = normal_dist_curve(mu = mu_values, sigma = sigma_values, ltype = linestyles, col = colors)

display(normal_curve)
../../_images/normal_dist_9_0.png
None

Cumulative distribution functionยถ

Letโ€™s say, now, we would like to calculate \(Pr(X \leq a)\), where X is a normal variable and a is pre-defined value. For example, given the mean of 5 and the standard deviation of 2, calculate the probability of ๐‘ƒ(๐‘‹<4).

import scipy.stats as stats
cum_prob = stats.norm.cdf(x = 4, loc = 5, scale = 2)
print("%.4f"%cum_prob)
0.3085

Calculate the probability of ๐‘ƒ(๐‘‹<0), given that \(X \sim N(0,1)\).

import scipy.stats as stats
cum_prob = stats.norm.cdf(x = 0, loc = 0, scale = 1)
print("%.4f"%cum_prob)
0.5000

Letโ€™s use fill_between() command from matplotlib to shade the area defined by ๐‘ƒ(๐‘‹<0) under the pdf of \(X \sim N(0,1)\).

# this code is similar to second code chunk where
# some input arguments are a bit automatized

# define mu, sd and cutoff value (lessthan)

mu = 0
sd = 1
upper_bound = 0

#calculate the cumulative probability under given distribution
cum_prob = stats.norm.cdf(x = upper_bound, loc = mu, scale = sd)

#start off plotting 
fig, ax = plt.subplots(figsize = (6, 6))

# for pdf curve
#generate synthetic data to plot the pdf
x = stats.norm.rvs(loc = mu, scale = sd, size = 10000)
x1 = np.sort(x)

plt.plot(x1, stats.norm.pdf(x1, loc = mu, scale = sd), 
         label = r'$\mu=%.1f,\sigma^2=%.1f$' % (mu, sd*sd))

# arrange x-axis limits based on the min and max values in the data
plt.xlim(int(np.min(x1))-3, int(np.max(x1))+3)
#instead of default tickmarks, define yours.
plt.xticks(np.arange(int(np.min(x1)), int(np.max(x1))+1, 1))
# arrange upper limit of y-axis max values in the pdf values (y's min value is always 0).
plt.ylim(0, np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.1)
plt.xlabel('$x$')
plt.ylabel('f(x|$\mu=%.1f,\sigma^2=%.1f$)' % (mu, sd*sd))
plt.title('Cumulative Probabilities under Normal Distribution')


#for shading area defined by the cumulative probability

#specifically generate values between int(np.min(x1)) and upper_bound with increments 0.01
#with np.arange(), data comes alread sorted (it creates an increasing series).
x2 = np.arange(int(np.min(x1))-1,upper_bound,0.01)

#go to https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.fill_between.html for more info
plt.fill_between(x2,stats.norm.pdf(x2, loc = mu, scale = sd), color='blue')
        
#add a text, where x coordinate is -4.5 and y-coordinate is based on the peak point of the curve.

plt.text(int(np.min(x1))-2,np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.05,
         "$Pr(X \leq %.3f)=%.3f$" % (upper_bound, cum_prob), fontsize=14)

plt.show()
../../_images/normal_dist_16_0.png

Probability of falling into an intervalยถ

We can also calculate probability of falling between a and b values for a normal variable X such that Pr(a < X < b). Letโ€™s find ๐‘ƒ(-1.96<๐‘‹<1.96) with a mean of 0 and a standard deviation of 1. We need to subtract one cdf from another cdf.

import scipy.stats as stats
interval_prob = stats.norm.cdf(x = 1.96, loc = 0, scale = 1) - stats.norm.cdf(x = -1.96, loc = 0, scale = 1)
print("%.4f"%interval_prob)
0.9500

As in the previous case, letโ€™s visualize it.

# this code is similar to second code chunk where
# some input arguments are a bit automatized

# define mu, sd and cutoff value (lessthan)

mu = 0
sd = 1
upper_bound = 1.96
lower_bound = -1.96

#calculate the interval probability under given distribution
interval_prob = stats.norm.cdf(x = upper_bound, loc = mu, scale = sd) - stats.norm.cdf(x = lower_bound, loc = mu, scale = sd)
                

#start off plotting 
fig, ax = plt.subplots(figsize = (6, 6))

# for pdf curve
#generate synthetic data to plot the pdf
x = stats.norm.rvs(loc = mu, scale = sd, size = 10000)
x1 = np.sort(x)

plt.plot(x1, stats.norm.pdf(x1, loc = mu, scale = sd), 
         label = r'$\mu=%.1f,\sigma^2=%.1f$' % (mu, sd*sd))

# arrange x-axis limits based on the min and max values in the data
plt.xlim(int(np.min(x1))-3, int(np.max(x1))+3)
#instead of default tickmarks, define yours.
plt.xticks(np.arange(int(np.min(x1)), int(np.max(x1))+1, 1))
# arrange upper limit of y-axis max values in the pdf values (y's min value is always 0).
plt.ylim(0, np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.1)
plt.xlabel('$x$')
plt.ylabel('f(x|$\mu=%.1f,\sigma^2=%.1f$)' % (mu, sd*sd))
plt.title('Probability of an interval under Normal Distribution')


#for shading area defined by the interval probability

#specifically generate values between lower_bound and upper_bound value with increments 0.01
#with np.arange(), data comes alread sorted (it creates an increasing series).
x2 = np.arange(lower_bound, upper_bound, 0.01)

plt.fill_between(x2,stats.norm.pdf(x2, loc = mu, scale = sd), color='blue')
        
#add a text, where x coordinate is -4.5 and y-coordinate is based on the peak point of the curve.

plt.text(int(np.min(x1))-2,np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.05,
         "$Pr(%.3f \leq$ X $\leq %.3f)=%.3f$" % (lower_bound, upper_bound,interval_prob), fontsize=14)

plt.show()
../../_images/normal_dist_21_0.png

We can calculate ๐‘ƒ(๐‘‹>1.96) through 1-๐‘ƒ(๐‘‹<1.96).

import scipy.stats as stats
inverse_prob = 1- stats.norm.cdf(x = 1.96, loc = 0, scale = 1) 
print("%.4f"%inverse_prob)
0.0250

As in the previous case, letโ€™s visualize it.

# this code is similar to second code chunk where
# some input arguments are a bit automatized

# define mu, sd and cutoff value (lessthan)

mu = 0
sd = 1
lower_bound = 1.96

#calculate the cumulative probability under given distribution
complementary_prob = 1-stats.norm.cdf(x = lower_bound, loc = mu, scale = sd)

#start off plotting 
fig, ax = plt.subplots(figsize = (6, 6))

# for pdf curve
#generate synthetic data to plot the pdf
x = stats.norm.rvs(loc = mu, scale = sd, size = 10000)
x1 = np.sort(x)

plt.plot(x1, stats.norm.pdf(x1, loc = mu, scale = sd), 
         label = r'$\mu=%.1f,\sigma^2=%.1f$' % (mu, sd*sd))

# arrange x-axis limits based on the min and max values in the data
plt.xlim(int(np.min(x1))-3, int(np.max(x1))+3)
#instead of default tickmarks, define yours.
plt.xticks(np.arange(int(np.min(x1)), int(np.max(x1))+1, 1))
# arrange upper limit of y-axis max values in the pdf values (y's min value is always 0).
plt.ylim(0, np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.1)
plt.xlabel('$x$')
plt.ylabel('f(x|$\mu=%.1f,\sigma^2=%.1f$)' % (mu, sd*sd))
plt.title('Complementary cumulative Probabilities under Normal Distribution')
#for shading area defined by the cumulative probability

#specifically generate values between lower_bound and int(np.max(x1)) with increments 0.01
#with np.arange(), data comes alread sorted (it creates an increasing series).
x2 = np.arange(lower_bound,int(np.max(x1))+1,0.01)

#go to https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.fill_between.html for more info
plt.fill_between(x2,stats.norm.pdf(x2, loc = mu, scale = sd), color='blue')
        
#add a text, where x coordinate is -4.5 and y-coordinate is based on the peak point of the curve.

plt.text(int(np.min(x1))-2,np.max(stats.norm.pdf(x1, loc = mu, scale = sd))+0.05,
         "$Pr(X \geq %.3f)=%.3f$" % (lower_bound, complementary_prob), fontsize=14)

plt.show()
../../_images/normal_dist_25_0.png

Basesd on these functions, we can create a very general function which calculates the probabilities under specified settings and turn it into a widget for a better a user-experience.