Simulating and Fitting Data Distributions

This exercise teaches students how to compare a histogram of continuous (or integer) data to the probability density functions for different statistical distributions.

Question 1

Run Sample Code

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)

## 'data.frame':    1774 obs. of  2 variables:
##  $ ID   : int  1 3 5 7 8 9 10 11 12 13 ...
##  $ myVar: num  0.931 0.484 0.231 1.611 0.557 ...

summary(z$myVar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000034 0.366612 0.744607 0.872181 1.259727 3.618112

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean          sd    
##   0.87218109   0.64726462 
##  (0.01536756) (0.01086651)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 0.872 0.647
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0154 0.0109
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000236 0 0 0.000118
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1774
##  $ loglik  : num -1746
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##      mean 
## 0.8721811

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

gammaPars <- fitdistr(z$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Question 2

Try it with your own data - Data provided by LTER

z <- read.table("lpp_hglvl_data.csv",header=TRUE,sep=",")
z <- z[z$MEAN_HG_CONC>0,]
str(z)

## 'data.frame':    43 obs. of  9 variables:
##  $ LAKE           : chr  "Allequash" "Arrowhead" "Beaver" "Big" ...
##  $ AREA           : int  430 94 63 804 119 881 535 109 65 89 ...
##  $ HYDRO_POS      : chr  "D" "H" "D" "D" ...
##  $ LAKE_ORDER     : int  1 0 2 3 -2 -1 3 2 -2 -2 ...
##  $ MEAN_LENGTH    : int  183 153 164 145 156 144 97 131 136 208 ...
##  $ MEAN_LENGTH_SE : num  4.1 3.8 11.5 7.6 10.5 5.8 9 12.1 3.2 2 ...
##  $ MEAN_HG_CONC   : num  0.057 0.025 0.084 0.036 0.141 0.044 0.054 0.048 0.17 0.101 ...
##  $ MEAN_HG_CONC_SE: num  0.0047 0.0013 0.0115 0.0057 0.0172 0.0064 0.0035 0.0026 0.02 0.0345 ...
##  $ N_FISH         : int  5 3 5 3 5 4 5 9 3 3 ...

summary(z)

##      LAKE                AREA         HYDRO_POS           LAKE_ORDER     
##  Length:43          Min.   :  42.0   Length:43          Min.   :-2.0000  
##  Class :character   1st Qu.: 113.0   Class :character   1st Qu.:-1.0000  
##  Mode  :character   Median : 194.0   Mode  :character   Median : 0.0000  
##                     Mean   : 505.3                      Mean   : 0.4651  
##                     3rd Qu.: 479.0                      3rd Qu.: 2.0000  
##                     Max.   :3875.0                      Max.   : 4.0000  
##   MEAN_LENGTH    MEAN_LENGTH_SE   MEAN_HG_CONC    MEAN_HG_CONC_SE 
##  Min.   : 97.0   Min.   : 1.00   Min.   :0.0150   Min.   :0.0006  
##  1st Qu.:135.0   1st Qu.: 5.95   1st Qu.:0.0445   1st Qu.:0.0051  
##  Median :145.0   Median :10.20   Median :0.0660   Median :0.0081  
##  Mean   :146.3   Mean   :10.16   Mean   :0.0746   Mean   :0.0109  
##  3rd Qu.:155.0   3rd Qu.:13.55   3rd Qu.:0.0960   3rd Qu.:0.0115  
##  Max.   :211.0   Max.   :23.50   Max.   :0.2170   Max.   :0.0445  
##      N_FISH     
##  Min.   :2.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :4.256  
##  3rd Qu.:5.000  
##  Max.   :9.000

library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation


p1 <- ggplot(data=z, aes(x=MEAN_HG_CONC, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normPars <- fitdistr(z$MEAN_HG_CONC,"normal")
print(normPars)

##       mean           sd     
##   0.074604651   0.041685500 
##  (0.006356979) (0.004495063)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 0.0746 0.0417
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.00636 0.0045
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 4.04e-05 0.00 0.00 2.02e-05
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 43
##  $ loglik  : num 75.6
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##       mean 
## 0.07460465

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$MEAN_HG_CONC),len=length(z$MEAN_HG_CONC))

stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$MEAN_HG_CONC), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#red shows a normal distribution to the curve


expoPars <- fitdistr(z$MEAN_HG_CONC,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$MEAN_HG_CONC), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# blue shows an exponential distribution

 
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$MEAN_HG_CONC), args = list(min=min(z$MEAN_HG_CONC), max=max(z$MEAN_HG_CONC)))
 p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# dark green shows uniform distribution
 
gammaPars <- fitdistr(z$MEAN_HG_CONC,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$MEAN_HG_CONC), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#brown displays gamma distribution
 
pSpecial <- ggplot(data=z, aes(x=MEAN_HG_CONC/(max(MEAN_HG_CONC + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$MEAN_HG_CONC/max(z$MEAN_HG_CONC + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$MEAN_HG_CONC), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_bar).

Question 3

Find best-fitting distribution

The gamma curve fits the data the best.

Question 4

Simulate a new dataset

z_sim <- rnorm(n=43,mean=normPars$estimate["mean"])
z_sim <- data.frame(1:43,z_sim)
names(z_sim) <- list("ID","myVar")
z_sim <- z_sim[z_sim$myVar>0,]
str(z_sim)

## 'data.frame':    27 obs. of  2 variables:
##  $ ID   : int  5 6 10 11 12 13 15 16 17 19 ...
##  $ myVar: num  0.0344 0.8301 0.5742 1.2608 0.0945 ...

summary(z_sim$myVar)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03438 0.18269 0.69180 0.65597 0.94829 1.59186

p1sim <- ggplot(data=z_sim, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1sim)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1sim <-  p1sim +  geom_density(linetype="dotted",size=0.75)
print(p1sim)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

normParssim <- fitdistr(z_sim$myVar,"normal")
print(normParssim)

##       mean          sd    
##   0.65597302   0.44027704 
##  (0.08473136) (0.05991412)

str(normParssim)

## List of 5
##  $ estimate: Named num [1:2] 0.656 0.44
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0847 0.0599
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.00718 0 0 0.00359
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 27
##  $ loglik  : num -16.2
##  - attr(*, "class")= chr "fitdistr"

normParssim$estimate["mean"] # note structure of getting a named attribute

##     mean 
## 0.655973

meanMLsim <- normParssim$estimate["mean"]
sdMLsim <- normParssim$estimate["sd"]

xvalsim <- seq(0,max(z_sim$myVar),len=length(z_sim$myVar))

#plotting the best fit curve for original dataset - gamma 
gammaParssim <- fitdistr(z_sim$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeMLsim <- gammaPars$estimate["shape"]
rateMLsim <- gammaPars$estimate["rate"]

stat4sim <- stat_function(aes(x = xvalsim, y = ..y..), fun = dgamma, colour="brown", n = length(z_sim$myVar), args = list(shape=shapeMLsim, rate=rateMLsim))
p1sim + stat4sim

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#reprinting original data historgram with gamma curve 
p1+stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The two histograms do not show similar gamma distributions. The model is not large enough to simulate realistic data to match my original measurements. With more samples, the gamma distribution may not be the best fit, and thus the simulation may be more realistic.

Home Page

Homework 6

Lindsey Carlson

2/23/2022

Simulating and Fitting Data Distributions

Question 1

Question 2

Question 3

Question 4