Simulating and Fitting Data Distributions
This exercise teaches students how to compare a histogram of continuous (or integer) data to the probability density functions for different statistical distributions.
Question 1
Run Sample Code
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
<- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
z names(z) <- list("ID","myVar")
<- z[z$myVar>0,]
z str(z)
## 'data.frame': 1774 obs. of 2 variables:
## $ ID : int 1 3 5 7 8 9 10 11 12 13 ...
## $ myVar: num 0.931 0.484 0.231 1.611 0.557 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000034 0.366612 0.744607 0.872181 1.259727 3.618112
<- ggplot(data=z, aes(x=myVar, y=..density..)) +
p1 geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- p1 + geom_density(linetype="dotted",size=0.75)
p1 print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- fitdistr(z$myVar,"normal")
normPars print(normPars)
## mean sd
## 0.87218109 0.64726462
## (0.01536756) (0.01086651)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.872 0.647
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0154 0.0109
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000236 0 0 0.000118
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1774
## $ loglik : num -1746
## - attr(*, "class")= chr "fitdistr"
$estimate["mean"] # note structure of getting a named attribute normPars
## mean
## 0.8721811
<- normPars$estimate["mean"]
meanML <- normPars$estimate["sd"]
sdML
<- seq(0,max(z$myVar),len=length(z$myVar))
xval
<- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
stat + stat p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- fitdistr(z$myVar,"exponential")
expoPars <- expoPars$estimate["rate"]
rateML
<- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
stat2 + stat + stat2 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
stat3 + stat + stat2 + stat3 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- fitdistr(z$myVar,"gamma") gammaPars
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
<- gammaPars$estimate["shape"]
shapeML <- gammaPars$estimate["rate"]
rateML
<- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
stat4 + stat + stat2 + stat3 + stat4 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
pSpecial geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
<- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta") betaPars
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
<- betaPars$estimate["shape1"]
shape1ML <- betaPars$estimate["shape2"]
shape2ML
<- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
statSpecial + statSpecial pSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Question 2
Try it with your own data - Data provided by LTER
<- read.table("lpp_hglvl_data.csv",header=TRUE,sep=",")
z <- z[z$MEAN_HG_CONC>0,]
z str(z)
## 'data.frame': 43 obs. of 9 variables:
## $ LAKE : chr "Allequash" "Arrowhead" "Beaver" "Big" ...
## $ AREA : int 430 94 63 804 119 881 535 109 65 89 ...
## $ HYDRO_POS : chr "D" "H" "D" "D" ...
## $ LAKE_ORDER : int 1 0 2 3 -2 -1 3 2 -2 -2 ...
## $ MEAN_LENGTH : int 183 153 164 145 156 144 97 131 136 208 ...
## $ MEAN_LENGTH_SE : num 4.1 3.8 11.5 7.6 10.5 5.8 9 12.1 3.2 2 ...
## $ MEAN_HG_CONC : num 0.057 0.025 0.084 0.036 0.141 0.044 0.054 0.048 0.17 0.101 ...
## $ MEAN_HG_CONC_SE: num 0.0047 0.0013 0.0115 0.0057 0.0172 0.0064 0.0035 0.0026 0.02 0.0345 ...
## $ N_FISH : int 5 3 5 3 5 4 5 9 3 3 ...
summary(z)
## LAKE AREA HYDRO_POS LAKE_ORDER
## Length:43 Min. : 42.0 Length:43 Min. :-2.0000
## Class :character 1st Qu.: 113.0 Class :character 1st Qu.:-1.0000
## Mode :character Median : 194.0 Mode :character Median : 0.0000
## Mean : 505.3 Mean : 0.4651
## 3rd Qu.: 479.0 3rd Qu.: 2.0000
## Max. :3875.0 Max. : 4.0000
## MEAN_LENGTH MEAN_LENGTH_SE MEAN_HG_CONC MEAN_HG_CONC_SE
## Min. : 97.0 Min. : 1.00 Min. :0.0150 Min. :0.0006
## 1st Qu.:135.0 1st Qu.: 5.95 1st Qu.:0.0445 1st Qu.:0.0051
## Median :145.0 Median :10.20 Median :0.0660 Median :0.0081
## Mean :146.3 Mean :10.16 Mean :0.0746 Mean :0.0109
## 3rd Qu.:155.0 3rd Qu.:13.55 3rd Qu.:0.0960 3rd Qu.:0.0115
## Max. :211.0 Max. :23.50 Max. :0.2170 Max. :0.0445
## N_FISH
## Min. :2.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.256
## 3rd Qu.:5.000
## Max. :9.000
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
<- ggplot(data=z, aes(x=MEAN_HG_CONC, y=..density..)) +
p1 geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- p1 + geom_density(linetype="dotted",size=0.75)
p1 print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- fitdistr(z$MEAN_HG_CONC,"normal")
normPars print(normPars)
## mean sd
## 0.074604651 0.041685500
## (0.006356979) (0.004495063)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.0746 0.0417
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.00636 0.0045
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 4.04e-05 0.00 0.00 2.02e-05
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 43
## $ loglik : num 75.6
## - attr(*, "class")= chr "fitdistr"
$estimate["mean"] # note structure of getting a named attribute normPars
## mean
## 0.07460465
<- normPars$estimate["mean"]
meanML <- normPars$estimate["sd"]
sdML
<- seq(0,max(z$MEAN_HG_CONC),len=length(z$MEAN_HG_CONC))
xval
<- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$MEAN_HG_CONC), args = list(mean = meanML, sd = sdML))
stat + stat p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#red shows a normal distribution to the curve
<- fitdistr(z$MEAN_HG_CONC,"exponential")
expoPars <- expoPars$estimate["rate"]
rateML
<- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$MEAN_HG_CONC), args = list(rate=rateML))
stat2 + stat + stat2 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# blue shows an exponential distribution
<- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$MEAN_HG_CONC), args = list(min=min(z$MEAN_HG_CONC), max=max(z$MEAN_HG_CONC)))
stat3 + stat + stat2 + stat3 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# dark green shows uniform distribution
<- fitdistr(z$MEAN_HG_CONC,"gamma")
gammaPars <- gammaPars$estimate["shape"]
shapeML <- gammaPars$estimate["rate"]
rateML
<- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$MEAN_HG_CONC), args = list(shape=shapeML, rate=rateML))
stat4 + stat + stat2 + stat3 + stat4 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#brown displays gamma distribution
<- ggplot(data=z, aes(x=MEAN_HG_CONC/(max(MEAN_HG_CONC + 0.1)), y=..density..)) +
pSpecial geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
<- fitdistr(x=z$MEAN_HG_CONC/max(z$MEAN_HG_CONC + 0.1),start=list(shape1=1,shape2=2),"beta") betaPars
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
<- betaPars$estimate["shape1"]
shape1ML <- betaPars$estimate["shape2"]
shape2ML
<- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$MEAN_HG_CONC), args = list(shape1=shape1ML,shape2=shape2ML))
statSpecial + statSpecial pSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Question 3
Find best-fitting distribution
The gamma curve fits the data the best.
Question 4
Simulate a new dataset
<- rnorm(n=43,mean=normPars$estimate["mean"])
z_sim <- data.frame(1:43,z_sim)
z_sim names(z_sim) <- list("ID","myVar")
<- z_sim[z_sim$myVar>0,]
z_sim str(z_sim)
## 'data.frame': 27 obs. of 2 variables:
## $ ID : int 5 6 10 11 12 13 15 16 17 19 ...
## $ myVar: num 0.0344 0.8301 0.5742 1.2608 0.0945 ...
summary(z_sim$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03438 0.18269 0.69180 0.65597 0.94829 1.59186
<- ggplot(data=z_sim, aes(x=myVar, y=..density..)) +
p1sim geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1sim)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- p1sim + geom_density(linetype="dotted",size=0.75)
p1sim print(p1sim)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- fitdistr(z_sim$myVar,"normal")
normParssim print(normParssim)
## mean sd
## 0.65597302 0.44027704
## (0.08473136) (0.05991412)
str(normParssim)
## List of 5
## $ estimate: Named num [1:2] 0.656 0.44
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0847 0.0599
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00718 0 0 0.00359
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 27
## $ loglik : num -16.2
## - attr(*, "class")= chr "fitdistr"
$estimate["mean"] # note structure of getting a named attribute normParssim
## mean
## 0.655973
<- normParssim$estimate["mean"]
meanMLsim <- normParssim$estimate["sd"]
sdMLsim
<- seq(0,max(z_sim$myVar),len=length(z_sim$myVar))
xvalsim
#plotting the best fit curve for original dataset - gamma
<- fitdistr(z_sim$myVar,"gamma") gammaParssim
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
<- gammaPars$estimate["shape"]
shapeMLsim <- gammaPars$estimate["rate"]
rateMLsim
<- stat_function(aes(x = xvalsim, y = ..y..), fun = dgamma, colour="brown", n = length(z_sim$myVar), args = list(shape=shapeMLsim, rate=rateMLsim))
stat4sim + stat4sim p1sim
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#reprinting original data historgram with gamma curve
+stat4 p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The two histograms do not show similar gamma distributions. The model is not large enough to simulate realistic data to match my original measurements. With more samples, the gamma distribution may not be the best fit, and thus the simulation may be more realistic.