Simulating and Fitting Data Distributions
This exercise teaches students how to compare a histogram of continuous (or integer) data to the probability density functions for different statistical distributions.
Question 1
Run Sample Code
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)## 'data.frame': 1774 obs. of 2 variables:
## $ ID : int 1 3 5 7 8 9 10 11 12 13 ...
## $ myVar: num 0.931 0.484 0.231 1.611 0.557 ...
summary(z$myVar)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000034 0.366612 0.744607 0.872181 1.259727 3.618112
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(z$myVar,"normal")
print(normPars)## mean sd
## 0.87218109 0.64726462
## (0.01536756) (0.01086651)
str(normPars)## List of 5
## $ estimate: Named num [1:2] 0.872 0.647
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0154 0.0109
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000236 0 0 0.000118
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1774
## $ loglik : num -1746
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute## mean
## 0.8721811
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gammaPars <- fitdistr(z$myVar,"gamma")## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Question 2
Try it with your own data - Data provided by LTER
z <- read.table("lpp_hglvl_data.csv",header=TRUE,sep=",")
z <- z[z$MEAN_HG_CONC>0,]
str(z)## 'data.frame': 43 obs. of 9 variables:
## $ LAKE : chr "Allequash" "Arrowhead" "Beaver" "Big" ...
## $ AREA : int 430 94 63 804 119 881 535 109 65 89 ...
## $ HYDRO_POS : chr "D" "H" "D" "D" ...
## $ LAKE_ORDER : int 1 0 2 3 -2 -1 3 2 -2 -2 ...
## $ MEAN_LENGTH : int 183 153 164 145 156 144 97 131 136 208 ...
## $ MEAN_LENGTH_SE : num 4.1 3.8 11.5 7.6 10.5 5.8 9 12.1 3.2 2 ...
## $ MEAN_HG_CONC : num 0.057 0.025 0.084 0.036 0.141 0.044 0.054 0.048 0.17 0.101 ...
## $ MEAN_HG_CONC_SE: num 0.0047 0.0013 0.0115 0.0057 0.0172 0.0064 0.0035 0.0026 0.02 0.0345 ...
## $ N_FISH : int 5 3 5 3 5 4 5 9 3 3 ...
summary(z)## LAKE AREA HYDRO_POS LAKE_ORDER
## Length:43 Min. : 42.0 Length:43 Min. :-2.0000
## Class :character 1st Qu.: 113.0 Class :character 1st Qu.:-1.0000
## Mode :character Median : 194.0 Mode :character Median : 0.0000
## Mean : 505.3 Mean : 0.4651
## 3rd Qu.: 479.0 3rd Qu.: 2.0000
## Max. :3875.0 Max. : 4.0000
## MEAN_LENGTH MEAN_LENGTH_SE MEAN_HG_CONC MEAN_HG_CONC_SE
## Min. : 97.0 Min. : 1.00 Min. :0.0150 Min. :0.0006
## 1st Qu.:135.0 1st Qu.: 5.95 1st Qu.:0.0445 1st Qu.:0.0051
## Median :145.0 Median :10.20 Median :0.0660 Median :0.0081
## Mean :146.3 Mean :10.16 Mean :0.0746 Mean :0.0109
## 3rd Qu.:155.0 3rd Qu.:13.55 3rd Qu.:0.0960 3rd Qu.:0.0115
## Max. :211.0 Max. :23.50 Max. :0.2170 Max. :0.0445
## N_FISH
## Min. :2.000
## 1st Qu.:3.000
## Median :4.000
## Mean :4.256
## 3rd Qu.:5.000
## Max. :9.000
library(ggplot2) # for graphics
library(MASS) # for maximum likelihood estimation
p1 <- ggplot(data=z, aes(x=MEAN_HG_CONC, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(z$MEAN_HG_CONC,"normal")
print(normPars)## mean sd
## 0.074604651 0.041685500
## (0.006356979) (0.004495063)
str(normPars)## List of 5
## $ estimate: Named num [1:2] 0.0746 0.0417
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.00636 0.0045
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 4.04e-05 0.00 0.00 2.02e-05
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 43
## $ loglik : num 75.6
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute## mean
## 0.07460465
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$MEAN_HG_CONC),len=length(z$MEAN_HG_CONC))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$MEAN_HG_CONC), args = list(mean = meanML, sd = sdML))
p1 + stat## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#red shows a normal distribution to the curve
expoPars <- fitdistr(z$MEAN_HG_CONC,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$MEAN_HG_CONC), args = list(rate=rateML))
p1 + stat + stat2## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# blue shows an exponential distribution
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$MEAN_HG_CONC), args = list(min=min(z$MEAN_HG_CONC), max=max(z$MEAN_HG_CONC)))
p1 + stat + stat2 + stat3## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# dark green shows uniform distribution
gammaPars <- fitdistr(z$MEAN_HG_CONC,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$MEAN_HG_CONC), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#brown displays gamma distribution
pSpecial <- ggplot(data=z, aes(x=MEAN_HG_CONC/(max(MEAN_HG_CONC + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$MEAN_HG_CONC/max(z$MEAN_HG_CONC + 0.1),start=list(shape1=1,shape2=2),"beta")## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$MEAN_HG_CONC), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Question 3
Find best-fitting distribution
The gamma curve fits the data the best.
Question 4
Simulate a new dataset
z_sim <- rnorm(n=43,mean=normPars$estimate["mean"])
z_sim <- data.frame(1:43,z_sim)
names(z_sim) <- list("ID","myVar")
z_sim <- z_sim[z_sim$myVar>0,]
str(z_sim)## 'data.frame': 27 obs. of 2 variables:
## $ ID : int 5 6 10 11 12 13 15 16 17 19 ...
## $ myVar: num 0.0344 0.8301 0.5742 1.2608 0.0945 ...
summary(z_sim$myVar)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03438 0.18269 0.69180 0.65597 0.94829 1.59186
p1sim <- ggplot(data=z_sim, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1sim)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1sim <- p1sim + geom_density(linetype="dotted",size=0.75)
print(p1sim)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normParssim <- fitdistr(z_sim$myVar,"normal")
print(normParssim)## mean sd
## 0.65597302 0.44027704
## (0.08473136) (0.05991412)
str(normParssim)## List of 5
## $ estimate: Named num [1:2] 0.656 0.44
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0847 0.0599
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.00718 0 0 0.00359
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 27
## $ loglik : num -16.2
## - attr(*, "class")= chr "fitdistr"
normParssim$estimate["mean"] # note structure of getting a named attribute## mean
## 0.655973
meanMLsim <- normParssim$estimate["mean"]
sdMLsim <- normParssim$estimate["sd"]
xvalsim <- seq(0,max(z_sim$myVar),len=length(z_sim$myVar))
#plotting the best fit curve for original dataset - gamma
gammaParssim <- fitdistr(z_sim$myVar,"gamma")## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeMLsim <- gammaPars$estimate["shape"]
rateMLsim <- gammaPars$estimate["rate"]
stat4sim <- stat_function(aes(x = xvalsim, y = ..y..), fun = dgamma, colour="brown", n = length(z_sim$myVar), args = list(shape=shapeMLsim, rate=rateMLsim))
p1sim + stat4sim## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#reprinting original data historgram with gamma curve
p1+stat4## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The two histograms do not show similar gamma distributions. The model is not large enough to simulate realistic data to match my original measurements. With more samples, the gamma distribution may not be the best fit, and thus the simulation may be more realistic.