For the past two years, every time I filled my car with gas ($n = 28$ times) I recorded how many miles I went and how many gallons I used. I want to estimate how many miles per gallon I'm getting on my car. For simplicity, I'll assume my data comes from the Normal distribution with unknown mean and variance.
I take a Bayesian approach and assume the mean follows a Normal distribution with mean 30 and variance 10 and the variance is distributed Inverse Gamma with shape 20 and scale 0.002. With these priors we get conjugate posteriors.
Here is the posterior predictive distribution (in red) overlaying the histogram of the data:
The posterior distribution for the mean with a symmetric 95% probability interval in green (calculating the 95% HPD wouldn't be necessary since the mean is normal):
post = updatemu(data,out[i,2],mumu,sigma2mu)
# get random draw for mu
out[i,1] = rnorm(1,post[1],sqrt(post[2]))
}
return (out)
}
# function to compute highest posterior density intervals
hpd = function(x, prob = 0.95, precision = 1000){
range = seq(0, 1-prob, length=precision)
range = cbind(range, range+prob)
best = range[which.min(apply(range, 1, function(y)
diff(quantile(x, y)))),]
return (quantile(x, best))
}
# get the closest values in density() for the intervals
bounds = function(int, dens, x=TRUE){
if (x) # returns x-value from density
return(dens$x[which.min(abs(dens$x-int))])
return(dens$y[which.min(abs(dens$x-int))])
}
### Miles and Gallons Data from my Saturn. Includes both highway and
### surface street miles.
mpg.dat=c(209.5/6.171, 296.0/9.664, 240.5/9.867, 221.8/10.231,
319.3/10.404, 287.5/9.543, 307.3/9.911, 227.9/9.405,
318.1/9.812, 309.4/9.634, 269.2/9.271, 297.9/10.334,
163.3/9.913, 300.0/10.12, 355.1/10.384, 286.6/9.611,
330.6/10.390, 301.0/9.956, 316.2/10.020, 366.9/9.942,
262.1/10.055, 215.8/10.048, 283.1/8.690, 357.6/9.879,
242.5/10.121, 188.9/8.485, 311.3/9.870, 225.3/10.236)
n = length(mpg.dat)
### My prior beliefs about the data, which would be normal
mpg.mumu = 30
mpg.s2mu = 10
mpg.prsh = 20
mpg.prsc = 0.002
### Running the gibbs sampler
mpgpost = gibbs(mpg.dat, mpg.mumu, mpg.s2mu, mpg.prsh, mpg.prsc, 100000)
### posterior predictive
# use each set of parameter draws (mu and sig2) to generate a
# random "observation"
preds = rnorm(nrow(mpgpost), mpgpost[,1], sqrt(mpgpost[,2]))
png("~/files/R/figs/mpghist.png")
par(mar=c(4.1,3.1,3.1,1.1))
hist(mpg.dat, breaks=10, col='gray', main="", xlab="MPG", ylab="",
xlim=c(min(mpg.dat)-3,max(mpg.dat)+3),freq=FALSE)
points(density(preds), col='red', type='l', lwd=2)
legend(15, 0.14, c("Posterior Predictive", "Data"), lty=c(1, 1),
col=c('red', "gray"), lwd=c(2,10), cex=1.3)
dev.off()
### posterior distribution on mu
densmu = density(mpgpost[,1])
int = quantile(mpgpost[,1],c(0.025,0.975))
plot(densmu,lwd=2,col='blue', xlab=expression(mu),
main=expression(paste("Posterior of ", mu)), cex.main=1.3,
cex.lab=1.3, ylab="")
polygon(density(mpgpost[,1]), col='lightblue', border='blue')
lines(rep(bounds(int[1], densmu), 2), c(0, bounds(int[1],
densmu, FALSE)), col='green', lwd=2)
lines(rep(bounds(int[2], densmu), 2), c(0, bounds(int[2],
densmu, FALSE)), col='green', lwd=2)
legend(24.2, 0.42, "95% Probability Interval", lty=1,
col='green', lwd=2)
# mean of mu
mean(mpgpost[,1])
# 95% probability interval of mu
quantile(mpgpost[,1],c(0.025,0.975))
### hypothesis test on the skewness of the data
skew = function(x) mean((x-mean(x))^3) / var(x)^(3/2)
# using the posterior predictive (but any normal distribution
# will work since they all have the same skewness)
pred.mean = mean(preds)
pred.sd = sd(preds)
observed.skew = skew(mpg.dat)
null.skew = double(100000)
for (i in 1:length(null.skew))
null.skew[i] = skew(rnorm(length(mpg.dat), pred.mean, pred.sd))
hist(null.skew, col='gray', freq=FALSE, main="Simulation-based",
xlab="Sample Skewness", ylab="", cex.main=1.3, cex.lab=1.3)
abline(v=c(-1,1)*observed.skew, col='green', lwd=2)
mean(null.skew < observed.skew) + mean(null.skew > -observed.skew)
# using bootstrapping
boot.skew = double(100000)
for (i in 1:length(boot.skew))
boot.skew[i] = skew(sample(mpg.dat, replace=TRUE))
hist(boot.skew, col='gray', freq=FALSE, xlab="Sample Skewness",
main="Bootstrapping", cex.main=1.3, cex.lab=1.3)
abline(v=quantile(boot.skew, c(0.025, 0.975)), col='green', lwd=2)
quantile(boot.skew, c(0.025, 0.975))