a. Attach the lattice library and explore the ozone data in data frame “environmental”.

library(lattice)
#?environmental
attach(environmental) #attach can induce some issues so we will detach asap
enviro <- environmental
detach(environmental)

Plot a histogram of ozone using ‘hist’. Compare a histogram of these “raw” ozone values to one that uses 3rd root transformed values (ozone^(1/3)). Describe the difference and explain why we would want to transform the ozone variable if used as dependent variable in a typical regression (this requires some familiarity with standard regression assumptions).

library(ggplot2)
theme_set(theme_bw(base_size=12)+ 
  theme(panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()))
library(reshape2)


enviro$cubert <- enviro$ozone^(1/3)
menviro <- melt(enviro, id.vars=c("radiation", "temperature", "wind")) #melt the data for plotting
ggplot(menviro, aes(value))+geom_histogram(alpha=.35, color='black')+facet_wrap(~variable, nrow=2, scales="free")

Lets look at it with a density plot as well

ggplot(menviro, aes(value))+geom_density(alpha=.35, fill='black')+facet_wrap(~variable, nrow=2, scales="free")

b. Fit a LOWESS model of (transformed) ozone levels on wind and temperature using the ‘loess’ function, similar to the model for GAK 1 temperatures that we fit in Module 8:

ozo1 <- loess(cubert~wind+temperature, data=enviro, span = 1, degree = 2)

c. Plot the observed values against the fitted values. Add a 1:1 line to the plot abline(0,1)

enviro$ozo1 <- fitted(ozo1)
ggplot(enviro, aes(cubert, ozo1))+geom_point()+geom_abline(slope=1)

d. You can view the residual standard error in the summary output using:

(summary(ozo1))
## Call:
## loess(formula = cubert ~ wind + temperature, data = enviro, span = 1, 
##     degree = 2)
## 
## Number of Observations: 111 
## Equivalent Number of Parameters: 6.83 
## Residual Standard Error: 0.501 
## Trace of smoother matrix: 7.63 
## 
## Control settings:
##   normalize:  TRUE 
##   span       :  1 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2

For comparison, compute the standard error directly from the residuals. The residual standard error can be estimated as:

\[ \hat{\sigma}_{\varepsilon}^2=\sqrt{\sum\limits_{i=1}^n\sigma_{i}^2\Big/(n-p)} \]

All the information you need is either in the summary output (n is the number of observations and p is the equivalent number of parameters) or can be extracted from the fitted object (i.e. resid(ozo1) returns the vector of residuals or e). You can use vectorized arithmetic, that is you can square the entire residual vector e and then sum over the elements ei using'sum. There is no need for looping (!!), and the above formula can be written as a single short expression using sqrt, sum, and basic arithmetic operators. You can take the values for n and p from ‘summary(ozo1)’ and simply type them into your expression to compute the standard error or you can extract n and p from the summary object and then use them in your expression:

Use str(ozo1) to examine all available elements.
(Note that the computed value for the standard error of the residuals differs somewhat from the value returned in the summary output due to slight differences in the way they are calculated).

p <- ozo1$enp
n <- ozo1$n
enviro$ozo1.res <- resid(ozo1)
(RSE <- sqrt((sum(enviro$ozo1.res ^2))/(n-p)))
## [1] 0.4971813

e.

Verify that the sum of the fitted values and the residuals is equal to the observed ozone values (e.g. by plotting (fitted values + residual values) against (observed values) or by computing the differences, which should all be 0).

ggplot(enviro, aes(cubert, ozo1+ozo1.res))+geom_point()+geom_abline(slope=1, lty=4, color=2)

sum(enviro$cubert-(enviro$ozo1+enviro$ozo1.res))
## [1] 0

f. Create the following set of plots, which should be part of model checking whenever you fit a statistical model:

  1. Plot residuals (on y-axis) against fitted values, add a horizontal line at 0 (abline(h=0))
  2. Plot residuals against wind and add line at 0
  3. Plot residuals against temperature and add line at 0
  4. Create a qqplot that compares the quantiles of the residual distribution against the corresponding quantiles of a normal distribution. If residuals are normally distributed, they should roughly fall on a straight line:
#multiple similar plot so write a function
library(gridExtra)
scatterplot <- function(x,y, title="", xlab="", ylab="") {
    d <- data.frame(x=x,y=y)
    p <- ggplot(d, aes(x=x,y=y)) + geom_point() + ggtitle(title) + xlab(xlab) + ylab(ylab) + geom_hline(yintercept=0, lty=4, color=2)+geom_smooth(alpha=.15)
    return(p)
}

p1 <- scatterplot(enviro$ozo1,enviro$ozo1.res,
                  title="Residuals vs Fitted",
                  xlab="Fitted values",
                  ylab="Residuals")


p2 <- scatterplot(enviro$wind,enviro$ozo1.res,
                  title="Residuals vs Fitted",
                  xlab="Fitted values",
                  ylab="Residuals")


p3 <- scatterplot(enviro$temperature,enviro$ozo1.res,
                  title="Residuals vs Fitted",
                  xlab="Fitted values",
                  ylab="Residuals")


#another function for making qqplots within ggplot

qqplot <- function(y,
                   distribution=qnorm,
                   title="Normal Q-Q",
                   xlab="Theretical Quantiles",
                   ylab="Sample Quantiles") {
    require(ggplot2)
    x <- distribution(ppoints(y))
    d <- data.frame(x=x, y=sort(y))
    p <- ggplot(d, aes(x=x, y=y)) +
        geom_point() +
            geom_line(aes(x=x, y=x)) +
                ggtitle(title) +
                    xlab(xlab) +
                        ylab(ylab)
    return(p)
}

rs <- enviro$ozo1.res/RSE
p4 <- qqplot(rs, ylab="Standardized residuals")

grid.arrange(p1,p2,p3,p4, ncol=2)

g. Next, we set up a grid to compute predicted values on a regularly spaced grid for the purpose of plotting the predicted model surface (which provides an estimate of how ozone levels vary with wind speed and temperature):

wnd <- seq(min(enviro$wind), max(enviro$wind), length.out=50)
tmp <- seq(min(enviro$temperature), max(enviro$temperature), length.out=50)
grd <- expand.grid(wind=wnd, temperature=tmp, KEEP.OUT.ATTRS = FALSE)
grd$ozo1 <- predict(ozo1, grd)

a <- ggplot(grd, aes(wind, temperature, z=ozo1)) + geom_tile(aes(fill=ozo1)) + scale_fill_gradientn(colours = terrain.colors(5)) + geom_point(data=enviro, aes(wind, temperature))

library(directlabels)
a2 <- a + stat_contour(aes(colour = ..level..))
direct.label(a2,"top.pieces")

A little fancier (scroll across the figure)

library(plotly)
ggplotly(grd, aes(wind, temperature, z=ozo1), type='surface')

Another fancier option

plot_ly(grd, x=wind, y=temperature, z=ozo1, color = ozo1, type='scatter3d', mode = "markers")

h. Compare your image plot to an alternative solution that uses the lattice function ‘contourplot’, which automatically adds a legend. It works somewhat differently and we need to add the fitted values to the ‘grd’ matrix in order to use it:

contourplot(ozo1 ~ wind + temperature, data = grd,  
 cuts = 10, region = TRUE,  
 xlab = "Wind Speed (mph)",  
 ylab = "Temperature (F)",  
 main = "Cube Root Ozone (cube root ppb)",  
 col.regions = terrain.colors(50))

i.

Extend the model with two independent variables ozo1 to include an additional covariate - solar radiation - which may also have an effect on ozone levels. The model fits a “hyper-surface” to the data in 3 dimensions. You can think of it as a multiple regression of ozone on three independent variables with all possible interactions. (where the “interactions” are a bit more flexible in a LOWESS model than in a linear model). The model essentially allows for a different ozone response to temperature and wind conditions at different levels of solar radiation:

ozo2 <- loess(cubert ~ wind+temperature+radiation, data = enviro, span=1, degree=2)
summary(ozo2)
## Call:
## loess(formula = cubert ~ wind + temperature + radiation, data = enviro, 
##     span = 1, degree = 2)
## 
## Number of Observations: 111 
## Equivalent Number of Parameters: 11.32 
## Residual Standard Error: 0.4538 
## Trace of smoother matrix: 13.12 
## 
## Control settings:
##   normalize:  TRUE 
##   span       :  1 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2

Reduction in RSS

(anov <- anova(ozo1, ozo2))
## Model 1: loess(formula = cubert ~ wind + temperature, data = enviro, span = 1, degree = 2)
## Model 2: loess(formula = cubert ~ wind + temperature + radiation, data = enviro, span = 1, degree = 2)
## 
## Analysis of Variance:   denominator df 98.06
## 
##        ENP    RSS F-value    Pr(>F)    
## [1,]  6.83 25.749                      
## [2,] 11.32 19.790  4.4603 0.0006947 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
max(anov$RSS)-min(anov$RSS)
## [1] 5.959731

Parameters needed for reduction

anov$ENP[2]-anov$ENP[1]
## [1] 4.49

j.
set up a 3-D grid with 4 different levels of radiation, compute predicted values, and plot the results as before, but with a different fitted surface at each of 4 radiation levels:

rad <- seq(min(enviro$radiation), max(enviro$radiation), length = 4)
grd <- expand.grid(wind=wnd, temperature = tmp, radiation=rad,KEEP.OUT.ATTRS = FALSE) 
grd$ozo2 <- predict(ozo2, grd)  
grd$Radiation <- factor(grd$radiation)

Create a plot of the fitted values at different combinations of wind and temperature and at four different levels of radiation:

b <- ggplot(grd, aes(wind, temperature, z=ozo2)) + geom_tile(aes(fill=ozo2)) + scale_fill_gradientn(colours = terrain.colors(8))+stat_contour() + facet_wrap(~Radiation, ncol=2)
p <- plot_ly(grd, x=wind, y=temperature, z=ozo2, color = Radiation, xaxis = paste0("x", radiation), type='scatter3d', mode = "markers")
subplot(p)