# R scripts for the lecture course
# "Introduction to machine learning and pattern recognition", MPIA Feb./March 2008
# Coryn Bailer-Jones
#
# Lecture 3

########## Smoothing spline and regularization controlled by degrees-of-freedom

# Regression spline with fixed knots and no smoothing - smooth.spline() with spar=0
# See how varying nknots affects the fit. Need nknots >= 4 to avoid errors, and the
# maximum nknots is the no. of points in the data
library(fields)
plot(rat.diet$t, rat.diet$trt)
ss <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=20)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
#predict(ss, eval) # gives both x and y, so when we plot we just do
lines(predict(ss, eval), col='blue')
# Directly overplot fits with variable numbers of knots
lines(predict(smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=5), eval), col='red')

# See how df varies with nknots
knot.values <- seq(4,39,1)
Nval <- length(knot.values)
dof <- vector(length=Nval)
for (k in 1:Nval) {
  dof[k] <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=knot.values[k])$df
}
plot(knot.values, dof)
# If you repeat this but with smoothing, i.e. with non-zero spar or non-zero df, you see
# some interesting things. But generally we don't want to have to specify the number of
# knots; we would rather use all points as knots. See below!


# Smoothing spline - sreg
library(fields)
#?sreg
plot(rat.diet$t, rat.diet$trt)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# Try various values of df and overplot in different colours.
# Try: 2, 3, 5, 20, 30, 39 (last is maximum)
sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=2)
lines(eval, predict(sreg1, eval), col='blue')
# Look at performance / dof
sreg1
# Look at the lower extreme: df=2, i.e. lambda=infinity (a straight line)
# df=infinity or lambda=0 (no regularization, i.e. a regression spline) doesn't work with
# this function: we must use splint() instead. Indeed, the maximum df is the no. of points.
# If we don't specify the df in sreg() then it is chosen by GCV
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
lines(eval, predict(sreg1, eval), col='red', lwd=1)

# See how the RMSE on the total data set varies with d.o.f. (This is a dummy to show that
# we actually have to do CV to find the parameter)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2,20,0.1)
Nval <- length(dof.values)
rmse <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t) - rat.diet$trt
  rmse[k] <- rms(resid)
}
plot(dof.values, rmse, xlab='effective degrees of freedom', ylab='RMS')
# Question: what's happening here? Why does the error keep going down?
# Answer: we're fitting all the data. There's no validation set, so we're not testing the
# generalization performance

# Now carry out an exhaustive CV search for the optimal df using a training set
set.seed(10)
train <- sample(seq(1:dim(rat.diet)[1]), 20)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2,20,0.1)
Nval <- length(dof.values)
rmse2 <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t[-train]) - rat.diet$trt[-train]
  rmse2[k] <- rms(resid)
}
plot(dof.values, rmse2, xlab='effective degrees of freedom', ylab='RMS', ylim=c(0,8))
lines(dof.values, rmse, col='red')
min(rmse2)
which.min(rmse2)
dof.values[which.min(rmse2)] # = 3.9 with seed=100, = 7.3 with seed=50
# The minimum found by GCV was df=7.5. With some training sets (i.e. some seeds) we see a
# second minimum near df=7.5. It's rather sensitive to the training set selection, which is
# perhaps not surprising as there are only 39 points in the data set. The most common
# minimum is around 7, but it's very shallow.
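# One way to reduce this sensitivity is k-fold cross-validation: average the test RMS over
# several train/test splits instead of relying on a single one. Below is a minimal
# illustrative sketch of this, reusing rat.diet and the rms() helper defined above; the
# number of folds and the df grid are arbitrary choices here.
set.seed(10)
nfold <- 5
folds <- sample(rep(1:nfold, length.out=nrow(rat.diet)))  # random fold assignment
dof.values <- seq(2, 20, 0.5)
cv.rmse <- vector(length=length(dof.values))
for (k in 1:length(dof.values)) {
  fold.rmse <- vector(length=nfold)
  for (f in 1:nfold) {
    test <- which(folds == f)
    sregf <- sreg(rat.diet$t[-test], rat.diet$trt[-test], df=dof.values[k])
    fold.rmse[f] <- rms(predict(sregf, rat.diet$t[test]) - rat.diet$trt[test])
  }
  cv.rmse[k] <- mean(fold.rmse)   # average test RMS over the folds
}
plot(dof.values, cv.rmse, xlab='effective degrees of freedom', ylab='k-fold CV RMS')
dof.values[which.min(cv.rmse)]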
# As noted above, if sreg() is used without specifying the df then this is found by GCV.
# The results of this are retained in the sreg object and can be inspected with plot()
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
plot(sreg1)


########## Neural network application to the PS1 Teff prediction problem

# This will not work as is: you need to download the data.
# See http://www.mpia.de/homes/calj/ps1/PS1-CBJ-001.pdf for more details and the data
# xdat is a four-column matrix with the four colours
# phot.bas$teff[train.st] is a one-column vector of the Teff values
# train.st is a set of indices denoting the training data set
library(nnet)
par(mfrow=c(2,2))
for (nhid in c(1,2,4,8)) {
  nnet.teff <- nnet(y=log10(phot.bas$teff[train.st]), x=xdat[train.st,], size=nhid,
                    maxit=1000, linout=TRUE, abstol=1e-8)
  pred <- predict(nnet.teff, xdat[-train.st,])
  plot(log10(phot.bas$teff[-train.st]), pred, xlab='true log(Teff)',
       ylab='predicted log(Teff)', cex=0.4)
  mylab <- paste("No. nodes=", nhid, " RMS=",
                 formatC(rms(pred - log10(phot.bas$teff[-train.st])), format="f", digits=3))
  title(main=mylab)
}
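# Because nnet() starts from random initial weights, a single fit at each network size can
# be misleading. Below is a minimal sketch of a more careful selection of the number of
# hidden nodes, assuming the same xdat, phot.bas and train.st objects as above (so it also
# needs the downloaded data): fit each size a few times and keep the lowest test-set RMS.
library(nnet)
nhid.values <- c(1, 2, 4, 8)
ntrial <- 3   # arbitrary number of random restarts per network size
best.rmse <- vector(length=length(nhid.values))
for (k in 1:length(nhid.values)) {
  trial.rmse <- vector(length=ntrial)
  for (j in 1:ntrial) {
    fit <- nnet(y=log10(phot.bas$teff[train.st]), x=xdat[train.st,], size=nhid.values[k],
                maxit=1000, linout=TRUE, abstol=1e-8, trace=FALSE)
    pred <- predict(fit, xdat[-train.st,])
    trial.rmse[j] <- rms(pred - log10(phot.bas$teff[-train.st]))
  }
  best.rmse[k] <- min(trial.rmse)   # best of the random restarts
}
plot(nhid.values, best.rmse, xlab='no. of hidden nodes', ylab='test RMS in log(Teff)')
nhid.values[which.min(best.rmse)]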