# R scripts for the lecture course
# "Introduction to machine learning and pattern recognition", MPIA Feb./March 2008
# Coryn Bailer-Jones
#
# Lecture 3

########## Smoothing spline and regularization controlled by degrees-of-freedom

# Regression spline with fixed knots and no smoothing - smooth.spline() with spar=0
# See how varying nknots affects the fit. Need nknots >= 4 to avoid errors, and the
# maximum nknots is the no. of points in the data
library(fields)
plot(rat.diet$t, rat.diet$trt)
ss <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=20)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
#predict(ss, eval) # gives both x and y, so when we plot we just do
lines(predict(ss, eval), col='blue')
# Directly overplot fits with variable numbers of knots
lines(predict(smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=5), eval), col='red')

# See how df varies with nknots
knot.values <- seq(4,39,1)
Nval <- length(knot.values)
dof <- vector(length=Nval)
for (k in 1:Nval) {
  dof[k] <- smooth.spline(rat.diet$t, rat.diet$trt, spar=0, nknots=knot.values[k])$df
}
plot(knot.values, dof)
# If you repeat this but with smoothing, i.e. with non-zero spar or non-zero df, you see
# some interesting things. But generally we don't want to have to specify the number of
# knots; we would rather use all points as knots. See below!


# Smoothing spline - sreg
library(fields)
#?sreg
plot(rat.diet$t, rat.diet$trt)
eval <- seq(min(rat.diet$t), max(rat.diet$t), 1)
# Try various values of df and overplot in different colours.
# Try: 2, 3, 5, 20, 30, 39 (last is maximum)
sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=2)
lines(eval, predict(sreg1, eval), col='blue')
# Look at performance / dof
sreg1
# Look at the lower extreme: df=2, i.e. lambda=infinity (a straight line)
# df=infinity or lambda=0 (no regularization, i.e. a regression spline) doesn't work with
# this function: we must use splint() instead. Indeed, the maximum df is the no. of points.
# If we don't specify the df in sreg() then it is chosen by GCV
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
lines(eval, predict(sreg1, eval), col='red', lwd=1)

# See how the RMSE on the total data set varies with d.o.f. (This is a dummy to show that
# we actually have to do CV to find the parameter)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2,20,0.1)
Nval <- length(dof.values)
rmse <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t, rat.diet$trt, df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t) - rat.diet$trt
  rmse[k] <- rms(resid)
}
plot(dof.values, rmse, xlab='effective degrees of freedom', ylab='RMS')
# Question: what's happening here? Why does the error keep going down?
# Answer: we're fitting all the data. There's no validation set, so we're not testing the
# generalization performance

# Now carry out an exhaustive CV search for the optimal df using a training set
set.seed(10)
train <- sample(seq(1:dim(rat.diet)[1]), 20)
rms <- function(x) {sqrt( (1/(length(x)-1)) * sum(x^2) )}
dof.values <- seq(2,20,0.1)
Nval <- length(dof.values)
rmse2 <- vector(length=Nval)
for (k in 1:Nval) {
  sreg1 <- sreg(rat.diet$t[train], rat.diet$trt[train], df=dof.values[k])
  resid <- predict(sreg1, rat.diet$t[-train]) - rat.diet$trt[-train]
  rmse2[k] <- rms(resid)
}
plot(dof.values, rmse2, xlab='effective degrees of freedom', ylab='RMS', ylim=c(0,8))
lines(dof.values, rmse, col='red')
min(rmse2)
which.min(rmse2)
dof.values[which.min(rmse2)] # = 3.9 with seed=100, = 7.3 with seed=50
# The minimum found by GCV was df=7.5. With some training sets (i.e. some seeds) we see a
# second minimum near df=7.5. It's rather sensitive to the training set selection, which is
# perhaps not surprising as there are only 39 points in the data set. The most common
# minimum is around 7, but it's very shallow.
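# One way to reduce this sensitivity is k-fold cross-validation: average the test RMS over
# several train/test splits instead of relying on a single one. Below is a minimal
# illustrative sketch of this, reusing rat.diet and the rms() helper defined above; the
# number of folds and the df grid are arbitrary choices here.
set.seed(10)
nfold <- 5
folds <- sample(rep(1:nfold, length.out=nrow(rat.diet)))  # random fold assignment
dof.values <- seq(2, 20, 0.5)
cv.rmse <- vector(length=length(dof.values))
for (k in 1:length(dof.values)) {
  fold.rmse <- vector(length=nfold)
  for (f in 1:nfold) {
    test <- which(folds == f)
    sregf <- sreg(rat.diet$t[-test], rat.diet$trt[-test], df=dof.values[k])
    fold.rmse[f] <- rms(predict(sregf, rat.diet$t[test]) - rat.diet$trt[test])
  }
  cv.rmse[k] <- mean(fold.rmse)   # average test RMS over the folds
}
plot(dof.values, cv.rmse, xlab='effective degrees of freedom', ylab='k-fold CV RMS')
dof.values[which.min(cv.rmse)]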
# As noted above, if sreg() is used without specifying the df then this is found by GCV.
# The results of this are retained in the sreg object and can be inspected with plot()
sreg1 <- sreg(rat.diet$t, rat.diet$trt)
plot(sreg1)


########## Neural network application to the PS1 Teff prediction problem

# This will not work as is: you need to download the data.
# See http://www.mpia.de/homes/calj/ps1/PS1-CBJ-001.pdf for more details and the data
# xdat is a four-column matrix with the four colours
# phot.bas$teff[train.st] is a one-column vector of the Teff values
# train.st is a set of indices denoting the training data set
library(nnet)
par(mfrow=c(2,2))
for (nhid in c(1,2,4,8)) {
  nnet.teff <- nnet(y=log10(phot.bas$teff[train.st]), x=xdat[train.st,], size=nhid,
                    maxit=1000, linout=TRUE, abstol=1e-8)
  pred <- predict(nnet.teff, xdat[-train.st,])
  plot(log10(phot.bas$teff[-train.st]), pred, xlab='true log(Teff)',
       ylab='predicted log(Teff)', cex=0.4)
  mylab <- paste("No. nodes=", nhid, " RMS=",
                 formatC(rms(pred - log10(phot.bas$teff[-train.st])), format="f", digits=3))
  title(main=mylab)
}
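# Because nnet() starts from random initial weights, a single fit at each network size can
# be misleading. Below is a minimal sketch of a more careful selection of the number of
# hidden nodes, assuming the same xdat, phot.bas and train.st objects as above (so it also
# needs the downloaded data): fit each size a few times and keep the lowest test-set RMS.
library(nnet)
nhid.values <- c(1, 2, 4, 8)
ntrial <- 3   # arbitrary number of random restarts per network size
best.rmse <- vector(length=length(nhid.values))
for (k in 1:length(nhid.values)) {
  trial.rmse <- vector(length=ntrial)
  for (j in 1:ntrial) {
    fit <- nnet(y=log10(phot.bas$teff[train.st]), x=xdat[train.st,], size=nhid.values[k],
                maxit=1000, linout=TRUE, abstol=1e-8, trace=FALSE)
    pred <- predict(fit, xdat[-train.st,])
    trial.rmse[j] <- rms(pred - log10(phot.bas$teff[-train.st]))
  }
  best.rmse[k] <- min(trial.rmse)   # best of the random restarts
}
plot(nhid.values, best.rmse, xlab='no. of hidden nodes', ylab='test RMS in log(Teff)')
nhid.values[which.min(best.rmse)]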