# R scripts for the lecture course
# "Introduction to machine learning and pattern recognition"
# MPIA Feb./March 2008, Coryn Bailer-Jones
# Lecture 1

########## Linear regression in R (from MASS section 1.3) ##########
# plus some general introduction to R

library(MASS)
x <- seq(1, 20, 0.5)
x
w <- 1 + x/2
set.seed(30)
y <- x + w*rnorm(x, mean=0, sd=1)   # rnorm(x) is the same as rnorm(length(x))
dum <- data.frame(x, y, w)
dum
rm(x, y, w)
fm <- lm(y ~ x, data=dum)
#?lm
summary(fm)
attributes(fm)
fm$fitted.values
fitted(fm)
plot(x, y)        # doesn't work, as x and y are no longer on the search path
plot(dum$x, dum$y)
attach(dum)       # put the components of dum on the search path
lines(x, fitted(fm), col=2, lwd=2)
fm1 <- lm(y ~ x, data=dum, weights=1/w^2)
summary(fm1)
abline(fm1, col=3, lwd=2)    # a simpler way to plot model fits
lrf <- loess(y ~ x, dum)     # local polynomial regression
lines(spline(x, fitted(lrf)), col=4, lwd=2)   # loess only predicts at the data points:
                                              # spline interpolation is used for plotting
par(mfrow=c(1,2))
plot(x, y)
abline(fm, col=2, lwd=2) ; abline(fm1, col=3, lwd=3) ; lines(spline(x, fitted(lrf)), col=4, lwd=2)
plot(y, resid(fm), col=2, xlab="y", ylab="Residuals")
abline(h=0, lty=2, lwd=2)
points(y, resid(lrf), col=4, pch=25)
detach()
rm(fm, fm1, lrf, dum)
par(mfrow=c(1,1))


########## Learning, generalization and regularization
# After Bishop ch. 9

x <- seq(0, 1, 0.05)
h <- function(x){0.5 + 0.4*sin(2*pi*x)}   # define the true function
set.seed(10)
h_noisy <- h(x) + rnorm(n=x, sd=0.05)
plot(x, h_noisy, pch=16)
x_temp <- seq(0, 1, 0.01)
lines(x_temp, h(x_temp), lty=2, lwd=2)

# Predict using locpoly
# With locpoly we cannot define a prediction grid (only the number of points, spread uniformly)
library(KernSmooth)
kernelwidth <- 0.05
locpoly(x, h_noisy, bandwidth=kernelwidth)
lines(locpoly(x, h_noisy, bandwidth=kernelwidth), col='blue', lwd=2)
# Repeat with bandwidth values:
#   0.15 too smooth
#   0.05 overfit

# Predict using ksmooth
library(stats)
# replot the training data and the true curve
plot(x, h_noisy, pch=16) ; lines(x_temp, h(x_temp), lty=2, lwd=2)
kernelwidth <- 0.05
ksmooth(x, h_noisy, x.points=x, bandwidth=0.2, kernel='normal')
lines(ksmooth(x, h_noisy, x.points=x, bandwidth=kernelwidth, kernel='normal'), col='red', lwd=2)
# Repeat with bandwidth values:
#   0.5 too smooth
#   0.1 okay
#   0.0 overfit

# Define a test set
xtest <- seq(0.025, 1, 0.05)
set.seed(20)
htest_noisy <- h(xtest) + rnorm(n=xtest, sd=0.05)
# plot and compare with the fits from the training data
points(xtest, htest_noisy, col='blue', pch=17)
# predict at the test data points
ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')
resid <- ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')$y - htest_noisy
# define an RMSE function
rmse <- function(x){ sqrt( (1/length(x))*sum(x^2) ) }
rmse(resid)
# training error
rmse( ksmooth(x, h_noisy, x.points=x, bandwidth=kernelwidth, kernel='normal')$y - h_noisy )
# test error
rmse( ksmooth(x, h_noisy, x.points=xtest, bandwidth=kernelwidth, kernel='normal')$y - htest_noisy )

# Note that locpoly uses a linear fit over the bandwidth region about each point, yet appears
# to produce a nonlinear function. Actually, it doesn't produce a function at all: it just makes
# point estimates at 401 points spread evenly over the x-axis (this number is fixed by the
# parameter gridsize). That is, at each of these points it does a linear fit to the data lying
# within the box of size +/- bandwidth/2, and this is the predicted y-value.
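# Added illustration (not part of the original script): the object returned by locpoly is just
# a list of x and y values whose length is set by its gridsize argument (401 by default), which
# makes the point-estimation behaviour described above easy to verify.
lp <- locpoly(x, h_noisy, bandwidth=kernelwidth)
length(lp$x)    # 401 evaluation points by default
lp2 <- locpoly(x, h_noisy, bandwidth=kernelwidth, gridsize=51)
length(lp2$x)   # a coarser grid of 51 points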
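# Added illustration (not part of the original script): loop over a few ksmooth bandwidths and
# compare the training and test RMSE defined above. The training error typically keeps falling
# as the bandwidth shrinks, while the test error passes through a minimum and then rises again
# (overfitting). Very small bandwidths can leave some prediction points with no data inside the
# kernel window, giving NAs, so the sweep stops at 0.05.
bandwidths <- c(0.5, 0.3, 0.2, 0.1, 0.05)
err <- t(sapply(bandwidths, function(b) {
  c(train = rmse(ksmooth(x, h_noisy, x.points=x,     bandwidth=b, kernel='normal')$y - h_noisy),
    test  = rmse(ksmooth(x, h_noisy, x.points=xtest, bandwidth=b, kernel='normal')$y - htest_noisy))
}))
cbind(bandwidth=bandwidths, err)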
# With a neural network: it stubbornly refuses to overfit
library(nnet)
predict(nnet(x, h_noisy, size=5, decay=0, maxit=1000, linout=TRUE), as.matrix(x))
# A better fit would be obtained using a sine function, of course, or at least something
# which is periodic. Domain knowledge helps, but only if it is correct!


########## K-means clustering
library(cluster)
# swiss.x contains 5 socio-economic indicators for each of 47 Swiss provinces from 1888
swiss.x <- as.matrix(swiss[,-1])
# select the Nclust initial cluster centres as a random selection of Nclust data vectors
Nclust <- 3
Imax <- 5
set.seed(100)
init <- sample(dim(swiss.x)[1], Nclust)   # indices of the initial Nclust prototypes
km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
# Plot the data using these two columns. This is just a projection, so the clusters may not
# be at all obvious, as is the case with i <- 2 ; j <- 5
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=0.75)
points(km$centers[,i], km$centers[,j], pch=3, cex=1.4, col=c('blue', 'green', 'red'))
sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')

# Use the following to produce plots for different Imax
par(mfrow=c(2,2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
i <- 1 ; j <- 2
plot(swiss.x[,i], swiss.x[,j], pch=20)
title(main=paste(Nclust, 'initial cluster centres'), cex.main=1.0)
points(swiss.x[init,i], swiss.x[init,j], pch=3, cex=1.4)
Nclust <- 3
for (Imax in 1:3) {
  set.seed(45)
  init <- sample(dim(swiss.x)[1], Nclust)
  km <- kmeans(swiss.x, centers=swiss.x[init,], iter.max=Imax)
  plot(swiss.x[,i], swiss.x[,j], pch=20)
  title(main=paste(Nclust, 'clusters with', Imax, 'iterations'), cex.main=1.0)
  points(km$centers[,i], km$centers[,j], pch=3, cex=1.8, col=c('blue', 'green', 'red'))
  sel <- which(km$cluster==1) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='blue')
  sel <- which(km$cluster==2) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='green')
  sel <- which(km$cluster==3) ; points(swiss.x[sel,i], swiss.x[sel,j], pch=20, col='red')
}
# Now repeat using different initial cluster centres. With seeds 100 and 45 we have quite
# different starting conditions, yet the algorithm converges quickly to the same solution.
# The final cluster labels (colours) are, of course, arbitrary.
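# Added check (not part of the original script): run k-means from the two different random
# initializations (seeds 100 and 45) and cross-tabulate the resulting cluster labels. If every
# row of the table has a single non-zero entry, the two partitions agree up to a relabelling
# of the clusters.
set.seed(100) ; init1 <- sample(dim(swiss.x)[1], Nclust)
set.seed(45)  ; init2 <- sample(dim(swiss.x)[1], Nclust)
km1 <- kmeans(swiss.x, centers=swiss.x[init1,], iter.max=10)
km2 <- kmeans(swiss.x, centers=swiss.x[init2,], iter.max=10)
table(km1$cluster, km2$cluster)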