# R scripts for the lecture course "Introduction to machine learning and
# pattern recognition", MPIA, Feb./March 2008
# Coryn Bailer-Jones
# Lecture 2

library(MASS)   # provides truehist, kde2d, lda, eqscplot, width.SJ and the geyser, GAGurine data sets

########## 1D density estimation - grass diagram

png("figures/1dexample.png")
x <- c(rnorm(21, 0.3, 0.1), rnorm(9, 0.8, 0.1))
plot(x, rep(1,30), type="h", ylim=c(0,5), ylab="frequency", cex.axis=2, cex.lab=2,
     lwd=2, col="red")
dev.off()

########## Density estimation

# Figure 5.8 on p. 127 of MASS4 (geyser data)
attach(geyser)
plot(waiting, duration)
par(mfrow=c(2,3))
# histograms are sensitive to the placing of the bin centres
truehist(duration, h=0.5, x0=0.0, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.1, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.2, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.3, xlim=c(0, 6), ymax=0.7)
truehist(duration, h=0.5, x0=0.4, xlim=c(0, 6), ymax=0.7)
# Improve by averaging the histograms
breaks <- seq(0, 5.9, 0.1)
counts <- numeric(length(breaks))
for(i in (0:4)) counts[i+(1:55)] <- counts[i+(1:55)] +
    rep(hist(duration, breaks=0.1*i + seq(0, 5.5, 0.5), prob=TRUE, plot=FALSE)$density,
        rep(5,11))
plot(breaks+0.05, counts/5, type="l", xlab="duration", ylab="averaged", bty="n",
     xlim=c(0, 6), ylim=c(0, 0.7))
detach()

# So use Gaussian kernel density estimation instead
# Fig. 5.9 on p. 128 of MASS4
# nrd, SJ and SJ-dpi are different methods for choosing the fixed bandwidth;
# do ?bw.nrd for more details. To get the values used:
#   bw.nrd(geyser$duration)              = 0.389
#   bw.SJ(geyser$duration)               = 0.090
#   bw.SJ(geyser$duration, method="dpi") = 0.144
attach(geyser)
par(mfrow=c(1,2))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "nrd"))
truehist(duration, nbins = 15, xlim = c(0.5, 6), ymax = 1.2)
lines(density(duration, width = "SJ", n = 256), lty = 3)   # dotted line
lines(density(duration, width = "SJ-dpi", n = 256), lty = 1)
detach()

########## Density estimation in 2D

# Fig. 5.11 on p. 131 of MASS4
geyser2 <- data.frame(as.data.frame(geyser)[-1, ], pduration = geyser$duration[-299])
attach(geyser2)
par(mfrow=c(2, 2), mgp=c(1.8,0.5,0), mar=c(3,3,3,3))
plot(pduration, waiting, xlim = c(0.5, 6), ylim = c(40, 110),
     xlab = "previous duration", ylab = "waiting")
f1 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110))
# on a linear scale:
#image(f1, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
# to get it on a log scale do:
image(f1$x, f1$y, log(f1$z), zlim = c(-40, 0), xlab = "previous duration", ylab = "waiting")
f2 <- kde2d(pduration, waiting, n = 50, lims=c(0.5, 6, 40, 110),
            h = c(width.SJ(duration), width.SJ(waiting)))
image(f2, zlim = c(0, 0.075), xlab = "previous duration", ylab = "waiting")
persp(f2, phi = 30, theta = 20, d = 5, xlab = "previous duration", ylab = "waiting", zlab = "")
detach()

########## Classification via density modelling and using MLE

# Generate two clusters of Gaussian data
set.seed(20)
x1 <- rnorm(100, mean=0, sd=0.50)
y1 <- rnorm(100, mean=0, sd=0.50)
x2 <- rnorm(100, mean=1, sd=0.70)
y2 <- rnorm(100, mean=1, sd=0.30)
par(mfrow=c(1,1))
plot(x1, y1, type='n', xlim=c(-1.5,3.0), ylim=c(-1.5,3.0), xlab='x', ylab='y')
points(x1, y1, pch=19, col=2)
points(x2, y2, pch=23, col=3, bg=3)
# infer the properties of each class PDF on the assumption that x and y are independent
( mean1 <- c(mean(x1), mean(y1)) )
( mean2 <- c(mean(x2), mean(y2)) )
( sd1 <- c(sd(x1), sd(y1)) )
( sd2 <- c(sd(x2), sd(y2)) )
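# As a minimal sketch (an added illustration; the test point below is made up), a new
# point can be classified by comparing the class-conditional likelihoods under the
# fitted independent Gaussians, assuming equal class priors:
xnew <- 0.6 ; ynew <- 0.4     # hypothetical test point
like1 <- dnorm(xnew, mean1[1], sd1[1]) * dnorm(ynew, mean1[2], sd1[2])
like2 <- dnorm(xnew, mean2[1], sd2[1]) * dnorm(ynew, mean2[2], sd2[2])
like1 / (like1 + like2)       # posterior probability of class 1 under equal priors
which.max(c(like1, like2))    # assign to the class with the larger likelihood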
# build the covariance matrices and work out the eigenvalues and eigenvectors,
# i.e. without assuming that x and y are independent
covmat1 <- matrix( c(cov(x1,x1), cov(x1,y1), cov(y1,x1), cov(y1,y1)), nrow=2 )
covmat2 <- matrix( c(cov(x2,x2), cov(x2,y2), cov(y2,x2), cov(y2,y2)), nrow=2 )
eigen1 <- eigen(covmat1, symmetric=TRUE)
eigen2 <- eigen(covmat2, symmetric=TRUE)
# the standard deviations along the principal directions are
sqrt(eigen1$values)
sqrt(eigen2$values)
# note the differences in the principal directions (eigenvectors) for class 1. This is
# a symmetric Gaussian, so the principal directions could happily lie in any direction.
# Alternatively use the SVD, e.g.
svd(covmat2)
sqrt(svd(covmat2)$d)
# overplot the fitted Gaussians, using the fact that x and y are independent
xgrid <- seq(-2, 3, 0.01)
ygrid <- seq(-2, 3, 0.01)
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x1), sd=sd(x1)) %o%
         dnorm(ygrid, mean=mean(y1), sd=sd(y1)), col=2, add=TRUE )
contour( xgrid, ygrid, dnorm(xgrid, mean=mean(x2), sd=sd(x2)) %o%
         dnorm(ygrid, mean=mean(y2), sd=sd(y2)), col=3, add=TRUE )
# Decision boundaries are quadratic in general. They are linear if the two class
# covariance matrices are equal.

########## LDA

# Model the class distributions as multivariate Gaussians. LDA is the special case in
# which we assume that the classes have equal covariance matrices (Hastie et al. p. 86).
# If we drop this assumption, we have QDA.
# Let there be g classes in a p-dimensional data space. LDA determines the mean of each
# class and uses the proximity of a new data point to these means to make a
# classification (taking into account the covariance matrix and the prior probabilities;
# if we sphere the data and have equal priors, we just take the nearest centroid).
# Further, the g centroids in the p-dimensional data space lie in an affine subspace of
# dimension g-1. In locating the nearest centroid, directions orthogonal to this subspace
# do not discriminate between the classes (within the LDA assumptions), so we may as well
# project the data onto this subspace. Thus LDA performs a dimension reduction,
# p -> g-1, i.e. for g classes there are at most g-1 linear discriminants (LDs). The LDs
# are mutually orthogonal vectors in the (sphered) data space which maximally
# discriminate between the classes. One can also show that they maximise the
# between-class variance relative to the within-class variance (MASS4 section 12.1).
# The LDs are ordered in terms of their separation significance.
data(iris3)
Iris <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]),
                   Sp = factor(rep(c("s","c","v"), rep(50,3))))   # ensure Sp is a factor
ir.lda <- lda(Sp ~ ., Iris)
# For the data frame Iris, this performs an LDA in which the groups are given by the
# named column Sp and the explanatory variables are everything else (this is the meaning
# of the "." on the RHS of the formula; at least, we get the same result if we use
# "Sepal.L. + Sepal.W. + Petal.L. + Petal.W." on the RHS).
# To predict using this we use
ir.ld <- predict(ir.lda)   # "newdata" is not specified, so it predicts using the original data set
eqscplot(ir.ld$x, type="n", xlab="LD1", ylab="LD2")   # set up the plot window
# plot the data as coloured characters
text(ir.ld$x, labels=as.character(Iris$Sp), col=1+as.numeric(Iris$Sp), cex=1.0)
# Evaluate the contingency table of the classification results
t(table(Iris$Sp, ir.ld$class))   # transpose so that a true class is a column
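# As an aside (an added sketch), the same fit can be done with QDA, which drops the
# equal-covariance assumption; MASS provides qda() with the same formula interface:
ir.qda <- qda(Sp ~ ., Iris)
ir.qd <- predict(ir.qda)          # again predicts on the training data
t(table(Iris$Sp, ir.qd$class))    # compare with the LDA contingency table above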
# In LDA the class boundaries are linear hyperplanes: for g classes there are at most
# g(g-1)/2 pairwise boundaries. The separating hyperplanes (boundaries) between the
# groups are NOT simply the perpendicular bisectors of the lines joining the class
# centroids. They are only that when the (common) covariance matrix is spherical
# (= identity matrix times a constant) and the priors are equal. (The former condition
# could be satisfied by plotting with the sphered data.) Boundaries are plotted in the
# crabs example in MASS4 at the bottom of p. 335 using the perp() function. See Figs.
# 4.1 and 4.5 of Hastie et al. In practice, however (esp. for QDA), it might be easier
# to compute the boundaries exhaustively, i.e. evaluate the decision rule on a finite
# lattice of points and then use a contouring method to draw the boundaries (as in
# Hastie et al., footnote on p. 88).

# Repeat, but with separate train and test sets
train <- sample(1:150, 100)
ir.lda <- lda(Sp ~ ., Iris, subset=train)
ir.ld <- predict(ir.lda)
ir.ld.test <- predict(ir.lda, Iris[-train, ])$class
t(table(Iris$Sp[train], ir.ld$class))    # results on the training set
t(table(Iris$Sp[-train], ir.ld.test))    # results on the test set

########## Splines

### First look at fitting with polynomials
attach(GAGurine)
#par(mfrow = c(3, 2))
plot(Age, GAG)
GAG.lm <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6) +
             I(Age^7) + I(Age^8))
anova(GAG.lm)   # 7th and 8th powers are not significant
GAG.lm2 <- lm(GAG ~ Age + I(Age^2) + I(Age^3) + I(Age^4) + I(Age^5) + I(Age^6))
xx <- seq(0, 17, len = 200)
lines(xx, predict(GAG.lm2, data.frame(Age = xx)))
detach(GAGurine)

### Now look at various types of splines

# An interpolating spline (a regression spline with a knot at each data point) exactly
# fits all the points
library(fields)   # needed for splint() and the rat.diet data set
x.grid <- seq(0, 120, length=200)
fit <- splint(rat.diet$t, rat.diet$trt, x.grid)
plot(rat.diet$t, rat.diet$trt)
lines(x.grid, fit)

# Now use the GAGurine data
attach(GAGurine)
# Exact spline (i.e. regression spline with a knot at each point)
age.grid <- seq(min(Age), max(Age), 0.01)
fit <- splint(Age, GAG, age.grid)
plot(Age, GAG)
lines(age.grid, fit)

# Linear fit - do this to inspect the degrees of freedom
lm1 <- lm(GAG ~ Age)
# Need to put the prediction grid in a named data frame so that predict.lm recognises it
age.grid <- data.frame(seq(min(Age), max(Age), 0.1))
colnames(age.grid) <- "Age"
plot(Age, GAG)
lines(age.grid$Age, predict(lm1, age.grid), col='red', lwd=2)
# We know that df=2 for a straight-line fit. The following gives 312, i.e. the total
# number of d.o.f. available is equal to the number of measurements, which is 314
lm1$df.residual

# For some reason sreg does not work on the GAGurine data, e.g.
#   predict(sreg(Age, GAG))
# gives errors, even though it is the same syntax as the example in the sreg help page
# (a smooth.spline alternative is sketched below)

# More on splines in next lecture...
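# As an added alternative sketch, base R's smooth.spline() will fit a smoothing spline
# to these data, with the smoothing parameter chosen by (generalised) cross-validation:
ss <- smooth.spline(Age, GAG)
plot(Age, GAG)
lines(predict(ss, seq(min(Age), max(Age), 0.1)), col='blue', lwd=2)
ss$df   # effective degrees of freedom chosen by the cross-validation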