DATA MINING
Desktop Survival Guide
by Graham Williams

### Alternative Clustering

For model-based clustering see the BIC algorithm in the mclust package. This estimates density with a mixture of Gaussians.

For density-based clustering the following implementation of DBSCAN may be useful. It follows the notation of the original KDD-96 DBSCAN paper. For large datasets, it may be slow.

 ```#Christian Hennig distvector <- function(x,data) { ddata <- t(data)-x dv <- apply(ddata^2,2,sum) } # data may be nxp or distance matrix # eps is the dbscan distance cutoff parameter # MinPts is the minimum size of a cluster # scale: Should the data be scaled? # distances: has to be TRUE if data is a distance matrix # showplot: Should the computation process be visualized? # countmode: dbscan gives messages when processing point no. (countmode) dbscan <- function(data,eps,MinPts=5, scale=FALSE, distances=FALSE, showplot=FALSE, countmode=c(1,2,3,5,10,100,1000,5000,10000,50000)){ data <- as.matrix(data) n <- nrow(data) if (scale) data <- scale(data) unregpoints <- rep(0,n) e2 <- eps^2 cv <- rep(0,n) cn <- 0 i <- 1 for (i in 1:n){ if (i %in% countmode) cat("Processing point ", i," of ",n, ".\n") unclass <- cv<1 if (cv[i]==0){ if (distances) seeds <- data[i,]<=eps else{ seeds <- rep(FALSE,n) seeds[unclass] <- distvector(data[i,],data[unclass,])<=e2 } if (sum(seeds)+unregpoints[i]0){ if (showplot) plot(data,col=1+cv) unclass[seeds] <- FALSE cv[seeds] <- cn ap <- (1:n)[seeds] # print(ap) seeds <- rep(FALSE,n) for (j in ap){ # if (showplot) plot(data,col=1+cv) jseeds <- rep(FALSE,n) if (distances) jseeds[unclass] <- data[j,unclass]<=eps else{ jseeds[unclass] <- distvector(data[j,],data[unclass,])<=e2 } unregpoints[jseeds] <- unregpoints[jseeds]+1 # if (cn==1) # cat(j," sum seeds=",sum(seeds)," unreg=",unregpoints[j], # " newseeds=",sum(cv[jseeds]==0),"\n") if (sum(jseeds)+unregpoints[j]>=MinPts){ seeds[jseeds] <- cv[jseeds]==0 cv[jseeds & cv<0] <- cn } } # for j } # while sum seeds>0 } # else (sum seeds + ... >= MinPts) } # if cv==0 } # for i if (sum(cv==(-1))>0){ noisenumber <- cn+1 cv[cv==(-1)] <- noisenumber } else noisenumber <- FALSE out <- list(classification=cv, noisenumber=noisenumber, eps=eps, MinPts=MinPts, unregpoints=unregpoints) out } # dbscan # classification: classification vector # noisenumber: number in the classification vector indicating noise points # unregpoints: ignore... ```