Incorporating additional information into the formation of clusters.

Covariate assisted spectral clustering (CASC) is a way of including additional node level data to bias the clusters towards something that we suspect matters. In this case, we want clusters that are “continuous in space”.

Below, I have not included the literal version of CASC. Instead, the approach is as follows.

  1. Construct two graphs. The first graph is the same as above. The second graph is the 10 nearest neighbor graph for zip code locations.
  2. Compute the regularized \(L_{sym}\) matrix for both graphs. Add these matrices to form a new similarity matrix.
  3. Compute the leading eigenvectors of this matrix and row normalize. Run k-means.
load(url("http://pages.stat.wisc.edu/~karlrohe/netsci/data/zipA.RData"))
library(FNN)
dat = as.matrix(zipcode[rownames(A),c("longitude", "latitude"), with = F])
good = complete.cases(dat)
Knng = get.knn(dat[good,])
dict = rownames(A)[good]
cbind(dict[1], dict[Knng$nn.index[1,]])
##       [,1]    [,2]   
##  [1,] "21201" "21202"
##  [2,] "21201" "21217"
##  [3,] "21201" "21230"
##  [4,] "21201" "21223"
##  [5,] "21201" "21287"
##  [6,] "21201" "21231"
##  [7,] "21201" "21218"
##  [8,] "21201" "21211"
##  [9,] "21201" "21213"
## [10,] "21201" "21205"
knnEL = matrix("", nrow = prod(dim(Knng$nn.index)), ncol = 2)
for(i in 1:nrow(Knng$nn.index))  knnEL[1:10 + 10*(i-1),] = cbind(dict[i], dict[Knng$nn.index[i,]])
gknn=graph.edgelist(knnEL,directed = F) # this creates a graph.
Aknn = get.adjacency(gknn)
cross = match(rownames(A)[good], rownames(Aknn))
mean(rownames(Aknn)[cross] == rownames(A)[good])
## [1] 1
Aknn = Aknn[cross,cross]

CASC = function(A, B, k){
  rs = rowSums(A)
  D = Diagonal(n = nrow(A), 1/sqrt(rs + mean(rs)))
  La = D%*%A%*%D
  
  rs = rowSums(B)
  D = Diagonal(n = nrow(B), 1/sqrt(rs + mean(rs)))
  Lb = D%*%B%*%D
  
  # Matrix won't allow addition La+Lb ... ugh. 
  # do it by converting to igraph and take a union of graphs.
  ga = graph.adjacency(La,weighted = T)
  gb = graph.adjacency(Lb,weighted = T)
  gg = ga %u% gb
  E(gg)$weight_1[is.na(E(gg)$weight_1)] = 0
  E(gg)$weight_2[is.na(E(gg)$weight_2)] = 0
  E(gg)$weight = E(gg)$weight_1 + E(gg)$weight_2
  L2 = get.adjacency(gg,attr = "weight")
  
  X = eigs(L2,k)$vec
  X = t(apply(X, 1, function(x) return(x/sqrt(sum(x^2)))))
  return(kmeans(X, k, nstart = 20))
}

csc = CASC(A,Aknn, 10)
loc = zipcode[rownames(A), c("longitude", "latitude"), with  =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
     xlim = c(-125, -65), ylim = c(24, 50), pch  =19)
## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced
## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced
map('state', add = T) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:10){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}

# dev.off()
k = 250
csc = CASC(A,Aknn, k)
loc = zipcode[rownames(A), c("longitude", "latitude"), with  =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
     xlim = c(-125, -65), ylim = c(24, 50), pch  =19)
## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced
## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced
map('state', add = T) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}

# dev.off()

map('state', add = F) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}