Incorporating additional information into the formation of clusters.

Covariate assisted spectral clustering (CASC) is a way of including additional node level data to bias the clusters towards something that we suspect matters. In this case, we want clusters that are “continuous in space”.

Below, I have not included the literal version of CASC. Instead, the approach is as follows.

Construct two graphs. The first graph is the same as above. The second graph is the 10 nearest neighbor graph for zip code locations.
Compute the regularized \(L_{sym}\) matrix for both graphs. Add these matrices to form a new similarity matrix.
Compute the leading eigenvectors of this matrix and row normalize. Run k-means.

load(url("http://pages.stat.wisc.edu/~karlrohe/netsci/data/zipA.RData"))
library(FNN)
dat = as.matrix(zipcode[rownames(A),c("longitude", "latitude"), with = F])
good = complete.cases(dat)
Knng = get.knn(dat[good,])
dict = rownames(A)[good]
cbind(dict[1], dict[Knng$nn.index[1,]])

##       [,1]    [,2]   
##  [1,] "21201" "21202"
##  [2,] "21201" "21217"
##  [3,] "21201" "21230"
##  [4,] "21201" "21223"
##  [5,] "21201" "21287"
##  [6,] "21201" "21231"
##  [7,] "21201" "21218"
##  [8,] "21201" "21211"
##  [9,] "21201" "21213"
## [10,] "21201" "21205"

knnEL = matrix("", nrow = prod(dim(Knng$nn.index)), ncol = 2)
for(i in 1:nrow(Knng$nn.index))  knnEL[1:10 + 10*(i-1),] = cbind(dict[i], dict[Knng$nn.index[i,]])
gknn=graph.edgelist(knnEL,directed = F) # this creates a graph.
Aknn = get.adjacency(gknn)
cross = match(rownames(A)[good], rownames(Aknn))
mean(rownames(Aknn)[cross] == rownames(A)[good])

## [1] 1

Aknn = Aknn[cross,cross]

# this loads a function called "sumGraphs" that takes to adjacency matrices, converts them to symmetric Laplacians, and adds these matrices together.  
source(url("http://pages.stat.wisc.edu/~karlrohe/netsci/code/globalTransitivityFunctions.R"))
CASC = function(A, B, k){
  X = eigs(sumGraphs(A,B),k)$vec
  X = t(apply(X, 1, function(x) return(x/sqrt(sum(x^2)))))
  return(kmeans(X, k, nstart = 20))
  }

csc = CASC(A,Aknn, 10)

## Warning: closing unused connection 6
## (http://pages.stat.wisc.edu/~karlrohe/netsci/code/globalTransitivityFunctions.R)

sort(table(csc$clust))

## 
##    7    4    9   10    2    1    3    5    8    6 
##  499  775  898  925  989 1004 1432 1516 1805 3105

loc = zipcode[rownames(A), c("longitude", "latitude"), with  =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
map('state', add = F) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
points(rnorm(comploc[,1],comploc[,1],.1),rnorm(comploc[,2],comploc[,2],.1), col=as.factor(csc$clust[good]), pch  =19)

for(i in 1:10){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

# dev.off()

k = 250
csc = CASC(A,Aknn, k)
as.vector(sort(table(csc$clust)))

##   [1]   7  10  11  12  13  13  14  15  19  19  19  19  20  20  21  21  21
##  [18]  21  21  21  22  22  23  23  23  24  24  24  25  25  26  26  26  26
##  [35]  26  27  27  28  28  28  29  30  30  30  30  30  31  31  32  33  33
##  [52]  33  33  33  34  34  34  34  34  34  35  35  35  35  35  35  35  36
##  [69]  36  37  37  37  37  37  37  37  38  38  38  38  38  40  40  40  40
##  [86]  40  41  41  41  42  42  42  42  43  43  43  43  43  43  43  43  44
## [103]  44  44  44  45  45  46  46  46  46  47  47  48  48  48  48  49  49
## [120]  49  50  50  50  50  50  50  50  50  51  51  51  51  51  51  52  52
## [137]  52  52  53  53  53  54  54  54  54  54  55  55  55  55  56  56  56
## [154]  56  56  56  57  57  57  57  57  57  58  58  58  59  59  59  59  59
## [171]  59  60  60  60  60  62  62  63  63  63  63  63  63  63  64  64  64
## [188]  64  64  65  65  65  66  66  67  67  68  68  68  68  69  69  70  70
## [205]  71  71  72  72  72  73  73  74  74  75  75  75  76  77  77  78  79
## [222]  80  80  81  84  85  85  86  89  89  91  91  92  92  94  95  96  97
## [239]  99  99  99 100 100 101 101 108 109 110 113 167

plot(as.vector(sort(table(csc$clust))))

loc = zipcode[rownames(A), c("longitude", "latitude"), with  =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
     xlim = c(-125, -65), ylim = c(24, 50), pch  =19)

## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced

## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced

map('state', add = T) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

# dev.off()

map('state', add = F) 
# draw a polygon around the convex hull of the clusters
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

# focus on the North East
# pdf(file = "northEast.pdf", height = 6, width= 6)
NorthEast = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania", "Mary", "Del")
map('county', NorthEast, add = F) 
good = complete.cases(loc)
comploc = loc[good,]
points(rnorm(comploc[,1],comploc[,1],.03),rnorm(comploc[,2],comploc[,2],.03), col=as.factor(csc$clust[good]), pch = ".")
for(i in 1:k){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

# dev.off()

What is going on with the crazy large clusters? Let’s find them.

# given a set of 2d points, compute the sum of squares from the center
sd2 = function(x){
  x = x[complete.cases(x),]
  return( sqrt(sum(apply(scale(x, scale = F), 2, var))))
  }
# given a set of 2d points, compute the maximal distance from the center
max2 = function(x){
  x = x[complete.cases(x),]
  return( max(apply(scale(x, scale = F), 1, function(z) return(sqrt(sum(z^2))))))
  }
sizeSD = rep(NA, k)
sizeMAX = sizeSD
num = sizeSD
for(i in 1:k){
  inclust = which(csc$cluster[good] ==i)
  sizeSD[i] = sd2(comploc[inclust,])
  sizeMAX[i] = max2(comploc[inclust,])
  num[i]=length(inclust)
  }
plot(log(sizeSD,10), log(sizeMAX,10))

plot(num, log(sizeMAX))

Define the big clusters to be the ones with log(sizeSD,10) > .5.

bigClust = which(log(sizeMAX,10)>.5)

# draw a polygon around the big clusters
map('world', add = F) 
for(i in bigClust){
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "red", thick = 2)
  }

map('state', add = F) 
for(i in bigClust){
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

The really large cluster on the map is also is the cluster with the largest number of nodes.

map('state', add = F) 
i = which.max(num)
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
points(rnorm(comploc[inclust,1],comploc[inclust,1],.03),rnorm(comploc[inclust,2],comploc[inclust,2],.03))

This cluster contains 165 zip codes; roughly 1% of the zip codes. Note: not all zip codes that fall within the plotted region belong to the cluster!

Perhaps we simply need to increase the number of clusters!

k = 350
csc = CASC(A,Aknn, k)
loc = zipcode[rownames(A), c("longitude", "latitude"), with  =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
     xlim = c(-125, -65), ylim = c(24, 50), pch  =19)

## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced

## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced

map('state', add = T) 
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
  i = i+1
  inclust = which(csc$cluster[good] ==i)
  Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
  }

Still big clusters! One possibility is to “tune” the CASC weighting of the knn graph and the referral network. The next lecture tries a vastly different algorithm.