Covariate assisted spectral clustering (CASC) is a way of including additional node level data to bias the clusters towards something that we suspect matters. In this case, we want clusters that are “continuous in space”.
Below, I have not included the literal version of CASC. Instead, the approach is as follows.
load(url("http://pages.stat.wisc.edu/~karlrohe/netsci/data/zipA.RData"))
library(FNN)
dat = as.matrix(zipcode[rownames(A),c("longitude", "latitude"), with = F])
good = complete.cases(dat)
Knng = get.knn(dat[good,])
dict = rownames(A)[good]
cbind(dict[1], dict[Knng$nn.index[1,]])
## [,1] [,2]
## [1,] "21201" "21202"
## [2,] "21201" "21217"
## [3,] "21201" "21230"
## [4,] "21201" "21223"
## [5,] "21201" "21287"
## [6,] "21201" "21231"
## [7,] "21201" "21218"
## [8,] "21201" "21211"
## [9,] "21201" "21213"
## [10,] "21201" "21205"
knnEL = matrix("", nrow = prod(dim(Knng$nn.index)), ncol = 2)
for(i in 1:nrow(Knng$nn.index)) knnEL[1:10 + 10*(i-1),] = cbind(dict[i], dict[Knng$nn.index[i,]])
gknn=graph.edgelist(knnEL,directed = F) # this creates a graph.
Aknn = get.adjacency(gknn)
cross = match(rownames(A)[good], rownames(Aknn))
mean(rownames(Aknn)[cross] == rownames(A)[good])
## [1] 1
Aknn = Aknn[cross,cross]
# this loads a function called "sumGraphs" that takes to adjacency matrices, converts them to symmetric Laplacians, and adds these matrices together.
source(url("http://pages.stat.wisc.edu/~karlrohe/netsci/code/globalTransitivityFunctions.R"))
CASC = function(A, B, k){
X = eigs(sumGraphs(A,B),k)$vec
X = t(apply(X, 1, function(x) return(x/sqrt(sum(x^2)))))
return(kmeans(X, k, nstart = 20))
}
csc = CASC(A,Aknn, 10)
## Warning: closing unused connection 6
## (http://pages.stat.wisc.edu/~karlrohe/netsci/code/globalTransitivityFunctions.R)
sort(table(csc$clust))
##
## 7 4 9 10 2 1 3 5 8 6
## 499 775 898 925 989 1004 1432 1516 1805 3105
loc = zipcode[rownames(A), c("longitude", "latitude"), with =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
map('state', add = F)
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
points(rnorm(comploc[,1],comploc[,1],.1),rnorm(comploc[,2],comploc[,2],.1), col=as.factor(csc$clust[good]), pch =19)
for(i in 1:10){
i = i+1
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
# dev.off()
k = 250
csc = CASC(A,Aknn, k)
as.vector(sort(table(csc$clust)))
## [1] 7 10 11 12 13 13 14 15 19 19 19 19 20 20 21 21 21
## [18] 21 21 21 22 22 23 23 23 24 24 24 25 25 26 26 26 26
## [35] 26 27 27 28 28 28 29 30 30 30 30 30 31 31 32 33 33
## [52] 33 33 33 34 34 34 34 34 34 35 35 35 35 35 35 35 36
## [69] 36 37 37 37 37 37 37 37 38 38 38 38 38 40 40 40 40
## [86] 40 41 41 41 42 42 42 42 43 43 43 43 43 43 43 43 44
## [103] 44 44 44 45 45 46 46 46 46 47 47 48 48 48 48 49 49
## [120] 49 50 50 50 50 50 50 50 50 51 51 51 51 51 51 52 52
## [137] 52 52 53 53 53 54 54 54 54 54 55 55 55 55 56 56 56
## [154] 56 56 56 57 57 57 57 57 57 58 58 58 59 59 59 59 59
## [171] 59 60 60 60 60 62 62 63 63 63 63 63 63 63 64 64 64
## [188] 64 64 65 65 65 66 66 67 67 68 68 68 68 69 69 70 70
## [205] 71 71 72 72 72 73 73 74 74 75 75 75 76 77 77 78 79
## [222] 80 80 81 84 85 85 86 89 89 91 91 92 92 94 95 96 97
## [239] 99 99 99 100 100 101 101 108 109 110 113 167
plot(as.vector(sort(table(csc$clust))))
loc = zipcode[rownames(A), c("longitude", "latitude"), with =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
xlim = c(-125, -65), ylim = c(24, 50), pch =19)
## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced
## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced
map('state', add = T)
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
i = i+1
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
# dev.off()
map('state', add = F)
# draw a polygon around the convex hull of the clusters
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
# focus on the North East
# pdf(file = "northEast.pdf", height = 6, width= 6)
NorthEast = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania", "Mary", "Del")
map('county', NorthEast, add = F)
good = complete.cases(loc)
comploc = loc[good,]
points(rnorm(comploc[,1],comploc[,1],.03),rnorm(comploc[,2],comploc[,2],.03), col=as.factor(csc$clust[good]), pch = ".")
for(i in 1:k){
i = i+1
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
# dev.off()
What is going on with the crazy large clusters? Let’s find them.
# given a set of 2d points, compute the sum of squares from the center
sd2 = function(x){
x = x[complete.cases(x),]
return( sqrt(sum(apply(scale(x, scale = F), 2, var))))
}
# given a set of 2d points, compute the maximal distance from the center
max2 = function(x){
x = x[complete.cases(x),]
return( max(apply(scale(x, scale = F), 1, function(z) return(sqrt(sum(z^2))))))
}
sizeSD = rep(NA, k)
sizeMAX = sizeSD
num = sizeSD
for(i in 1:k){
inclust = which(csc$cluster[good] ==i)
sizeSD[i] = sd2(comploc[inclust,])
sizeMAX[i] = max2(comploc[inclust,])
num[i]=length(inclust)
}
plot(log(sizeSD,10), log(sizeMAX,10))
plot(num, log(sizeMAX))
Define the big clusters to be the ones with log(sizeSD,10) > .5.
bigClust = which(log(sizeMAX,10)>.5)
# draw a polygon around the big clusters
map('world', add = F)
for(i in bigClust){
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "red", thick = 2)
}
map('state', add = F)
for(i in bigClust){
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
The really large cluster on the map is also is the cluster with the largest number of nodes.
map('state', add = F)
i = which.max(num)
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
points(rnorm(comploc[inclust,1],comploc[inclust,1],.03),rnorm(comploc[inclust,2],comploc[inclust,2],.03))
This cluster contains 165 zip codes; roughly 1% of the zip codes. Note: not all zip codes that fall within the plotted region belong to the cluster!
Perhaps we simply need to increase the number of clusters!
k = 350
csc = CASC(A,Aknn, k)
loc = zipcode[rownames(A), c("longitude", "latitude"), with =F]
loc = as.matrix(loc)
# pdf(file = "newRegions2.pdf", height = 13, width = 20)
plot(rnorm(loc[,1],loc[,1],.1),rnorm(loc[,2],loc[,2],.1), col=as.factor(csc$clust),
xlim = c(-125, -65), ylim = c(24, 50), pch =19)
## Warning in rnorm(loc[, 1], loc[, 1], 0.1): NAs produced
## Warning in rnorm(loc[, 2], loc[, 2], 0.1): NAs produced
map('state', add = T)
# draw a polygon around the convex hull of the first 10 clusters.
good = complete.cases(loc)
comploc = loc[good,]
for(i in 1:k){
i = i+1
inclust = which(csc$cluster[good] ==i)
Plot_ConvexHull(xcoord = comploc[inclust,1], ycoord = comploc[inclust,2],lcolor = "black", thick = 2)
}
Still big clusters! One possibility is to “tune” the CASC weighting of the knn graph and the referral network. The next lecture tries a vastly different algorithm.