To illustrate the data, focus on Madison.
source("http://pages.stat.wisc.edu/~karlrohe/netsci/code/loadData.R")
##
Read 4.5% of 14080052 rows
Read 10.1% of 14080052 rows
Read 16.5% of 14080052 rows
Read 23.1% of 14080052 rows
Read 36.4% of 14080052 rows
Read 51.8% of 14080052 rows
Read 62.6% of 14080052 rows
Read 77.5% of 14080052 rows
Read 92.3% of 14080052 rows
Read 14080052 rows and 5 (of 5) columns from 0.620 GB file in 00:00:13
##
Read 0.0% of 2117987 rows
Read 9.0% of 2117987 rows
Read 11.3% of 2117987 rows
Read 13.7% of 2117987 rows
Read 16.5% of 2117987 rows
Read 19.4% of 2117987 rows
Read 23.1% of 2117987 rows
Read 27.9% of 2117987 rows
Read 33.5% of 2117987 rows
Read 38.2% of 2117987 rows
Read 45.3% of 2117987 rows
Read 54.3% of 2117987 rows
Read 66.1% of 2117987 rows
Read 77.4% of 2117987 rows
Read 81.7% of 2117987 rows
Read 93.0% of 2117987 rows
Read 2117987 rows and 43 (of 43) columns from 0.606 GB file in 00:00:34
## Loading required package: sp
wi = DT[State == "WI"]
tmp = Et[unique(wi$NPI)]
Ewi = tmp[complete.cases(tmp)] #lots of NA's. Have not inspected why.
el=as.matrix(Ewi)[,1:2] #igraph needs the edgelist to be in matrix format
g=graph.edgelist(el,directed = F) # this creates a graph.
ids = V(g)$name
cities = wi[ids, mult = "first"]$City
g = set.vertex.attribute(g, name = "city", index=V(g),value = cities)
wig = g
madgraph = induced.subgraph(graph = g,vids = which(V(g)$city == "MADISON"))
core = graph.coreness(madgraph)
madgraph = induced.subgraph(graph = madgraph,vids = core>1)
plot(madgraph, vertex.label = NA)
Let’s color the nodes in this figure by some other interesting attributes in DT.
colnames(DT)
## [1] "NPI"
## [2] "PAC ID"
## [3] "Professional Enrollment ID"
## [4] "Last Name"
## [5] "First Name"
## [6] "Middle Name"
## [7] "Suffix"
## [8] "Gender"
## [9] "Credential"
## [10] "Medical school name"
## [11] "Graduation year"
## [12] "Primary specialty"
## [13] "Secondary specialty 1"
## [14] "Secondary specialty 2"
## [15] "Secondary specialty 3"
## [16] "Secondary specialty 4"
## [17] "All secondary specialties"
## [18] "Organization legal name"
## [19] "Organization DBA name"
## [20] "Group Practice PAC ID"
## [21] "Number of Group Practice members"
## [22] "Line 1 Street Address"
## [23] "Line 2 Street Address"
## [24] "Marker of address line 2 suppression"
## [25] "City"
## [26] "State"
## [27] "Zip Code"
## [28] "Claims based hospital affiliation CCN 1"
## [29] "Claims based hospital affiliation LBN 1"
## [30] "Claims based hospital affiliation CCN 2"
## [31] "Claims based hospital affiliation LBN 2"
## [32] "Claims based hospital affiliation CCN 3"
## [33] "Claims based hospital affiliation LBN 3"
## [34] "Claims based hospital affiliation CCN 4"
## [35] "Claims based hospital affiliation LBN 4"
## [36] "Claims based hospital affiliation CCN 5"
## [37] "Claims based hospital affiliation LBN 5"
## [38] "Professional accepts Medicare Assignment"
## [39] "Participating in eRx"
## [40] "Participating in PQRS"
## [41] "Participating in EHR"
## [42] "Received PQRS Maintenance of Certification Program Incentive"
## [43] "Participated in Million Hearts"
features = colnames(DT)[c(8:12, 18,19, 21, 28)]
features
## [1] "Gender"
## [2] "Credential"
## [3] "Medical school name"
## [4] "Graduation year"
## [5] "Primary specialty"
## [6] "Organization legal name"
## [7] "Organization DBA name"
## [8] "Number of Group Practice members"
## [9] "Claims based hospital affiliation CCN 1"
ids = V(madgraph)$name
tmp = wi[ids, mult = "last"]
atbs = tmp[,features, with = F] # Thank you google for helping to find "with"
mean(complete.cases(atbs))
## [1] 1
atbs = as.matrix(atbs)
for(i in 1:ncol(atbs)){
madgraph = set.vertex.attribute(madgraph, name = colnames(atbs)[i], index=V(madgraph),value = atbs[,i])
}
summary(madgraph)
## IGRAPH UN-- 474 3706 --
## attr: name (v/c), city (v/c), Gender (v/c), Credential (v/c),
## Medical school name (v/c), Graduation year (v/c), Primary
## specialty (v/c), Organization legal name (v/c), Organization DBA
## name (v/c), Number of Group Practice members (v/c), Claims based
## hospital affiliation CCN 1 (v/c)
Now, let’s plot it with several different colorings. The thing that takes the longest in plotting is computing the node locations (extensive field of algorithmic study!).
locs = layout.fruchterman.reingold(madgraph)
madStuff = cbind(locs, atbs)
save(madStuff, file = "~/dataFiles/physicianReferral/madStuff.RData")