The Madison referral network

To illustrate the data, focus on Madison.

source("http://pages.stat.wisc.edu/~karlrohe/netsci/code/loadData.R")
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
## 
Read 0.0% of 14080052 rows
Read 6.5% of 14080052 rows
Read 14.7% of 14080052 rows
Read 17.4% of 14080052 rows
Read 24.3% of 14080052 rows
Read 33.2% of 14080052 rows
Read 42.3% of 14080052 rows
Read 51.3% of 14080052 rows
Read 56.5% of 14080052 rows
Read 65.3% of 14080052 rows
Read 73.8% of 14080052 rows
Read 82.4% of 14080052 rows
Read 91.0% of 14080052 rows
Read 99.5% of 14080052 rows
Read 14080052 rows and 5 (of 5) columns from 0.620 GB file in 00:00:19
## 
Read 0.0% of 2117987 rows
Read 5.7% of 2117987 rows
Read 11.8% of 2117987 rows
Read 17.5% of 2117987 rows
Read 20.8% of 2117987 rows
Read 26.9% of 2117987 rows
Read 34.0% of 2117987 rows
Read 35.9% of 2117987 rows
Read 42.0% of 2117987 rows
Read 48.6% of 2117987 rows
Read 54.8% of 2117987 rows
Read 60.9% of 2117987 rows
Read 62.3% of 2117987 rows
Read 68.5% of 2117987 rows
Read 74.6% of 2117987 rows
Read 80.7% of 2117987 rows
Read 85.9% of 2117987 rows
Read 91.6% of 2117987 rows
Read 97.3% of 2117987 rows
Read 2117987 rows and 43 (of 43) columns from 0.606 GB file in 00:00:37
mad = DT[City == "MADISON"]
tmp = Et[unique(mad$NPI)]  # so cool! and fast!
Emad = tmp[complete.cases(tmp)]  #lots of NA's.  Have not inspected why.
el=as.matrix(Emad)[,1:2] #igraph needs the edgelist to be in matrix format
g=graph.edgelist(el,directed = F) # this creates a graph.
# original graph is directed.  For now, ignore direction.
g= simplify(g)  # removes any self loops and multiple edges
core = graph.coreness(g)  # talk about core.
hist(core)

sum(core>10)
## [1] 588
g1 = induced.subgraph(graph = g,vids = V(g)[core>10])  # talk about induced subgraphs.
plot(g1)

What is going on with 1700865094? This is a playground! Do a google search!

Could fix this problem by thresholding on in/out degree…

## OLD WAY
#g=graph.edgelist(el,directed = T) 
#g = induced.subgraph(graph = g,vids = V(g)[degree(g, mode = "in") > 1])  #OMG so annoying. 
#g = as.undirected(g)
#g= simplify(g)

## ANOTHER WAY
# g = induced.subgraph(graph = g, vids = unique(mad$NPI))  # this should work! but it doesn't!

unique(mad$NPI)[1]
## [1] "1003019670"
V(g)[1]
## + 1/5530 vertex, named:
## [1] 1003019670
madgraph = induced.subgraph(graph = g, vids = which(V(g)$name %in% unique(mad$NPI)))
core = graph.coreness(madgraph)
sum(core > 1)
## [1] 853
madgraph = induced.subgraph(graph = madgraph,vids = (core > 1))
plot(madgraph)

Ugh. Those labels are horrible. How do you fix that? Google it!

r plot(madgraph, vertex.label= NA) # Thank you google!

r clust = clusters(madgraph) # this finds connected components. names(clust)

## [1] "membership" "csize" "no"

r clust$csize

## [1] 640 3 34 3 3 77 9 3 18 5 4 6 4 6 14 9 3 ## [18] 3 5 4

r tmp = induced.subgraph(graph = madgraph,vids = (clust$mem ==9)) plot(tmp)

r sort(degree(tmp))

## 1972786887 1427251826 1225395288 1205820057 1669760286 1205147782 ## 7 8 8 9 9 10 ## 1770801698 1588968960 1033180104 1730360629 1972648954 1174727564 ## 10 11 12 12 12 13 ## 1609154681 1346443439 1891907515 1518279843 1801045174 1922394907 ## 13 14 14 14 16 16

r summary(tmp)

## IGRAPH UN-- 18 104 -- ## + attr: name (v/c)

r DT[V(tmp)$name]$State # Oh shit!

## [1] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL" ## [15] "AL" "AL" "AL" "AL" "AL" "AL" "AL" "AL"

r DT[V(tmp)$name]$City # whhhhhaaaat?

## [1] "MADISON" "HUNTSVILLE" "MADISON" "MADISON" "MADISON" ## [6] "MADISON" "MADISON" "MADISON" "MADISON" "MADISON" ## [11] "MADISON" "JASPER" "DECATUR" "DECATUR" "MADISON" ## [16] "MADISON" "MADISON" "MADISON" "MADISON" "MADISON" ## [21] "MADISON" "MADISON"

r length(V(tmp)$name)

## [1] 18

r length(DT[V(tmp)$name]$City) # oh.

## [1] 22

Perhaps there is a better way to do all of this. igraph allows you to assign vertex (and edge) attributes.

wi = DT[State == "WI"]
tmp = Et[unique(wi$NPI)]  
Ewi = tmp[complete.cases(tmp)]  #lots of NA's.  Have not inspected why.
el=as.matrix(Ewi)[,1:2] #igraph needs the edgelist to be in matrix format
g=graph.edgelist(el,directed = F) # this creates a graph.
ids = V(g)$name
cities = wi[ids, mult = "first"]$City
-sort(-table(cities))[1:10]
## cities
##  MILWAUKEE    MADISON EAU CLAIRE  LA CROSSE MARSHFIELD  GREEN BAY 
##       2394       1314        596        589        588        507 
##   WAUKESHA   COLUMBUS BROOKFIELD   APPLETON 
##        309        279        269        260
mean(is.na(cities))
## [1] 0.3713623
g = set.vertex.attribute(g, name = "city", index=V(g),value =  cities)
wig = g
madgraph = induced.subgraph(graph = g,vids = which(V(g)$city == "MADISON"))
summary(madgraph)
## IGRAPH UN-- 1314 3848 -- 
## + attr: name (v/c), city (v/c)
core = graph.coreness(madgraph)
sum(core>1)
## [1] 474
madgraph = induced.subgraph(graph = madgraph,vids = core>1)
plot(madgraph, vertex.label = NA)

Let’s color the nodes in this figure by some other interesting attributes in DT.

colnames(DT)
##  [1] "NPI"                                                         
##  [2] "PAC ID"                                                      
##  [3] "Professional Enrollment ID"                                  
##  [4] "Last Name"                                                   
##  [5] "First Name"                                                  
##  [6] "Middle Name"                                                 
##  [7] "Suffix"                                                      
##  [8] "Gender"                                                      
##  [9] "Credential"                                                  
## [10] "Medical school name"                                         
## [11] "Graduation year"                                             
## [12] "Primary specialty"                                           
## [13] "Secondary specialty 1"                                       
## [14] "Secondary specialty 2"                                       
## [15] "Secondary specialty 3"                                       
## [16] "Secondary specialty 4"                                       
## [17] "All secondary specialties"                                   
## [18] "Organization legal name"                                     
## [19] "Organization DBA name"                                       
## [20] "Group Practice PAC ID"                                       
## [21] "Number of Group Practice members"                            
## [22] "Line 1 Street Address"                                       
## [23] "Line 2 Street Address"                                       
## [24] "Marker of address line 2 suppression"                        
## [25] "City"                                                        
## [26] "State"                                                       
## [27] "Zip Code"                                                    
## [28] "Claims based hospital affiliation CCN 1"                     
## [29] "Claims based hospital affiliation LBN 1"                     
## [30] "Claims based hospital affiliation CCN 2"                     
## [31] "Claims based hospital affiliation LBN 2"                     
## [32] "Claims based hospital affiliation CCN 3"                     
## [33] "Claims based hospital affiliation LBN 3"                     
## [34] "Claims based hospital affiliation CCN 4"                     
## [35] "Claims based hospital affiliation LBN 4"                     
## [36] "Claims based hospital affiliation CCN 5"                     
## [37] "Claims based hospital affiliation LBN 5"                     
## [38] "Professional accepts Medicare Assignment"                    
## [39] "Participating in eRx"                                        
## [40] "Participating in PQRS"                                       
## [41] "Participating in EHR"                                        
## [42] "Received PQRS Maintenance of Certification Program Incentive"
## [43] "Participated in Million Hearts"
features = colnames(DT)[c(8:12, 18,19, 21, 28)]
features
## [1] "Gender"                                 
## [2] "Credential"                             
## [3] "Medical school name"                    
## [4] "Graduation year"                        
## [5] "Primary specialty"                      
## [6] "Organization legal name"                
## [7] "Organization DBA name"                  
## [8] "Number of Group Practice members"       
## [9] "Claims based hospital affiliation CCN 1"
ids = V(madgraph)$name
tmp = wi[ids, mult = "last"]
atbs = tmp[,features, with = F]  # Thank you google for helping to find "with"
mean(complete.cases(atbs))
## [1] 1
atbs = as.matrix(atbs)
for(i in 1:ncol(atbs)){
madgraph = set.vertex.attribute(madgraph, name = colnames(atbs)[i], index=V(madgraph),value =  atbs[,i])
}
summary(madgraph)
## IGRAPH UN-- 474 3706 -- 
## + attr: name (v/c), city (v/c), Gender (v/c), Credential (v/c),
## | Medical school name (v/c), Graduation year (v/c), Primary
## | specialty (v/c), Organization legal name (v/c), Organization DBA
## | name (v/c), Number of Group Practice members (v/c), Claims based
## | hospital affiliation CCN 1 (v/c)

Now, let’s plot it with several different colorings. The thing that takes the longest in plotting is computing the node locations (extensive field of algorithmic study!).

locs = layout.fruchterman.reingold(madgraph)
summary(madgraph)
## IGRAPH UN-- 474 3706 -- 
## + attr: name (v/c), city (v/c), Gender (v/c), Credential (v/c),
## | Medical school name (v/c), Graduation year (v/c), Primary
## | specialty (v/c), Organization legal name (v/c), Organization DBA
## | name (v/c), Number of Group Practice members (v/c), Claims based
## | hospital affiliation CCN 1 (v/c)
plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Gender"), layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Credential"), layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Medical school name"), layout = locs)

year = as.numeric(V(madgraph)$"Graduation year")
plot(madgraph, vertex.label = NA, vertex.color = grey((year - min(year))/(max(year) - min(year))) , layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Primary specialty"), layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Organization legal name"), layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Organization DBA name"), layout = locs)

plot(madgraph, vertex.label = NA, vertex.color = as.factor(V(madgraph)$"Claims based hospital affiliation CCN 1"), layout = locs)

What are they?

table(atbs[,7])
## 
##                                                
##                                             86 
##                ACCESS COMMUNITY HEALTH CENTERS 
##                                              1 
##                       ADVANCED PAIN MANAGEMENT 
##                                              3 
##                                    DEAN CLINIC 
##                                             50 
##                      DEAN REGIONAL EYE NETWORK 
##                                              3 
##                    FOND DU LAC REGIONAL CLINIC 
##                                              1 
##           HESS MEMORIAL HOSPITAL ER PHYSICIANS 
##                                              1 
##                            SURGICAL ASSOCIATES 
##                                              2 
## TURVILLE BAY MRI AND RADIATION ONCOLOGY CENTER 
##                                              3 
##     UNIVERSITY OF WISCONSIN MEDICAL FOUNDATION 
##                                             14 
##                          UW MEDICAL FOUNDATION 
##                                            310

Next, let’s plot the data using zip codes. Zip codes are another data playground!