Zip codes!

Zip codes are another data playground. So many data sets are indexed (or include) zip code. For example, the American Community Survey is easily accessible in R with the library acs. Locations of zip codes are available in the R library zipcode.

set.seed(1)
source("http://pages.stat.wisc.edu/~karlrohe/netsci/code/loadData.R")

## 
Read 0.0% of 14080052 rows
Read 8.1% of 14080052 rows
Read 16.1% of 14080052 rows
Read 22.1% of 14080052 rows
Read 30.8% of 14080052 rows
Read 38.9% of 14080052 rows
Read 46.2% of 14080052 rows
Read 54.7% of 14080052 rows
Read 63.1% of 14080052 rows
Read 70.0% of 14080052 rows
Read 77.3% of 14080052 rows
Read 84.8% of 14080052 rows
Read 92.6% of 14080052 rows
Read 99.4% of 14080052 rows
Read 14080052 rows and 5 (of 5) columns from 0.620 GB file in 00:00:17
## 
Read 0.0% of 2117987 rows
Read 5.2% of 2117987 rows
Read 9.0% of 2117987 rows
Read 15.1% of 2117987 rows
Read 17.0% of 2117987 rows
Read 21.7% of 2117987 rows
Read 28.3% of 2117987 rows
Read 30.7% of 2117987 rows
Read 36.8% of 2117987 rows
Read 43.0% of 2117987 rows
Read 48.6% of 2117987 rows
Read 53.4% of 2117987 rows
Read 59.5% of 2117987 rows
Read 65.6% of 2117987 rows
Read 71.8% of 2117987 rows
Read 77.9% of 2117987 rows
Read 84.0% of 2117987 rows
Read 90.7% of 2117987 rows
Read 97.3% of 2117987 rows
Read 2117987 rows and 43 (of 43) columns from 0.606 GB file in 00:00:38

wi = DT[State == "WI"]

library(data.table)  # so fast!
library(igraph)  # all the basic graph operations.
library(zipcode)
zip = wi$"Zip Code"
zip = substr(zip, start = 1, stop = 5)

data(zipcode)  # this contains the locations of zip codes
zipcode = as.data.table(zipcode); setkey(zipcode, zip)  # thanks data.table for making things so fast!  
loc =  zipcode[zip, c("latitude", "longitude"), with = F]
loc = loc[complete.cases(loc)]
loc = as.matrix(loc)
plot(loc)

plot(loc[,2], loc[,1])

We can make it prettier by finding an R library to help.

library(maps); library(ggplot2)
plot(loc[,2], loc[,1])
map('state', region = c('wisconsin'), add = T)  # adds an outline

# or, you might like this version:
# install.packages("ggmap")
library(ggmap)
map <- get_map(location = 'Madison, Wisconsin', zoom=10)

## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Madison,+Wisconsin&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Madison,%20Wisconsin&sensor=false

ggmap(map)  +   geom_point(aes(x = loc[,2], y = loc[,1], alpha = .5))

## Warning: Removed 44445 rows containing missing values (geom_point).

Let’s compute the geographic distance of some of the edges. There are at least three different ways to do this. The third way is with a “pipe” operator %>%. While I will not use %>% here, this is the basic idea: for each edge, get the NPI %>% “Zip Code” (using DT) %>% longitude, latitude (in dataset zipcode) %>% dist in distGeo. The pipe operator is nice because the code becomes much more readable. Below, you will see that everything must be “backwards”. First, take a simple random sample of rows of DT. We will look at the edges coming out of these nodes.

samp =   DT$NPI[sample(dim(DT)[1], 10000)]  # take a random sample of NPI's. 
DTsamp = DT[samp,mult ="first"]
dim(DTsamp)

## [1] 10000    43

DTsamp = DTsamp[complete.cases(DTsamp$"Zip Code")]
dim(DTsamp)

## [1] 10000    43

setkey(DTsamp, NPI)
tmp = Et[DTsamp$NPI]
Esamp = tmp[complete.cases(tmp)]  #lots of NA's.  Have not inspected why.
Esamp=as.matrix(Esamp)[,1:2] #igraph needs the edgelist to be in matrix format

Ok, first, the slow way using apply.

library(geosphere)

## Loading required package: sp

edgeDistance  = function(x){
  from = x[1]; to = x[2]
  p1 = zipcode[substr(DTsamp[from]$"Zip Code", start = 1, stop =5), c("longitude", "latitude"), with = F] 
  p2 = zipcode[substr(DT[to, mult = "first"]$"Zip Code", start = 1, stop =5), c("longitude","latitude"), with = F] 
  if(mean(complete.cases(p2))==1 ) return(distGeo(p1,p2)/1000)
  return(NA)
}
# becuase this way is so slow, let's just do 100 edges.
edtmp = apply(Esamp[1:100,], 1, edgeDistance)

# so ugly and so fast!
ed = distGeo(
  zipcode[
    substr(
      DT[Esamp[,1], mult = "first"]$"Zip Code" ,start = 1, stop = 5
    )
    , c("longitude", "latitude"), with = F] 
  , zipcode[substr(DT[Esamp[,2], mult = "first"]$"Zip Code" ,start = 1, stop = 5), c("longitude", "latitude"), with = F] 
)/1000
mean(ed ==0, na.rm = T)

## [1] 0.3721837

hist(log(ed+1,10))

How do the distribution of referral distances vary between providers? Let’s study it using some characteristics of the provider’s zip code. Do providers in rural areas make referrals over longer distances? You could use acs data for this. For right now, I found some data on population density and unemployment rate with a google search. If this was for a publication, I would definitely want to find this data at a more citable place (e.g. ACS and BLS).

edgeZip = substr(DT[Esamp[,1], mult = "first"]$"Zip Code",1,5)
names(which.max(table(Esamp[,1])))

## [1] "1548482151"

popd = fread("~/dataFiles/physicianReferral/popDensity.csv",sep = ",")
unemp = fread("~/dataFiles/physicianReferral/Unemployment.csv",sep = ",")
setkey(popd, Zip/ZCTA)
setkey(unemp, Zip)
str(unemp)  # careful!  zeros removed.

## Classes 'data.table' and 'data.frame':   33120 obs. of  3 variables:
##  $ Zip        : int  601 602 603 606 610 612 616 617 622 623 ...
##  $ Unemp. Rate: chr  "21%" "23%" "20%" "8%" ...
##  $ # in sample: int  14252 33123 43365 4960 22983 55763 8516 18860 4776 35497 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "Zip"

#unemployment has a %. remove it.
unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==2] = substr(unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==2],1,1)
unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==3] = substr(unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==3],1,2)
unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==4] = substr(unemp$`Unemp. Rate`[nchar(unemp$`Unemp. Rate`)==4],1,3)
unemp$`Unemp. Rate` = as.numeric(unemp$`Unemp. Rate`)
edgeUnemp = unemp$`Unemp. Rate`[as.numeric(edgeZip)]

edgeUnemp[edgeUnemp ==100] = NA
plot( edgeUnemp, log(ed+1,10))
abline( lm(log(ed+1,10)~edgeUnemp))

summary( lm(log(ed+1,10)~edgeUnemp))

## 
## Call:
## lm(formula = log(ed + 1, 10) ~ edgeUnemp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93967 -0.83660  0.03581  0.58832  3.07635 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.8261154  0.0060864 135.731  < 2e-16 ***
## edgeUnemp   0.0017471  0.0005995   2.914  0.00357 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8015 on 41103 degrees of freedom
##   (88015 observations deleted due to missingness)
## Multiple R-squared:  0.0002066,  Adjusted R-squared:  0.0001823 
## F-statistic: 8.493 on 1 and 41103 DF,  p-value: 0.003567

edgeDen = popd[as.numeric(edgeZip)]$`Density Per Sq Mile`
plot( log(edgeDen+1,10), log(ed+1,10))
abline( lm(log(ed+1,10)~log(edgeDen+1,10)))

summary( lm(log(ed+1,10)~log(edgeDen+1,10)))

## 
## Call:
## lm(formula = log(ed + 1, 10) ~ log(edgeDen + 1, 10))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93858 -0.83295  0.03499  0.58947  3.15597 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.938575   0.008278  113.38   <2e-16 ***
## log(edgeDen + 1, 10) -0.053103   0.003890  -13.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7991 on 42223 degrees of freedom
##   (86895 observations deleted due to missingness)
## Multiple R-squared:  0.004395,   Adjusted R-squared:  0.004372 
## F-statistic: 186.4 on 1 and 42223 DF,  p-value: < 2.2e-16

summary( lm(log(ed+1,10)~log(edgeDen+1,10) + log(edgeUnemp+1,10)))

## 
## Call:
## lm(formula = log(ed + 1, 10) ~ log(edgeDen + 1, 10) + log(edgeUnemp + 
##     1, 10))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9562 -0.8306  0.0420  0.5901  3.1673 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.952309   0.011765  80.947   <2e-16 ***
## log(edgeDen + 1, 10)   -0.061479   0.003979 -15.451   <2e-16 ***
## log(edgeUnemp + 1, 10)  0.003622   0.011100   0.326    0.744    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7993 on 41102 degrees of freedom
##   (88015 observations deleted due to missingness)
## Multiple R-squared:  0.005825,   Adjusted R-squared:  0.005777 
## F-statistic: 120.4 on 2 and 41102 DF,  p-value: < 2.2e-16

There are so many problems with this analysis! Let’s count the ways.