srand(1234321) # set the random seed julia> X = [ones(100) rand(100,2)]; X' 3x100 Float64 Array: 1.0 1.0 1.0 1.0 1.0 … 1.0 1.0 1.0 1.0 0.0944218 0.936611 0.258327 0.930924 0.555283 0.799287 0.261148 0.255576 0.817669 0.942218 0.042852 0.658443 0.933942 0.493509 0.00561552 0.502888 0.0869479 0.0272969 julia> beta = rand(Normal(),3); beta' 1x3 Float64 Array: -0.197287 -0.86977 -1.55077 julia> y = X*beta + rand(Normal(0.,0.4), size(X,1)); y' 1x100 Float64 Array: -1.89246 -1.06572 -1.92631 -2.00036 -1.19763 … -1.27156 -0.976077 -0.913521 -0.167361 julia> betahat = X\y; betahat' 1x3 Float64 Array: -0.225888 -0.913171 -1.4425 ``` ## Use of factorizations ```{.julia} julia> using NumericExtensions julia> ch = cholpfact!(X'X) CholeskyPivoted{Float64}(3x3 Float64 Array: 10.0 5.26508 4.88059 0.0 3.03731 0.289294 0.0 0.0 2.91872 ,'U',[1,2,3],3,-1.0,0) julia> beta = ch\X'y; beta' 1x3 Float64 Array: -0.225888 -0.913171 -1.4425 julia> df = length(y) - rank(ch) julia> fitted = X*beta; ssqr = sqdiffsum(y, fitted)/df 0.15681475390926472 julia> vcov = ssqr * inv(ch) 3x3 Float64 Array: 0.00981028 -0.00818202 -0.00806099 -0.00818202 0.0171654 -0.00175329 -0.00806099 -0.00175329 0.0184078 julia> stderr = sqrt(diag(vcov)); println(stderr) [0.09904684680425663,0.131016871473335,0.1356755193594016] ``` # Julia packages ## Using packages created by others * Although only recently available, the _Julia_ package system is growing rapidly. Check the list of available packages on the documentation site. * Packages of interest to statisticians include - DataFrames --- facilities similar to _R_ data frames and formula language - Distributions --- described above - GLM --- fitting linear and generalized linear models - MixedModels --- mixed-effects models - currently LMMs but GLMMs are coming - NLopt --- high-quality optimizers for constrained and unconstrained problems - NumericExtensions - highly optimized matrix reductions etc. 
- Winston --- 2D plotting
- various MCMC packages

# An example - overdetermined ridge regression

## Overdetermined ridge regression

* It is not uncommon now to work with data sets that have fewer observations (rows) than variables (columns).
* For example, GWAS (genome-wide association studies) data may determine status at millions of SNP sites for tens of thousands of individuals.
* Paulino Perez and Gustavo de la Campos performed a centered, scaled ridge regression using _R_ on such data where at each SNP position a binary response was measured. Here $p > n$ and a $p$-dimensional coefficient vector is estimated by regularizing the system to be solved.
* On modern computers the system could be solved in memory when $p$ is in the thousands or tens of thousands but not for $p$ in the hundreds of thousands. A _back-fitting_ algorithm was used.

## _R_ code

```{.r}
## Back-fitting (coordinate-descent) ridge regression.
## Xstd: standardized n x p model matrix; yc: centered response of length n.
## h2 sets the ridge penalty lambda = p * (1 - h2) / h2.
## NOTE: the body must use the arguments Xstd and yc, not the globals X and y,
## otherwise the function computes with whatever happens to be in the caller's
## workspace.
backfit <- function(Xstd, yc, h2 = 0.5, nIter = 20L) {
    n <- nrow(Xstd); p <- ncol(Xstd)
    stopifnot(length(yc) == n)
    lambda <- p * (1 - h2)/h2
    SSx <- colSums(Xstd^2) # the diagonal elements of crossprod(Xstd)
    bHat <- rep(0,p)       # initial values bj=zero
    e <- yc                # initial model residuals
    for(i in 1:nIter){     # loop for iterations of the algorithm
        for(j in 1:p){     # loop over predictors
            yStar <- e+Xstd[,j] * bHat[j] # forming offsets
            bHat[j] <- sum(Xstd[,j] * yStar)/(SSx[j]+lambda) # eq. [4]
            e <- yStar-Xstd[,j] * bHat[j] # updates residuals
        }
    }
    bHat
}
```

## Direct _Julia_ translation

```{.julia}
function backfit(Xstd::Matrix, yc::Vector, h2=0.5, nIter=20)
    n,p = size(Xstd)
    lambda = p*(1-h2)/h2
    SSx = sum(Xstd .^ 2, 1) # all n-1 after standardizing
    bHat = zeros(p)
    e = copy(yc)
    for i in 1:nIter, j in 1:p
        yStar = e + Xstd[:,j]*bHat[j] # offsets
        bHat[j] = dot(Xstd[:,j],yStar)/(SSx[j] + lambda)
        e = yStar - Xstd[:,j]*bHat[j]
    end
    bHat
end
```

## _Julia_ version with in-place updates

```{.julia}
using Base.LinAlg.BLAS.axpy!
function backfit2(Xstd::Matrix, yc::Vector, h2=0.5, nIter=20) n,p = size(Xstd) lambda = p*(1 - h2)/h2 SSx = sum(Xstd .^ 2, 1) # all n-1 after standardizing bHat = zeros(p) e = copy(yc) for i in 1:nIter, j in 1:p axpy!(bHat[j], Xstd[:,j], e) bHat[j] = dot(Xstd[:,j], e)/(SSx[j] + lambda) axpy!(-bHat[j], Xstd[:,j], e) end bHat end ``` ## _Julia_ version using NumericExtensions ```{.julia} using NumericExtensions function backfit3(Xstd::Matrix, yc::Vector, h2=0.5, nIter=20) n,p = size(Xstd) length(yc) == n || error("Dimension mismatch") SSx = sum(Abs2(), Xstd, 1) # all n-1 after standardizing bHat = zeros(p); lambda = p*(1 - h2)/h2 e = copy(yc) for i in 1:nIter, j in 1:p Xj = unsafe_view(Xstd, :, j:j) fma!(e, Xj, bHat[j]) # fused multiply and add bHat[j] = mapreduce(Multiply(),Add(),Xj,e)/(SSx[j]+lambda) fma!(e, Xj, -bHat[j]) end bHat end ``` ## Timing results * On a moderate-sized data set (600 by 1280) `backfit2` is twice as fast as `backfit` in _Julia_ and about 3 times as fast as the _R_ version. ```{.r} > system.time(backfit(X,y)) user system elapsed 1.604 0.012 1.623 ``` ```{.julia} julia> @time backfit(Xstd, yc); elapsed time: 1.093174015 seconds julia> @time backfit2(Xstd, yc); elapsed time: 0.460209712 seconds ``` * Profiling the execution of _backfit2_ showed that most of the time was spent in indexing operations, leading to the `backfit3` version. ```{.julia} julia> @time backfit3(Xstd, yc); elapsed time: 0.103352683 seconds ``` ## Timing results (cont'd) * A direct calculation using the 1280 by 1280 system matrix with an inflated diagonal was nearly as fast as `backfit3`. 
```{.julia}
# Direct ridge solve: (Xstd'Xstd + lambda*I) \ (Xstd'yc), via a Cholesky
# factorization of the inflated system matrix.
function directfit(Xstd::Matrix, yc::Vector, h2=0.5)
    n,p = size(Xstd); lambda = p*(1-h2)/h2; XtX = Xstd'Xstd
    # diag(XtX) already holds the column sums of squares (all n-1 after
    # standardizing), so only the penalty lambda is added here; adding the
    # sums of squares again would double-count the diagonal.
    for i in 1:size(XtX,1); XtX[i,i] += lambda; end
    cholfact!(XtX)\(Xstd'yc)
end
```

```{.julia}
julia> @time directfit(Xstd, yc);
elapsed time: 0.132523019 seconds
```

## Timing results (cont'd)

* On a 1000 by 200000 example `backfit3` took 25 seconds, `backfit2` 125 seconds and `backfit` nearly 300 seconds in _Julia_.

```{.julia}
julia> size(Xstd)
(1000,200000)

julia> @time backfit(Xstd,yy);
elapsed time: 296.172161458 seconds

julia> @time backfit2(Xstd, yy);
elapsed time: 125.506957544 seconds

julia> @time backfit3(Xstd, yy);
elapsed time: 25.27092502 seconds
```

* The direct solution with a 200000 by 200000 system matrix is not on.
* If the matrix `X` is a binary matrix, it is more effective to store the matrix as a `BitArray` and evaluate the two possible values in each column separately, customizing the multiplication operation for this data type.

## Workflow in _Julia_ as described by Tim Holy

* The workflow "quickly write the simple version first" (which is really pleasant thanks to Julia's design and the nice library that everyone has been contributing to) -> "run it" -> "ugh, too slow" -> "profile it" -> "fix a few problems in places where it actually matters" -> "ah, that's much nicer!" is quite satisfying.
* If I had to write everything in C from the ground up it would be far more tedious. And doing a little hand-optimization gives us geeks something to make us feel like we're earning our keep.
* Key points as summarized by Stefan Karpinski
    1. You can write the easy version that works, and
    2. You can usually make it fast with a bit more work

# Calling compiled code

## Calling compiled functions through _ccall_

* _R_ provides functions `.C`, `.Fortran` and `.Call` to call functions (subroutines) in compiled code.
* The interface can be tricky, whole chapters of the manual "Writing R Extensions" are devoted to this topic.
* The _Rcpp_ package provides a cleaner interface to _C++_ code but using it is still no picnic.
* In _Julia_ it is uncommon to need to write code in _C/C++_ for performance.
* The _ccall_ facility allows for calling functions in existing _C_ libraries directly from _Julia_, usually without writing interface code.

## Example of _ccall_

* In the _Rmath_ library the _pbinom_ function for evaluating the _cdf_ of the _binomial_ distribution has signature

```{.c}
double pbinom(double x, double n, double p, int lower_tail, int log_p)
```

* We can call this directly from _Julia_ as

```{.julia}
function cdf(d::Binomial, q::Number)
    ccall((:pbinom,:libRmath), Cdouble,
          (Cdouble, Cdouble, Cdouble, Cint, Cint),
          q, d.size, d.prob, 1, 0)
end
```

* _Cdouble_ is a typealias for `Float64`{.julia}, _Cint_ for `Int32`{.julia} making it easier to translate the signature. Pointers and, to some extent, C structs can be passed as well.

# Julia for "Big Data"

## Julia for "wide data"

* Data sets like those from a GWAS study are sometimes called "wide data" because the number of "variables" (SNP locations in this case) can vastly exceed the number of observations (test subjects).
* We showed a simulated example with 1000 subjects and 200,000 SNP locations but much larger studies are common now. Millions of SNP locations and perhaps 10's of thousands of subjects.
* In these cases we must store the data compactly. The combinations of the `.bed`, `.bim` and `.fam` files generated by [plink](http://pngu.mgh.harvard.edu/~purcell/plink/) are common.
* The data are a (very) large matrix of values that can take on 1 of 4 different values (including the missing value). They are stored as 4 values per byte.
## GenData2 data type ```{.julia} type GenData2 gendat::Matrix{Uint8} nsubj::Int counts::Matrix{Int} end ``` ## GenData2 constructor ```{.julia} function GenData2(bnm::ASCIIString) # bnm = basename nsnp = countlines(string(bnm,".bim")) nsubj = countlines(string(bnm,".fam")) bednm = string(bnm,".bed"); s = open(bednm) read(s,Uint8) == 0x6c && read(s,Uint8) == 0x1b || error("wrong magic number in file $bednm") read(s,Uint8) == 1 || error(".bed file, $bednm, is not in correct orientation") m = div(nsubj+3,4) # of bytes per col., nsubj rounded up to next multiple of 4 bb = mmap_array(Uint8, (m,nsnp), s); close(s) GenData2(bb,nsubj,bedfreq(bb,nsubj)') end ``` ## Counter for column frequencies ```{.julia} function bedfreq(b::Matrix{Uint8},ns::Integer) m,n = size(b) div(ns+3,4) == m || error("ns = $ns should be in [$(4m-3),$(4m)] for size(b) = $(size(b))") cc = zeros(Int, 4, n) bpt = convert(Ptr{Uint8},b) for j in 0:(n-1) pj = bpt + j*m for i in 0:(ns-1) cc[1+(unsafe_load(pj,i>>2+1)>>(2i&0x06))&0x03, 1+j] += 1 end end cc end ``` ## Performance the current version * On the "consensus" data set from the "HapMap 3" samples (296 subjects, 1,440,616 SNPs) the time to generate the frequency counts was about 5 seconds on a single process ```{.julia} julia> @time gd = GenData2("/var/tmp/hapmap3/consensus"); elapsed time: 5.662568297 seconds julia> gd.counts' 4x1440616 Int64 Array: 3 0 15 0 245 230 203 147 77 … 4 33 19 29 80 51 20 115 16 3 0 0 0 19 28 3 17 0 1 0 13 1 2 1 5 11 66 123 1 459 433 410 438 290 0 0 0 0 0 0 0 0 1154 1115 1046 1183 480 502 543 596 800 1180 1150 1165 1142 1103 1131 1163 1064 ``` * Using distributed arrays is an obvious next step. ## Julia on "long" data * I hear of people needing to fit models like logistic regression on very large data sets but getting the data are usually proprietary. 
Simulating such data ```{.julia} julia> using GLM julia> n = 2_500_000; srand(1234321) julia> df2 = DataFrame(x1 = rand(Normal(), n), x2 = rand(Exponential(), n), ss = pool(rand(DiscreteUniform(50), n))); julia> beta = unshift!(rand(Normal(),52), 0.5); julia> eta = ModelMatrix(ModelFrame(Formula(:(~ (x1 + x2 + ss))), df2)).m * beta; julia> mu = linkinv!(LogitLink, similar(eta), eta); julia> df2["y"] = float64(rand(n) .< mu); df2["eta"] = eta; julia> df2["mu"] = mu; ``` ## Julia on "long" data (cont'd) ```{.julia} julia> head(df2) x1 x2 ss y eta mu [1,] -0.160767 0.586174 6 1.0 1.92011 0.872151 [2,] -1.48275 0.365982 46 0.0 -0.206405 0.448581 [3,] -0.384419 1.13903 37 1.0 1.22092 0.772226 [4,] -1.3541 1.02183 10 1.0 0.534944 0.630635 [5,] 1.03842 1.1349 34 1.0 3.33319 0.96555 [6,] -1.16268 1.503 6 0.0 2.21061 0.901198 ``` ## Julia on "long" data (cont'd) ```{.julia} julia> @time gm6 = glm(:(y ~ x1 + x2 + ss), df2, Bernoulli(), LogitLink()) elapsed time: 19.711060758 seconds Formula: y ~ :(+(x1,x2,ss)) Coefficients: Estimate Std.Error z value Pr(>|z|) [1,] 0.410529 0.0122097 33.6232 7.69311e-248 [2,] 0.550263 0.00214243 256.84 0.0 [3,] 1.01559 0.00370327 274.241 0.0 [4,] 0.591439 0.0206309 28.6676 9.66624e-181 ... ``` * I had plans for making the numerical work faster but then I profiled the execution and discovered that the time was being spent creating the model matrix. ## _Julia_ summary ### Good points - Speed of a compiled language in a dynamic, REPL-based language. (They said it couldn't be done.) - Core developers are extremely knowledgable (and young). - Syntax "not unlike" _R_ and/or _Matlab_. Many functions have the same names. - generic functions, multiple dispatch and parallelization built into the base language. ### Pitfalls - Need to learn yet another language. Syntax and function names are similar to _R_ but not the same. - Language, package system and packages are still in their infancy (but development is at an amazing pace).