We will be working with a subset of the genotype data from the Human Genome Diversity Panel (HGDP) and HapMap (https://www.sanger.ac.uk/resources/downloads/human/hapmap3.html).

The file “YRI_CEU_ASW_MEX_NAM.bed” is a binary file in PLINK format (as well as corresponding “.bim” and “.fam” files). The file contains genotype data at autosomal SNPs for Native American samples from the HGDP and four population samples from HapMap:

NAM: Native American;

YRI: Yoruba in Ibadan, Nigeria;

CEU: Utah residents with ancestry from Northern and Western Europe;

MXL: Mexican Americans in Los Angeles, California;

ASW: African Americans from the south-western United States.

1. File preparation

Install the Bioconductor R packages gdsfmt and SNPRelate

source(“http://bioconductor.org/biocLite.R”)

biocLite(“gdsfmt”)

biocLite(“SNPRelate”)

Load the pakcages gdsfmt and SNPRelate into your R session

if(!require("gdsfmt")){
  install.packages("gdsfmt")
  stopifnot(require("gdsfmt"))
}
# Loading required package: gdsfmt
if(!require("SNPRelate")){
  install.packages("SNPRelate")
  stopifnot(require("SNPRelate"))
}
# Loading required package: SNPRelate
# SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)

Investigate the contents of the PLINK files .fam and .bim files

Count number of individuals in the study

FAM<-read.table(file="YRI_CEU_ASW_MEX_NAM.fam",sep=" ", header=FALSE,na="NA")
head(FAM)
#     V1        V2 V3 V4 V5 V6
# 1 1432 HGDP00702  0  0  2 -9
# 2 1433 HGDP00703  0  0  1 -9
# 3 1434 HGDP00704  0  0  2 -9
# 4 1436 HGDP00706  0  0  2 -9
# 5 1438 HGDP00708  0  0  2 -9
# 6 1440 HGDP00710  0  0  1 -9
dim(FAM)
# [1] 604   6
unique(FAM$V1)
#   [1] 1432  1433  1434  1436  1438  1440  1551  1555  1556  1561  1563 
#  [12] 1564  1567  1570  1571  1572  1573  1574  1575  1576  1577  1578 
#  [23] 1579  1580  1581  1582  1585  1586  1587  1588  1589  1590  1592 
#  [34] 1593  1594  1684  1704  1707  1708  1710  1711  1714  1718  1720 
#  [45] 1721  1722  1723  1726  1727  1740  1744  1746  1747  1750  1753 
#  [56] 1754  1756  1758  1759  1760  1761  1762  1763  2427  2431  2424 
#  [67] 2469  2368  2425  2430  2470  2484  2436  2426  2487  2475  2480 
#  [78] 2418  2471  2474  2490  2433  2495  2477  2492  2485  2494  2446 
#  [89] 2491  2489  2483  2466  2476  2488  2357  2367  2472  2467  2371 
# [100] 2481  2437  2496  2369  2493  2505  1328  1377  1349  1330  1444 
# [111] 1344  1463  1418  13291 1424  1346  13292 1354  13281 1451  1421 
# [122] 1353  1358  1458  1456  1423  1345  1355  1350  1334  1347  1340 
# [133] 1408  1447  1459  1362  1341  1420  1416  1454  1375  M012  M016 
# [144] M001  M002  M006  M007  M011  M015  M017  M023  M027  M033  M035 
# [155] M004  M005  M010  M026  M031  M034  M036  M039  M008  M032  M037 
# [166] M009  M014  M024  M028  2382  2394  2395  2414  2405  Y001  Y014 
# [177] Y039  Y075  Y041  Y007  Y073  Y010  Y092  Y006  Y033  Y038  Y030 
# [188] Y061  Y111  Y120  Y036  Y003  Y079  Y116  Y100  Y027  Y044  Y002 
# [199] Y052  Y110  Y057  Y035  Y019  Y025  Y005  Y045  Y117  Y004  Y043 
# [210] Y072  Y023  Y058  Y024  Y074  Y112  Y048  Y017  Y013  Y050  Y056 
# [221] Y042  Y047  Y018  Y071  Y028  Y009  Y077  Y060  Y016  Y012  Y040 
# [232] Y101  Y051  Y105  Y020  Y022  Y118  Y021  Y054  Y113  Y108  Y059 
# [243] Y119  Y062  Y109  Y114  Y032  Y080  Y078  Y102  Y103  Y055  Y049 
# [254] Y064  Y034  Y029  Y076 
# 257 Levels: 1328 13281 13291 13292 1330 1334 1340 1341 1344 1345 ... Y120

Map Information on number of SNPs

map<-read.table(file="YRI_CEU_ASW_MEX_NAM.bim",sep="\t", header=FALSE,na="NA")
head(map)
#   V1        V2 V3      V4 V5 V6
# 1  1 rs9442372  0 1008567  1  2
# 2  1 rs2887286  0 1145994  1  2
# 3  1 rs3813199  0 1148140  1  2
# 4  1 rs6685064  0 1201155  1  2
# 5  1 rs9439462  0 1452629  1  2
# 6  1 rs3820011  0 1878053  1  2
dim(map)
# [1] 150872      6

File with Information on Populations of Sample Individuals

POPINFO=read.table(file="Population_Sample_Info.txt",header=TRUE)

Obtain the number of sample individuals from each population

table(POPINFO$Population)
# 
# ASW CEU MXL NAM YRI 
#  87 165  86  63 203

Check to make sure the order of individuals is the same as the PLINK file

sum(POPINFO$IID!=FAM$V2)
# [1] 0

Now use functions in the gdsfmt and SNPRelate packages to perform a PCA. A tutorial on gdsfmt and SNPRelate can be found at: http://corearray.sourceforge.net/tutorials/SNPRelate/

Create a GDS file for R from the PLINK files

bedfile<-"YRI_CEU_ASW_MEX_NAM.bed" 
famfile<-"YRI_CEU_ASW_MEX_NAM.fam" 
bimfile<-"YRI_CEU_ASW_MEX_NAM.bim" 

Convert PLINK files into .gds format for R

snpgdsBED2GDS(bedfile, famfile, bimfile, "IPCgeno.gds")
# Start snpgdsBED2GDS ...
#   BED file: "YRI_CEU_ASW_MEX_NAM.bed" in the SNP-major mode (Sample X SNP)
#   FAM file: "YRI_CEU_ASW_MEX_NAM.fam", DONE.
#   BIM file: "YRI_CEU_ASW_MEX_NAM.bim", DONE.
# Fri Jul 12 22:36:07 2019  store sample id, snp id, position, and chromosome.
#   start writing: 604 samples, 150872 SNPs ...
#   Fri Jul 12 22:36:07 2019    0%
#   Fri Jul 12 22:36:08 2019    100%
# Fri Jul 12 22:36:08 2019  Done.
# Optimize the access efficiency ...
# Clean up the fragments of GDS file:
#     open the file 'IPCgeno.gds' (22.8M)
#     # of fragments: 39
#     save to 'IPCgeno.gds.tmp'
#     rename 'IPCgeno.gds.tmp' (22.8M, reduced: 252B)
#     # of fragments: 18

Open the .gds file

genofile <- snpgdsOpen("IPCgeno.gds")
head(genofile)
# $filename
# [1] "/Users/miaoyan/Desktop/teach/ESEB/html/IPCgeno.gds"
# 
# $id
# [1] 0
# 
# $root
# +    [  ] *
# |--+ sample.id   { Str8 604 ZIP_ra(27.7%), 1.3K }
# |--+ snp.id   { Str8 150872 ZIP_ra(37.6%), 561.7K }
# |--+ snp.position   { Int32 150872 ZIP_ra(90.9%), 535.8K }
# |--+ snp.chromosome   { UInt8 150872 ZIP_ra(0.16%), 241B } *
# |--+ snp.allele   { Str8 150872 ZIP_ra(0.13%), 765B }
# |--+ genotype   { Bit2 604x150872, 21.7M } *
# \--+ sample.annot   [ data.frame ] *
#    |--+ sex   { Str8 604 ZIP_ra(14.2%), 178B }
#    \--+ phenotype   { Int32 604 ZIP_ra(1.61%), 46B }
# 
# $readonly
# [1] TRUE

head(read.gdsn(index.gdsn(genofile, "sample.id")))
# [1] "HGDP00702" "HGDP00703" "HGDP00704" "HGDP00706" "HGDP00708" "HGDP00710"
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# [1] "rs9442372" "rs2887286" "rs3813199" "rs6685064" "rs9439462" "rs3820011"

Extract genotype for a specific SNP

g <- snpgdsGetGeno(genofile, snp.id="rs3813199")
# Genotype matrix: 604 samples X 1 SNPs

hist(g)

1. Perform a PCA for the sample using all of the SNPs. Make a scatterplot of the first two principal components (PCs) with each point colored according to population membership.

Now perform a PCA using a function from the SNPRelate package

pca <- snpgdsPCA(genofile)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 47894459
# CPU capabilities: Double-Precision SSE2
# Fri Jul 12 22:36:10 2019    (internal increment: 588)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 11s
# Fri Jul 12 22:36:22 2019    Begin (eigenvalues and eigenvectors)
# Fri Jul 12 22:36:22 2019    Done.

Incorporate the population membership in the PCA plot

Get sample id

sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
population=POPINFO$Population

tab <- data.frame(sample.id = pca$sample.id,
                  pop = factor(population)[match(pca$sample.id, sample.id)],
                  EV1 = pca$eigenvect[,1],    # the first eigenvector
                  EV2 = pca$eigenvect[,2],    # the second eigenvector
                  stringsAsFactors = FALSE)
head(tab)
#   sample.id pop         EV1         EV2
# 1 HGDP00702 NAM -0.04914341 -0.09600764
# 2 HGDP00703 NAM -0.04288822 -0.07801091
# 3 HGDP00704 NAM -0.04920633 -0.09414968
# 4 HGDP00706 NAM -0.04910582 -0.09618059
# 5 HGDP00708 NAM -0.04917482 -0.09728481
# 6 HGDP00710 NAM -0.04953930 -0.09574544



plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using all SNPs")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

Now make scatterplots of the top 4 PCs with proportional variance explained included

pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
# [1] 11.30  3.99  0.55  0.44  0.40  0.39
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)

2. Now redo question 1 above using a subset of SNPs that are selected based on having linkage disequilibrium between SNPs that is no greater than 0.2.

Do we need 150K SNPs for population structure infererence in this sample? Identify a subset of SNPs based on LD threshold of 0.2

snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
# SNP pruning based on LD:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
#     using 1 (CPU) core
#     sliding window: 500,000 basepairs, Inf SNPs
#     |LD| threshold: 0.2
#     method: composite
# Chromosome 1: 28.40%, 3,286/11,572
# Chromosome 2: 26.38%, 3,368/12,766
# Chromosome 3: 27.43%, 2,887/10,524
# Chromosome 4: 28.35%, 2,502/8,825
# Chromosome 5: 27.03%, 2,613/9,668
# Chromosome 6: 26.41%, 2,592/9,814
# Chromosome 7: 27.90%, 2,259/8,097
# Chromosome 8: 25.29%, 2,211/8,743
# Chromosome 9: 26.56%, 1,987/7,482
# Chromosome 10: 27.21%, 2,236/8,217
# Chromosome 11: 26.81%, 2,004/7,474
# Chromosome 12: 28.24%, 2,106/7,457
# Chromosome 13: 27.91%, 1,618/5,798
# Chromosome 14: 27.45%, 1,421/5,176
# Chromosome 15: 26.59%, 1,307/4,916
# Chromosome 16: 27.32%, 1,347/4,930
# Chromosome 17: 31.93%, 1,227/3,843
# Chromosome 18: 28.44%, 1,349/4,743
# Chromosome 19: 35.63%, 795/2,231
# Chromosome 20: 29.06%, 1,209/4,161
# Chromosome 21: 29.80%, 658/2,208
# Chromosome 22: 29.91%, 666/2,227
# 41,648 markers are selected in total.

snpset.id <- unlist(snpset)

Now perform a PCA with the subset of SNPs

pca2 <- snpgdsPCA(genofile, snp.id=snpset.id)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 109,224 SNPs (non-autosomes or non-selection)
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 41,648 SNPs
#     using 1 (CPU) core
# PCA:    the sum of all selected genotypes (0,1,2) = 11028314
# CPU capabilities: Double-Precision SSE2
# Fri Jul 12 22:36:25 2019    (internal increment: 588)
# 
[..................................................]  0%, ETC: ---    
[==================================================] 100%, completed in 3s
# Fri Jul 12 22:36:28 2019    Begin (eigenvalues and eigenvectors)
# Fri Jul 12 22:36:29 2019    Done.

tab2 <- data.frame(sample.id = pca2$sample.id,
                   pop = factor(population)[match(pca2$sample.id, sample.id)],
                   EV1 = pca2$eigenvect[,1],    # the first eigenvector
                   EV2 = pca2$eigenvect[,2],    # the second eigenvector
                   stringsAsFactors = FALSE)


plot(tab2$EV2, tab2$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using a subset of SNPs")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))

3. Predict proportional Native American and European Ancestry for the HapMap MXL from the PCA using one of the principal components.

Estimate proportional Native American and European Ancestry for the MXL from the PCA.

Assume that MXL have negligible African Ancestry.

avgCEU2=mean(pca2$eigenvect[population=="CEU",2])
avgNAM2=mean(pca2$eigenvect[population=="NAM",2])

MXLadmix=(pca2$eigenvect[population=="MXL",2]-avgNAM2)/(avgCEU2-avgNAM2)

4. Make a barplot of the proportional ancestry estimates from question 3.

Now make a plot of MXL estimated ancestery from the PCA

tab2=cbind(MXLadmix,1-MXLadmix)
myorder=order(MXLadmix)
temp=t(as.matrix(tab2[myorder,]))


barplot(temp, col=c("blue","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,main="Ancestry of MXL",ylim=c(0,1))
legend("bottomright", c("European","Native American"), lwd=4, col=c("blue","green"), bg="white",cex=0.85)

References:

Timothy Thornton and Michael Wu. Summer Institute in Statistical Genetics (SISG) 2017.

D. Jiang and M. Wang. Recent Developments in Statistical Methods for GWAS and High-throughput Sequencing Studies of Complex Traits. Biostatistics and Epidemiology. Vol. 2 (1), 132-159, 2018.

Plink website: http://zzz.bwh.harvard.edu/plink/

Plink software: https://www.cog-genomics.org/plink2