We will be working with a subset of the genotype data from the Human Genome Diversity Panel (HGDP) and HapMap (https://www.sanger.ac.uk/resources/downloads/human/hapmap3.html).
The file “YRI_CEU_ASW_MEX_NAM.bed” is a binary file in PLINK format (as well as corresponding “.bim” and “.fam” files). The file contains genotype data at autosomal SNPs for Native American samples from the HGDP and four population samples from HapMap:
NAM: Native American;
YRI: Yoruba in Ibadan, Nigeria;
CEU: Utah residents with ancestry from Northern and Western Europe;
MXL: Mexican Americans in Los Angeles, California;
ASW: African Americans from the south-western United States.
Install the Bioconductor R packages gdsfmt and SNPRelate
source(“http://bioconductor.org/biocLite.R”)
biocLite(“gdsfmt”)
biocLite(“SNPRelate”)
Load the pakcages gdsfmt and SNPRelate into your R session
if(!require("gdsfmt")){
install.packages("gdsfmt")
stopifnot(require("gdsfmt"))
}
# Loading required package: gdsfmt
if(!require("SNPRelate")){
install.packages("SNPRelate")
stopifnot(require("SNPRelate"))
}
# Loading required package: SNPRelate
# SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
Investigate the contents of the PLINK files .fam and .bim files
Count number of individuals in the study
FAM<-read.table(file="YRI_CEU_ASW_MEX_NAM.fam",sep=" ", header=FALSE,na="NA")
head(FAM)
# V1 V2 V3 V4 V5 V6
# 1 1432 HGDP00702 0 0 2 -9
# 2 1433 HGDP00703 0 0 1 -9
# 3 1434 HGDP00704 0 0 2 -9
# 4 1436 HGDP00706 0 0 2 -9
# 5 1438 HGDP00708 0 0 2 -9
# 6 1440 HGDP00710 0 0 1 -9
dim(FAM)
# [1] 604 6
unique(FAM$V1)
# [1] 1432 1433 1434 1436 1438 1440 1551 1555 1556 1561 1563
# [12] 1564 1567 1570 1571 1572 1573 1574 1575 1576 1577 1578
# [23] 1579 1580 1581 1582 1585 1586 1587 1588 1589 1590 1592
# [34] 1593 1594 1684 1704 1707 1708 1710 1711 1714 1718 1720
# [45] 1721 1722 1723 1726 1727 1740 1744 1746 1747 1750 1753
# [56] 1754 1756 1758 1759 1760 1761 1762 1763 2427 2431 2424
# [67] 2469 2368 2425 2430 2470 2484 2436 2426 2487 2475 2480
# [78] 2418 2471 2474 2490 2433 2495 2477 2492 2485 2494 2446
# [89] 2491 2489 2483 2466 2476 2488 2357 2367 2472 2467 2371
# [100] 2481 2437 2496 2369 2493 2505 1328 1377 1349 1330 1444
# [111] 1344 1463 1418 13291 1424 1346 13292 1354 13281 1451 1421
# [122] 1353 1358 1458 1456 1423 1345 1355 1350 1334 1347 1340
# [133] 1408 1447 1459 1362 1341 1420 1416 1454 1375 M012 M016
# [144] M001 M002 M006 M007 M011 M015 M017 M023 M027 M033 M035
# [155] M004 M005 M010 M026 M031 M034 M036 M039 M008 M032 M037
# [166] M009 M014 M024 M028 2382 2394 2395 2414 2405 Y001 Y014
# [177] Y039 Y075 Y041 Y007 Y073 Y010 Y092 Y006 Y033 Y038 Y030
# [188] Y061 Y111 Y120 Y036 Y003 Y079 Y116 Y100 Y027 Y044 Y002
# [199] Y052 Y110 Y057 Y035 Y019 Y025 Y005 Y045 Y117 Y004 Y043
# [210] Y072 Y023 Y058 Y024 Y074 Y112 Y048 Y017 Y013 Y050 Y056
# [221] Y042 Y047 Y018 Y071 Y028 Y009 Y077 Y060 Y016 Y012 Y040
# [232] Y101 Y051 Y105 Y020 Y022 Y118 Y021 Y054 Y113 Y108 Y059
# [243] Y119 Y062 Y109 Y114 Y032 Y080 Y078 Y102 Y103 Y055 Y049
# [254] Y064 Y034 Y029 Y076
# 257 Levels: 1328 13281 13291 13292 1330 1334 1340 1341 1344 1345 ... Y120
Map Information on number of SNPs
map<-read.table(file="YRI_CEU_ASW_MEX_NAM.bim",sep="\t", header=FALSE,na="NA")
head(map)
# V1 V2 V3 V4 V5 V6
# 1 1 rs9442372 0 1008567 1 2
# 2 1 rs2887286 0 1145994 1 2
# 3 1 rs3813199 0 1148140 1 2
# 4 1 rs6685064 0 1201155 1 2
# 5 1 rs9439462 0 1452629 1 2
# 6 1 rs3820011 0 1878053 1 2
dim(map)
# [1] 150872 6
File with Information on Populations of Sample Individuals
POPINFO=read.table(file="Population_Sample_Info.txt",header=TRUE)
Obtain the number of sample individuals from each population
table(POPINFO$Population)
#
# ASW CEU MXL NAM YRI
# 87 165 86 63 203
Check to make sure the order of individuals is the same as the PLINK file
sum(POPINFO$IID!=FAM$V2)
# [1] 0
Now use functions in the gdsfmt and SNPRelate packages to perform a PCA. A tutorial on gdsfmt and SNPRelate can be found at: http://corearray.sourceforge.net/tutorials/SNPRelate/
Create a GDS file for R from the PLINK files
bedfile<-"YRI_CEU_ASW_MEX_NAM.bed"
famfile<-"YRI_CEU_ASW_MEX_NAM.fam"
bimfile<-"YRI_CEU_ASW_MEX_NAM.bim"
Convert PLINK files into .gds format for R
snpgdsBED2GDS(bedfile, famfile, bimfile, "IPCgeno.gds")
# Start snpgdsBED2GDS ...
# BED file: "YRI_CEU_ASW_MEX_NAM.bed" in the SNP-major mode (Sample X SNP)
# FAM file: "YRI_CEU_ASW_MEX_NAM.fam", DONE.
# BIM file: "YRI_CEU_ASW_MEX_NAM.bim", DONE.
# Fri Jul 12 22:36:07 2019 store sample id, snp id, position, and chromosome.
# start writing: 604 samples, 150872 SNPs ...
# Fri Jul 12 22:36:07 2019 0%
# Fri Jul 12 22:36:08 2019 100%
# Fri Jul 12 22:36:08 2019 Done.
# Optimize the access efficiency ...
# Clean up the fragments of GDS file:
# open the file 'IPCgeno.gds' (22.8M)
# # of fragments: 39
# save to 'IPCgeno.gds.tmp'
# rename 'IPCgeno.gds.tmp' (22.8M, reduced: 252B)
# # of fragments: 18
Open the .gds file
genofile <- snpgdsOpen("IPCgeno.gds")
head(genofile)
# $filename
# [1] "/Users/miaoyan/Desktop/teach/ESEB/html/IPCgeno.gds"
#
# $id
# [1] 0
#
# $root
# + [ ] *
# |--+ sample.id { Str8 604 ZIP_ra(27.7%), 1.3K }
# |--+ snp.id { Str8 150872 ZIP_ra(37.6%), 561.7K }
# |--+ snp.position { Int32 150872 ZIP_ra(90.9%), 535.8K }
# |--+ snp.chromosome { UInt8 150872 ZIP_ra(0.16%), 241B } *
# |--+ snp.allele { Str8 150872 ZIP_ra(0.13%), 765B }
# |--+ genotype { Bit2 604x150872, 21.7M } *
# \--+ sample.annot [ data.frame ] *
# |--+ sex { Str8 604 ZIP_ra(14.2%), 178B }
# \--+ phenotype { Int32 604 ZIP_ra(1.61%), 46B }
#
# $readonly
# [1] TRUE
head(read.gdsn(index.gdsn(genofile, "sample.id")))
# [1] "HGDP00702" "HGDP00703" "HGDP00704" "HGDP00706" "HGDP00708" "HGDP00710"
head(read.gdsn(index.gdsn(genofile, "snp.id")))
# [1] "rs9442372" "rs2887286" "rs3813199" "rs6685064" "rs9439462" "rs3820011"
Extract genotype for a specific SNP
g <- snpgdsGetGeno(genofile, snp.id="rs3813199")
# Genotype matrix: 604 samples X 1 SNPs
hist(g)
Now perform a PCA using a function from the SNPRelate package
pca <- snpgdsPCA(genofile)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
# using 1 (CPU) core
# PCA: the sum of all selected genotypes (0,1,2) = 47894459
# CPU capabilities: Double-Precision SSE2
# Fri Jul 12 22:36:10 2019 (internal increment: 588)
#
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 11s
# Fri Jul 12 22:36:22 2019 Begin (eigenvalues and eigenvectors)
# Fri Jul 12 22:36:22 2019 Done.
Incorporate the population membership in the PCA plot
Get sample id
sample.id <- read.gdsn(index.gdsn(genofile, "sample.id"))
population=POPINFO$Population
tab <- data.frame(sample.id = pca$sample.id,
pop = factor(population)[match(pca$sample.id, sample.id)],
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
head(tab)
# sample.id pop EV1 EV2
# 1 HGDP00702 NAM -0.04914341 -0.09600764
# 2 HGDP00703 NAM -0.04288822 -0.07801091
# 3 HGDP00704 NAM -0.04920633 -0.09414968
# 4 HGDP00706 NAM -0.04910582 -0.09618059
# 5 HGDP00708 NAM -0.04917482 -0.09728481
# 6 HGDP00710 NAM -0.04953930 -0.09574544
plot(tab$EV2, tab$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using all SNPs")
legend("topleft", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))
Now make scatterplots of the top 4 PCs with proportional variance explained included
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
# [1] 11.30 3.99 0.55 0.44 0.40 0.39
lbls <- paste("PC", 1:4, "\n", format(pc.percent[1:4], digits=2), "%", sep="")
pairs(pca$eigenvect[,1:4], col=tab$pop, labels=lbls)
Do we need 150K SNPs for population structure infererence in this sample? Identify a subset of SNPs based on LD threshold of 0.2
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
# SNP pruning based on LD:
# Excluding 0 SNP on non-autosomes
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 150,872 SNPs
# using 1 (CPU) core
# sliding window: 500,000 basepairs, Inf SNPs
# |LD| threshold: 0.2
# method: composite
# Chromosome 1: 28.40%, 3,286/11,572
# Chromosome 2: 26.38%, 3,368/12,766
# Chromosome 3: 27.43%, 2,887/10,524
# Chromosome 4: 28.35%, 2,502/8,825
# Chromosome 5: 27.03%, 2,613/9,668
# Chromosome 6: 26.41%, 2,592/9,814
# Chromosome 7: 27.90%, 2,259/8,097
# Chromosome 8: 25.29%, 2,211/8,743
# Chromosome 9: 26.56%, 1,987/7,482
# Chromosome 10: 27.21%, 2,236/8,217
# Chromosome 11: 26.81%, 2,004/7,474
# Chromosome 12: 28.24%, 2,106/7,457
# Chromosome 13: 27.91%, 1,618/5,798
# Chromosome 14: 27.45%, 1,421/5,176
# Chromosome 15: 26.59%, 1,307/4,916
# Chromosome 16: 27.32%, 1,347/4,930
# Chromosome 17: 31.93%, 1,227/3,843
# Chromosome 18: 28.44%, 1,349/4,743
# Chromosome 19: 35.63%, 795/2,231
# Chromosome 20: 29.06%, 1,209/4,161
# Chromosome 21: 29.80%, 658/2,208
# Chromosome 22: 29.91%, 666/2,227
# 41,648 markers are selected in total.
snpset.id <- unlist(snpset)
Now perform a PCA with the subset of SNPs
pca2 <- snpgdsPCA(genofile, snp.id=snpset.id)
# Principal Component Analysis (PCA) on genotypes:
# Excluding 109,224 SNPs (non-autosomes or non-selection)
# Excluding 0 SNP (monomorphic: TRUE, MAF: NaN, missing rate: NaN)
# Working space: 604 samples, 41,648 SNPs
# using 1 (CPU) core
# PCA: the sum of all selected genotypes (0,1,2) = 11028314
# CPU capabilities: Double-Precision SSE2
# Fri Jul 12 22:36:25 2019 (internal increment: 588)
#
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed in 3s
# Fri Jul 12 22:36:28 2019 Begin (eigenvalues and eigenvectors)
# Fri Jul 12 22:36:29 2019 Done.
tab2 <- data.frame(sample.id = pca2$sample.id,
pop = factor(population)[match(pca2$sample.id, sample.id)],
EV1 = pca2$eigenvect[,1], # the first eigenvector
EV2 = pca2$eigenvect[,2], # the second eigenvector
stringsAsFactors = FALSE)
plot(tab2$EV2, tab2$EV1, col=as.integer(tab$pop), xlab="eigenvector 2", ylab="eigenvector 1", main="PCA using a subset of SNPs")
legend("bottomright", legend=levels(tab$pop), pch="o", col=1:(nlevels(tab$pop)))
Estimate proportional Native American and European Ancestry for the MXL from the PCA.
Assume that MXL have negligible African Ancestry.
avgCEU2=mean(pca2$eigenvect[population=="CEU",2])
avgNAM2=mean(pca2$eigenvect[population=="NAM",2])
MXLadmix=(pca2$eigenvect[population=="MXL",2]-avgNAM2)/(avgCEU2-avgNAM2)
Now make a plot of MXL estimated ancestery from the PCA
tab2=cbind(MXLadmix,1-MXLadmix)
myorder=order(MXLadmix)
temp=t(as.matrix(tab2[myorder,]))
barplot(temp, col=c("blue","green"),xlab="Individual ", ylab="Ancestry", border=NA,axisnames=FALSE,main="Ancestry of MXL",ylim=c(0,1))
legend("bottomright", c("European","Native American"), lwd=4, col=c("blue","green"), bg="white",cex=0.85)
Timothy Thornton and Michael Wu. Summer Institute in Statistical Genetics (SISG) 2017.
D. Jiang and M. Wang. Recent Developments in Statistical Methods for GWAS and High-throughput Sequencing Studies of Complex Traits. Biostatistics and Epidemiology. Vol. 2 (1), 132-159, 2018.
Plink website: http://zzz.bwh.harvard.edu/plink/
Plink software: https://www.cog-genomics.org/plink2