Table 1: The key functions in the SeqArray package.
The default setting for the analysis functions in the SeqArray package is serial implementation, but users can setup a cluster computing environment manually via seqParallelSetup()
and distribute the calculations to multiple cores even 100 cluster nodes.
library(SeqArray)
## Loading required package: gdsfmt
# 1000 Genomes, Phase 1, chromosome 22
(gds.fn <- seqExampleFileName("KG_Phase1"))
## [1] "/tmp/RtmpkGyHWJ/Rinst6c674a987c26/SeqArray/extdata/1KG_phase1_release_v3_chr22.gds"
# open a GDS file
genofile <- seqOpen(gds.fn)
seqSummary(genofile)
## File: /tmp/RtmpkGyHWJ/Rinst6c674a987c26/SeqArray/extdata/1KG_phase1_release_v3_chr22.gds
## Reference: GRCh37
## Ploidy: 2
## Number of samples: 1092
## Number of variants: 19773
## Chromosomes:
## 22 <NA>
## 19773 0
## Number of alleles per site:
## 2
## 19773
## Annotation, INFO variable(s):
## Annotation, FORMAT variable(s):
## Annotation, sample variable(s):
## Sample
## Family.ID
## Population
## Population.Description
## Gender
## Relationship
## Unexpected.Parent.Child
## Non.Paternity
## Siblings
## Grandparents
## Avuncular
## Half.Siblings
## Unknown.Second.Order
## Third.Order
## Other.Comments
# use 2 cores for the demonstration
seqParallelSetup(2)
## Enable the computing cluster with 2 forked R processes.
# numbers of alleles per site
table(seqNumAllele(genofile))
##
## 2
## 19773
# reference allele frequencies
summary(seqAlleleFreq(genofile, ref.allele=0))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.9725 0.9963 0.9286 0.9991 1.0000
# close the cluster environment
seqParallelSetup(FALSE)
## Stop the computing cluster.
The SNPRelate package is developed to accelerate two key computations in genome-wide association studies: principal component analysis (PCA) and relatedness analysis using identity-by-descent (IBD) measures. The kernels of SNPRelate are written in C/C++ and have been highly optimized for multi-core symmetric multiprocessing computer architectures. The genotypes in SeqArray format are converted to categorical dosages of reference alleles (0,1,2,NA), which are the data format used in the SNPRelate pacakge.
library(SNPRelate)
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
It is suggested to use a pruned set of SNPs which are in approximate linkage equilibrium with each other to avoid the strong influence of SNP clusters in principal component analysis and relatedness analysis.
set.seed(1000)
# may try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.2)
## SNP pruning based on LD:
## Excluding 0 SNP on non-autosomes
## Excluding 122 SNPs (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 19651 SNPs
## Using 1 (CPU) core
## Sliding window: 500000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## Chromosome 22: 37.66%, 7447/19773
## 7447 SNPs are selected in total.
names(snpset)
## [1] "chr22"
head(snpset$chr22) # snp.id
## [1] 1 3 4 6 7 8
# get all selected snp id
snpset.id <- unlist(snpset)
# Run PCA
pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=2)
## Principal Component Analysis (PCA) on SNP genotypes:
## Excluding 12326 SNPs on non-autosomes
## Excluding 0 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 7447 SNPs
## Using 2 (CPU) cores
## PCA: the sum of all working genotypes (0, 1 and 2) = 15598846
## PCA: Sun Nov 22 19:42:21 2015 0%
## PCA: Sun Nov 22 19:42:23 2015 100%
## PCA: Sun Nov 22 19:42:23 2015 Begin (eigenvalues and eigenvectors)
## PCA: Sun Nov 22 19:42:23 2015 End (eigenvalues and eigenvectors)
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
## [1] 1.22 0.84 0.29 0.28 0.27 0.26
Population information are available:
pop.code <- factor(seqGetData(genofile, "sample.annotation/Population"))
head(pop.code)
## [1] GBR GBR GBR GBR GBR GBR
## Levels: ASW CEU CHB CHS CLM FIN GBR IBS JPT LWK MXL PUR TSI YRI
popgroup <- list(
EastAsia = c("CHB", "JPT", "CHS", "CDX", "KHV", "CHD"),
European = c("CEU", "TSI", "GBR", "FIN", "IBS"),
African = c("ASW", "ACB", "YRI", "LWK", "GWD", "MSL", "ESN"),
SouthAmerica = c("MXL", "PUR", "CLM", "PEL"),
India = c("GIH", "PJL", "BEB", "STU", "ITU"))
colors <- sapply(levels(pop.code), function(x) {
for (i in 1:length(popgroup)) {
if (x %in% popgroup[[i]])
return(names(popgroup)[i])
}
NA
})
colors <- as.factor(colors)
legend.text <- sapply(levels(colors), function(x) paste(levels(pop.code)[colors==x], collapse=","))
legend.text
## African EastAsia European
## "ASW,LWK,YRI" "CHB,CHS,JPT" "CEU,FIN,GBR,IBS,TSI"
## SouthAmerica
## "CLM,MXL,PUR"
# make a data.frame
tab <- data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1], # the first eigenvector
EV2 = pca$eigenvect[,2], # the second eigenvector
Population = pop.code,
stringsAsFactors = FALSE)
head(tab)
## sample.id EV1 EV2 Population
## 1 HG00096 -0.01514785 0.03691694 GBR
## 2 HG00097 -0.01236666 0.02596987 GBR
## 3 HG00099 -0.01042858 0.03828143 GBR
## 4 HG00100 -0.01514046 0.02695951 GBR
## 5 HG00101 -0.01188685 0.02858924 GBR
## 6 HG00102 -0.01509719 0.03023233 GBR
# draw
plot(tab$EV2, tab$EV1, pch=20, cex=0.75, main="1KG Phase 1, chromosome 22",
xlab="eigenvector 2", ylab="eigenvector 1", col=colors[tab$Population])
legend("topleft", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75)
For \(n\) study individuals, snpgdsIBS()
can be used to create a \(n \times n\) matrix of genome-wide average IBS pairwise identities. To perform cluster analysis on the \(n \times n\) matrix of genome-wide IBS pairwise distances, and determine the groups by a permutation score:
set.seed(1000)
ibs.hc <- snpgdsHCluster(snpgdsIBS(genofile, num.thread=2))
## Identity-By-State (IBS) analysis on SNP genotypes:
## Excluding 0 SNP on non-autosomes
## Excluding 122 SNPs (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 1092 samples, 19651 SNPs
## Using 2 (CPU) cores
## IBS: the sum of all working genotypes (0, 1 and 2) = 39869601
## IBS: Sun Nov 22 19:42:25 2015 0%
## IBS: Sun Nov 22 19:42:28 2015 100%
Here is the population information we have known:
# Determine groups of individuals by population information
rv <- snpgdsCutTree(ibs.hc, samp.group=as.factor(colors[pop.code]))
## Create 4 groups.
plot(rv$dendrogram, leaflab="none", main="1KG Phase 1, chromosome 22",
edgePar=list(col=rgb(0.5,0.5,0.5,0.75), t.col="black"))
legend("bottomleft", legend=legend.text, col=1:length(legend.text), pch=19, cex=0.75, ncol=4)
# close the GDS file
seqClose(genofile)
An R/Bioconductor package SeqVarTools is available on Bioconductor, which defines S4 classes and methods for other common operations and analyses on SeqArray datasets.
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] SNPRelate_1.4.0 SeqArray_1.10.6 gdsfmt_1.6.2
##
## loaded via a namespace (and not attached):
## [1] AnnotationDbi_1.32.1 knitr_1.11
## [3] XVector_0.10.0 magrittr_1.5
## [5] GenomicAlignments_1.6.1 GenomicRanges_1.22.1
## [7] BiocGenerics_0.16.1 zlibbioc_1.16.0
## [9] IRanges_2.4.4 BiocParallel_1.4.0
## [11] BSgenome_1.38.0 stringr_1.0.0
## [13] GenomeInfoDb_1.6.1 tools_3.2.2
## [15] SummarizedExperiment_1.0.1 parallel_3.2.2
## [17] Biobase_2.30.0 DBI_0.3.1
## [19] lambda.r_1.1.7 futile.logger_1.4.1
## [21] htmltools_0.2.6 yaml_2.1.13
## [23] digest_0.6.8 rtracklayer_1.30.1
## [25] formatR_1.2.1 futile.options_1.0.0
## [27] S4Vectors_0.8.3 bitops_1.0-6
## [29] biomaRt_2.26.1 RCurl_1.95-4.7
## [31] RSQLite_1.0.0 evaluate_0.8
## [33] rmarkdown_0.8.1 stringi_1.0-1
## [35] GenomicFeatures_1.22.5 Biostrings_2.38.2
## [37] Rsamtools_1.22.0 XML_3.98-1.3
## [39] stats4_3.2.2 VariantAnnotation_1.16.3