hsrecombi can approximate the genetic-map positions of genetic markers from SNP genotypes in half-sib families. It is referred to Hampel et al. (2018, Fron Gen) and Qanbari & Wittenburg (2020, Genet Sel Evol) for more information on the methodology. The workflow relies on paternal half-sib families but it can be adapted to maternal half-sib families with minor modifications.
In Part I, genotypic data from progeny and their common parents are simulated with the R package AlphaSimR and reshaped into the PLINK format. If genotypic data are already available in the required PLINK format, the workflow starts directly at Part II.
Required input files for Part II:
map_chr<chr>.map
hsphase_input_chr<chr>.raw
Note: A make pipeline for the main analysis (Part II) is available at GitHub.
library(hsrecombi)
library(AlphaSimR)
#> Loading required package: R6
#> Warning: package 'R6' was built under R version 4.0.5
library(doParallel)
#> Warning: package 'doParallel' was built under R version 4.0.5
#> Loading required package: foreach
#> Warning: package 'foreach' was built under R version 4.0.5
#> Loading required package: iterators
#> Warning: package 'iterators' was built under R version 4.0.5
#> Loading required package: parallel
library(ggplot2)
library(rlist)
#> Warning: package 'rlist' was built under R version 4.0.5
Sys.time()
#> [1] "2023-06-07 09:59:55 CEST"
# number of chromosomes
<- 2
nchr # Number of simulated SNPs (e.g., p = 1500 resembles an average bovine chromosome)
<- 150
p # Number of simulated progeny in each half-sib family
<- 1000
n # Directory for (simulated) data
<- "data"
path.dat dir.create(path.dat, showWarnings = FALSE)
# Directory for output
<- "results"
path.res dir.create(path.res, showWarnings = FALSE)
# Number of computing clusters allocated
<- 2 nclust
For instance, executing the following workflow with \(p=1500\) and \(n\geq 1000\) requires about 120 Mb storage space and 10 min computing time (2.6 GHz processor). With \(p=150\), it takes about 7 Mb and 20 sec.
<- runMacs2(nInd = 1000, nChr = nchr, segSites = p)
founderPop <- SimParam$new(founderPop)
SP $setSexes("yes_sys")
SP# Enable tracing location of recombination events
$setTrackRec(TRUE)
SP
<- newPop(founderPop)
pop <- 10
N <- N * n
ntotal <- selectCross(pop = pop, nFemale = 500, nMale = N, use = "rand", nCrosses = ntotal)
my_pop
<- list()
probRec for (chr in 1:nchr) {
<- matrix(0, ncol = p, nrow = ntotal)
co.pat for (i in 1:ntotal) {
if (nrow(SP$recHist[[1000 + i]][[chr]][[2]]) > 1) {
# 1. line contains 1 1 by default
<- SP$recHist[[1000 + i]][[chr]][[2]][-1, 2]
loci <- 1
co.pat[i, loci]
}
}<- colMeans(co.pat)
probRec[[chr]]
}
save(list = c("SP", "founderPop", "pop", "my_pop", "ntotal", "probRec"), file = "data/pop.Rdata")
Two chromosomes, each with \(p\) SNPs and 1 Morgan length, are simulated. The founder population consists of 1,000 animals with equal gender distribution. The follow-up generation consists of \(N\) paternal-half sib families with equal number of progeny \(n\). The study design with the equal number of progeny per family was preferred here for simplicity but this is not required in the subsequent steps.
<- my_pop@father
PAT <- paste(rep(unique(PAT), each = 2), c(1, 2), sep = "_")
rown <- pullSegSiteHaplo(pop)[rown, ]
H.pat <- pullSegSiteGeno(my_pop)
X # Physical position of markers in Mbp
<- lapply(founderPop@genMap, function(z) z * 100) map.snp
Note that founderPop@genMap
includes genetic positions
in Morgan units but physical positions are required in the next
steps.
<- data.frame(Chr = rep(1:length(map.snp), unlist(lapply(map.snp, length))), Name = paste0("SNP",
map 1:length(unlist(map.snp))), locus_Mb = unlist(map.snp), locus_bp = unlist(map.snp) * 1e+06)
colnames(X) <- map$Name
<- "FAM001"
FID <- my_pop@id
IID <- my_pop@mother
MAT <- 2
SEX <- -9
PHENOTYPE
for (chr in 1:nchr) {
write.table(map[map$Chr == chr, ], file.path(path.dat, paste0("map_chr", chr, ".map")), col.names = F,
row.names = F, quote = F)
write.table(cbind(FID, IID, PAT, MAT, SEX, PHENOTYPE, X[, map$Chr == chr]), file.path(path.dat, paste0("hsphase_input_chr",
".raw")), col.names = T, row.names = F, quote = F)
chr, }
<- makeCluster(nclust)
cl registerDoParallel(cl)
<- foreach(chr = 1:nchr, .packages = "hsrecombi") %dopar% {
out
# 1: Physical map
<- read.table(file.path(path.dat, paste0("map_chr", chr, ".map")), col.names = c("Chr", "SNP",
map "locus_Mb", "locus_bp"))
# 2: Genotype matrix
<- data.table::fread(file.path(path.dat, paste0("hsphase_input_chr", chr, ".raw")))
genomatrix <- as.matrix(genomatrix[, -c(1:6)])
X is.na(X)] <- 9 # required for hsphase
X[
# 3: Assign daughters to sire IDs
<- genomatrix$PAT
daughterSire
# 4: Estimate sire haplotypes and format data
<- makehappm(unique(daughterSire), daughterSire, X)
hap save("hap", file = file.path(path.res, paste0("hsphase_output_chr", chr, ".Rdata")))
# Check order and dimension
<- sapply(1:nrow(map), function(z) {
io grepl(x = colnames(X)[z], pattern = map$SNP[z])
})if (sum(io) != nrow(map))
stop("ERROR in dimension")
# 5: Estimate recombination rates
<- hsrecombi(hap, X)
res <- editraw(res, map)
final save(list = c("final", "map"), file = file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
ifelse(nrow(final) > 0, "OK", "no result")
}print(which(unlist(out) == "OK"))
#> [1] 1 2
In step 4, function makehap
would be sufficient but
makehappm
allows comparison with a deterministic approach
shown later. Note that sire haplotypes can also be obtained by some
other (external) software. Then sire haplotypes and sire-to-daughter
information need to be reshaped with makehaplist
in step
4.
# 6a: Filter SNPs with unusually large recombination rate to neighbouring (30) SNPs
<- foreach(chr = 1:nchr, .packages = "hsrecombi") %dopar% {
excl load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
checkCandidates(final, map)
}
# 6b: Heatmap plot of recombination rates for visual verification, e.g.:
<- 2
chr load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
<- excl[[chr]][1]
cand <- match(cand, map$SNP) + (-100:100)
win <- win[(win >= 1) & (win <= nrow(map))]
win
<- final[(final$SNP1 %in% map$SNP[win]) & (final$SNP2 %in% map$SNP[win]), ]
target $SNP1 <- match(target$SNP1, map$SNP)
target$SNP2 <- match(target$SNP2, map$SNP)
target
ggplot(data = target, aes(SNP2, SNP1, fill = theta)) + geom_tile() + xlab("Locus 2") + ylab("Locus 1") +
coord_equal() + scale_y_continuous(trans = "reverse") + theme(panel.background = element_blank(),
panel.grid.major = element_line(colour = "grey", linewidth = 0.1), panel.grid.minor = element_line(colour = "grey")) +
theme(text = element_text(size = 18)) + scale_fill_gradientn(colours = c("yellow", "red"), limits = c(0,
1 + 1e-10), na.value = "white")
# -> nothing conspicious
1]] <- excl[[2]] <- NA excl[[
Orange colour in the SNP neighbourhood implies an increased recombination rate. In case of a misplaced marker or a cluster of misplaced markers, the heatmap is characterised by an orange band.
<- foreach(chr = 1:nchr, .packages = "hsrecombi") %dopar% {
pos load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
geneticPosition(final, map, exclude = excl[[chr]])
}
Genetic position is reported as NA if the corresponding SNP has not been considered in the analysis (e.g., when no sire was heterozygous at the corresponding SNP).
stopCluster(cl)
<- data.frame(Chr = rep(1:length(pos), times = unlist(list.select(pos, length(pos.Mb)))), Mbp = unlist(list.select(pos,
data cM = unlist(list.select(pos, pos.cM)))
pos.Mb)), ggplot(data, aes(x = Mbp, y = cM)) + geom_point(na.rm = T) + facet_grid(Chr ~ .)
The genetic mapping function that fits best to the estimated data can be retrieved from a system of mapping functions (Rao et al. 1977, Human Hered); putatively misplaced markers need to be excluded. Given a parameter \(p\), recombination rate \(\theta \in (0,0.5)\) is mapped to Morgan units by \[ f(\theta; p) = \frac{1}{6}(p(2p - 1)(1 - 4p) log(1 - 2\theta) + 16p(p - 1)(2p - 1) \text{atan}(2\theta) + 2p(1 - p)(8p + 2)\text{atanh}(2\theta) + 6(1 - p)(1 - 2p)(1 - 4p)\theta) \]
for (chr in 1:nchr) {
load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
$theta[(final$SNP1 %in% excl[[chr]]) | (final$SNP2 %in% excl[[chr]])] <- NA
final$dist_M <- (pos[[chr]]$pos.cM[final$SNP2] - pos[[chr]]$pos.cM[final$SNP1])/100
final<- bestmapfun(theta = final$theta, dist_M = final$dist_M)
out plot(final$dist_M, final$theta, xlab = "genetic distance (Morgan)", ylab = "recombination rate",
col = 8)
<- seq(0, 0.5, 0.01)
x <- rao(out$mixing, x)
y points(y, x, type = "l", lwd = 2)
legend("bottomright", legend = paste0("map function (p=", round(out$mixing, 3), ")"), lwd = 2, lty = 1,
bty = "n")
}#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
#> Maximum recombination rate is set to 0.5
The mixing parameter \(p\) is estimated
via quadratic optimisation in bestmapfun
. A value of \(p=0\) would match to Morgan’s mapping
function, \(p=0.25\) to Carter, \(p=0.5\) to Kosambi and \(p=1\) resembles Haldane’s mapping
function.
<- 2 chr
load(file.path(path.dat, "pop.Rdata"))
<- cumsum(probRec[[chr]]) * 100 sim.cM
The position of a recombination event is traced in
SP$recHist
which has a nested list structure (individual,
chromosome, female/male haplotype). The recombination rate between
adjacent SNPs is determined as the proportion of recombinant progeny and
the positions are derived as cumulative sum over recombination
rates.
load(file.path(path.res, paste0("hsphase_output_chr", chr, ".Rdata")))
<- hap$gen hsphase.cM
The results are compared to the deterministic approach of Ferdosi et al. (2014)
provided in the R package hsphase.
Functions of that package are called in makehap
and
makehappm
.
par(mar = c(5.1, 4.1, 4.1, 9.1), xpd = TRUE)
plot(pos[[chr]]$pos.Mb, pos[[chr]]$pos.cM, xlab = "physical position (Mbp)", ylab = "genetic position (cM)",
ylim = range(c(sim.cM, hsphase.cM, pos[[chr]]$pos.cM), na.rm = T), pch = 20)
points(pos[[chr]]$pos.Mb, sim.cM, pch = 20, col = 4)
points(pos[[chr]]$pos.Mb, hsphase.cM, pch = 20, col = 8)
legend("topleft", inset = c(1.01, 0), legend = c("simulated", "likelihood-based", "deterministic"), pch = 20,
col = c(4, 1, 8), bty = "n")
The likelihood-based approach requires a sufficiently large sample size if SNP density is high. A verification of the bias of genetic-map positions can be found in the Supplemental Material of Qanbari & Wittenburg (2020, Genet Sel Evol).
unlink(path.dat, recursive = TRUE)
unlink(path.res, recursive = TRUE)
Sys.time()
#> [1] "2023-06-07 10:00:11 CEST"