Added ability to sample from intra-het distance distribution

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4836 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
fromer 2010-12-14 18:09:03 +00:00
parent afd4655674
commit 4dbdf7a13d
1 changed files with 22 additions and 13 deletions

View File

@ -1,21 +1,10 @@
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
#
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
# = (1-theta)^2 + theta^2
#
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
#
pHetPairAtDistance <- function(k, theta) {
pOneSiteIsHet = 2 * theta * (1 - theta)
dexp(k, pOneSiteIsHet)
}
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
pHetPairsAtDistances <- function(dists, theta) {
prod(pHetPairAtDistance(dists, theta))
pOneSiteIsHet <- function(theta) {
2 * theta * (1 - theta)
}
# p = 2 * theta * (1 - theta)
@ -32,6 +21,26 @@ meanIntraHetDistanceToTheta <- function(d) {
(-1 + (1 - 2/d)^0.5) / -2
}
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
pHetPairAtDistance <- function(k, theta) {
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
dexp(k, pOneSiteIsHet)
}
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
pHetPairsAtDistances <- function(dists, theta) {
prod(pHetPairAtDistance(dists, theta))
}
# Sample numDists distances from the intra-het distance distribution.
# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
sampleIntraHetDistances <- function(numDists, theta) {
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
ceiling(rexp(numDists, pOneSiteIsHetTheta)) # round up to get whole-number distances starting from 1
}
# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
pHetPairLteDistance <- function(k, theta) {
# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0: