Added ability to sample from intra-het distance distribution
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4836 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
afd4655674
commit
4dbdf7a13d
|
|
@ -1,21 +1,10 @@
|
||||||
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
|
|
||||||
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
|
|
||||||
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
|
|
||||||
#
|
|
||||||
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
|
# pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
|
||||||
# = (1-theta)^2 + theta^2
|
# = (1-theta)^2 + theta^2
|
||||||
#
|
#
|
||||||
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
|
# pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
|
||||||
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
|
# = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
|
||||||
#
|
pOneSiteIsHet <- function(theta) {
|
||||||
pHetPairAtDistance <- function(k, theta) {
|
2 * theta * (1 - theta)
|
||||||
pOneSiteIsHet = 2 * theta * (1 - theta)
|
|
||||||
dexp(k, pOneSiteIsHet)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
|
|
||||||
pHetPairsAtDistances <- function(dists, theta) {
|
|
||||||
prod(pHetPairAtDistance(dists, theta))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# p = 2 * theta * (1 - theta)
|
# p = 2 * theta * (1 - theta)
|
||||||
|
|
@ -32,6 +21,26 @@ meanIntraHetDistanceToTheta <- function(d) {
|
||||||
(-1 + (1 - 2/d)^0.5) / -2
|
(-1 + (1 - 2/d)^0.5) / -2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# For consecutive diploid het sites x and y, P(distance(x,y) = k)
|
||||||
|
# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
|
||||||
|
# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
|
||||||
|
pHetPairAtDistance <- function(k, theta) {
|
||||||
|
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
|
||||||
|
dexp(k, pOneSiteIsHet)
|
||||||
|
}
|
||||||
|
|
||||||
|
# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
|
||||||
|
pHetPairsAtDistances <- function(dists, theta) {
|
||||||
|
prod(pHetPairAtDistance(dists, theta))
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sample numDists distances from the intra-het distance distribution.
|
||||||
|
# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
|
||||||
|
sampleIntraHetDistances <- function(numDists, theta) {
|
||||||
|
pOneSiteIsHetTheta = pOneSiteIsHet(theta)
|
||||||
|
ceiling(rexp(numDists, pOneSiteIsHetTheta)) # round up to get whole-number distances starting from 1
|
||||||
|
}
|
||||||
|
|
||||||
# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
|
# For consecutive diploid het sites x and y, P(distance(x,y) <= k)
|
||||||
pHetPairLteDistance <- function(k, theta) {
|
pHetPairLteDistance <- function(k, theta) {
|
||||||
# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0:
|
# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue