Added ability to sample from intra-het distance distribution

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4836 348d0f76-0448-11de-a6fe-93d51630548a
2010-12-14 18:09:03 +00:00 · 2010-12-14 18:09:03 +00:00 · 4dbdf7a13d
parent afd4655674
commit 4dbdf7a13d
1 changed files with 22 additions and 13 deletions
--- a/R/phasing/RBP_theoretical.R
+++ b/R/phasing/RBP_theoretical.R
@ -1,21 +1,10 @@
-# For consecutive diploid het sites x and y, P(distance(x,y) = k)
-# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
-# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
-#
 # pOneSiteIsHom = p(top chromosome is ref AND bottom chromosome is ref) + p(top chromosome is var AND bottom chromosome is var)
 # = (1-theta)^2 + theta^2
 #
 # pOneSiteIsHet = p(top chromosome is ref AND bottom chromosome is var) + p(top chromosome is var AND bottom chromosome is ref)
 # = (1-theta)*theta + theta*(1-theta) = 2*theta*(1-theta)
-#
-pHetPairAtDistance <- function(k, theta) {
-	pOneSiteIsHet = 2 * theta * (1 - theta)
-	dexp(k, pOneSiteIsHet)
-}
-
-# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
-pHetPairsAtDistances <- function(dists, theta) {
-	prod(pHetPairAtDistance(dists, theta))
+pOneSiteIsHet <- function(theta) {
+	2 * theta * (1 - theta)
 }

 # p = 2 * theta * (1 - theta)
@ -32,6 +21,26 @@ meanIntraHetDistanceToTheta <- function(d) {
 	(-1 + (1 - 2/d)^0.5) / -2
 }

+# For consecutive diploid het sites x and y, P(distance(x,y) = k)
+# = P(site y is the first het site downstream of x at distance = k | het site x exists at its location).
+# That is, het site x already "exists", and we want to know what the probability that the NEXT het site (y) is k bases away.
+pHetPairAtDistance <- function(k, theta) {
+	pOneSiteIsHetTheta = pOneSiteIsHet(theta)
+	dexp(k, pOneSiteIsHet)
+}
+
+# Since the geometric/exponential distribution is "memory-free", can simply multiply the (independent) probabilities for the distances:
+pHetPairsAtDistances <- function(dists, theta) {
+	prod(pHetPairAtDistance(dists, theta))
+}
+
+# Sample numDists distances from the intra-het distance distribution.
+# [since the geometric/exponential distribution is "memory-free", can simply **independently** sample from the distribution]:
+sampleIntraHetDistances <- function(numDists, theta) {
+	pOneSiteIsHetTheta = pOneSiteIsHet(theta)
+	ceiling(rexp(numDists, pOneSiteIsHetTheta))  # round up to get whole-number distances starting from 1
+}
+
 # For consecutive diploid het sites x and y, P(distance(x,y) <= k)
 pHetPairLteDistance <- function(k, theta) {
 	# Although the real minimum distance starts with 1 (geometric distribution), the exponential distribution approximation starts with 0: