Private mutation simulator and analysis routines for EOMI paper

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4960 348d0f76-0448-11de-a6fe-93d51630548a
2011-01-07 21:23:29 +00:00 · 2011-01-07 21:23:29 +00:00 · 3362f0c280
parent 60f45a7c49
commit 3362f0c280
2 changed files with 169 additions and 0 deletions
--- a/R/privateMutations.R
+++ b/R/privateMutations.R
@ -0,0 +1,28 @@
+MAX_AC = 10000
+normHist <- function(d, m) {
+	x = hist(d$true.ac, breaks=1:20000, plot=F)$counts[1:MAX_AC]
+	x / sum(x)
+}
+	
+f <- function(d, acs) {
+	cols = rainbow(length(acs), alpha=0.75)
+	y = normHist(subset(afs, small.ac == acs[1]))
+	x = 1:length(y) / max(d$true.an)
+	plot(x, y, type="l", col=cols[1], xlab="True MAF in full population", ylab="Frequency", lwd=3, log="x")
+	for (i in 2:length(acs)) {
+		points(x, normHist(subset(afs, small.ac == acs[i])), type="l", col=cols[i], lwd=3)
+	}
+	
+	legend("topright", legend=lapply(acs, function(x) paste("AC =", x)), fill=cols, title="Sub-population")
+}
+
+expected <- function(ps, N, eps) {
+	co = 2 * N / ( 1 - eps )
+	v = co * ((1 - ps)/(1-eps))^(2 * N - 1)
+	v / sum(v)
+}
+
+f(afs, c(1,2,3,5,10,50))
+x = 1:MAX_AC / 200000
+points(x, expected(x,1000,1e-8),type="l",lty=3,lwd=3)
+
--- a/python/privateMutationRates.py
+++ b/python/privateMutationRates.py
@ -0,0 +1,141 @@
+import sys
+from optparse import OptionParser
+from itertools import *
+import random
+
+# a simple script that does:
+# 1 -- generates a master set of variants following the neutral expectation from a single big population
+# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants
+# 3 -- writes out the genotypes of these individuals, and their allele frequency
+def main():
+    global OPTIONS
+    usage = "usage: %prog [options] outputFile"
+    parser = OptionParser(usage=usage)
+    
+    parser.add_option("-N", "", dest="bigPopSize",
+                        type='int', default=1000,
+                        help="")         
+
+    parser.add_option("-M", "", dest="smallPopSize",
+                        type='int', default=100,
+                        help="")         
+
+    parser.add_option("-K", "", dest="nHetsPerSample",
+                        type='int', default=1000,
+                        help="")         
+
+    parser.add_option("", "--maxMAF", dest="maxMAF",
+                        type='float', default=None,
+                        help="")         
+         
+    (OPTIONS, args) = parser.parse_args()
+    if len(args) != 1:
+        parser.error("Takes no arguments")
+
+    random.seed(10000)
+    genotypes = simulateSeqExpt(OPTIONS.bigPopSize, OPTIONS.smallPopSize, OPTIONS.nHetsPerSample)
+    printGenotypes(genotypes, open(args[0] + ".genotypes", 'w'))
+    printAFS(genotypes, open(args[0] + ".afs", 'w'))
+
+class Variant:
+    def __init__(self, id, trueAC, trueAN):
+        self.id = "%d.%d" % ( trueAC, id )
+        self.trueAC = trueAC
+        self.trueAN = trueAN
+
+        q = self.af()
+        p = 1 - q
+        self.hw = [p * p, 2 * p * q, q * q]
+        
+    def __str__(self):
+        return "[V %s ac=%d an=%d af=%.2f]" % (self.id, self.trueAC, self.trueAN, self.af())
+    __repr__ = __str__
+        
+    def af(self):
+        return self.trueAC / (1.0*self.trueAN)
+        
+    def hwe(self): # returns phomref, phet, phomvar
+        return self.hw
+
+def simulateSeqExpt(bigPopSize, smallPopSize, nHetsPerSample):
+    """Master runner function"""
+    trueAFS = makeAFS(bigPopSize, nHetsPerSample)
+    
+    variants = AFStoVariants(trueAFS, bigPopSize)
+    
+    # returns a list of variants per sample
+    genotypes = genotypeSamples(variants, smallPopSize)
+    
+    return genotypes
+    
+def makeAFS(nSamples, nHetsPerSample):
+    """Generates allele frequency spectrum counts for nsamples and nHetsPerSample from neutral expectation"""
+    nTotalVariants = nHetsPerSample * sum([1 / (1.0*i) for i in range(1, nSamples * 2 + 1)])
+    AFSCounts = [int(round(nHetsPerSample / (1.0*i))) for i in range(1, nSamples * 2 + 1)]
+    print AFSCounts
+    print nTotalVariants
+    print sum(AFSCounts)
+    return AFSCounts
+
+def AFStoVariants(trueAFS, bigPopSize):
+    """Converts an allele frequency spectrum to specific named Variant objects"""
+    variants = []
+
+    nChromosomes = 2 * bigPopSize 
+    for ac in range(len(trueAFS)):
+        af = (1.0*ac) / nChromosomes
+        if OPTIONS.maxMAF == None or af <= OPTIONS.maxMAF: 
+            for j in range(trueAFS[ac]):
+                v = Variant(j, ac+1, nChromosomes)
+                #print ac, j, v
+                variants.append(v)
+        else:
+            print 'Skipping AC', ac, ' / ', nChromosomes, 'beyond max MAF', OPTIONS.maxMAF
+
+    return variants
+
+# returns a list of variants per sample
+def genotypeSamples(variants, nSamples):
+    """Given a list of variants, generates nSamples genotypes"""
+    return [genotypeSample(samplei, variants) for samplei in range(nSamples)]
+
+def genotypeSample(id, variants):
+    """Generate a single set of genotypes for a single using the list of variants"""
+    print 'Genotyping sample', id
+    genotypes = []
+    for v in variants:
+        pHomRef, pHet, pHomVar = v.hwe()
+        r = random.random()
+        if r > pHomRef:             # are we not reference?
+            if r > pHomRef + pHet:  # are we hom var?
+                count = 2
+            else:
+                count = 1
+            #print (r, v.af(), pHomRef, pHet, pHomVar, count)
+            genotypes.append([v, count])
+
+    return genotypes    
+
+def printGenotypes(sampleGenotypes, out):
+    print >> out, "\t".join(["sample", "id", "ac", "an", "g"])
+    for sample, i in izip(sampleGenotypes, count(len(sampleGenotypes))):
+        for v, g in sample:
+            print >> out, "\t".join(map(str, [i-1, v.id, v.trueAC, v.trueAN, g]))
+
+def printAFS(sampleGenotypes, out):
+    print >> out, "\t".join(["id", "true.ac", "true.an", "true.af", "small.ac", "small.an", "small.af"])
+    counts = dict()
+
+    smallAN = len(sampleGenotypes) * 2
+    for sample in sampleGenotypes:
+        for v, g in sample:
+            if v not in counts: counts[v] = 0
+            counts[v] = counts[v] + g
+            
+
+    for v, smallAC in counts.iteritems():
+        print >> out, "\t".join(map(str, [v.id, v.trueAC, v.trueAN, v.af(), smallAC, smallAN, smallAC / (1.0*smallAN)]))
+
+
+if __name__ == "__main__":
+    main()