diff --git a/R/privateMutations.R b/R/privateMutations.R new file mode 100644 index 000000000..a23e28247 --- /dev/null +++ b/R/privateMutations.R @@ -0,0 +1,28 @@ +MAX_AC = 10000 +normHist <- function(d, m) { + x = hist(d$true.ac, breaks=1:20000, plot=F)$counts[1:MAX_AC] + x / sum(x) +} + +f <- function(d, acs) { + cols = rainbow(length(acs), alpha=0.75) + y = normHist(subset(afs, small.ac == acs[1])) + x = 1:length(y) / max(d$true.an) + plot(x, y, type="l", col=cols[1], xlab="True MAF in full population", ylab="Frequency", lwd=3, log="x") + for (i in 2:length(acs)) { + points(x, normHist(subset(afs, small.ac == acs[i])), type="l", col=cols[i], lwd=3) + } + + legend("topright", legend=lapply(acs, function(x) paste("AC =", x)), fill=cols, title="Sub-population") +} + +expected <- function(ps, N, eps) { + co = 2 * N / ( 1 - eps ) + v = co * ((1 - ps)/(1-eps))^(2 * N - 1) + v / sum(v) +} + +f(afs, c(1,2,3,5,10,50)) +x = 1:MAX_AC / 200000 +points(x, expected(x,1000,1e-8),type="l",lty=3,lwd=3) + diff --git a/python/privateMutationRates.py b/python/privateMutationRates.py new file mode 100755 index 000000000..6bdaf5401 --- /dev/null +++ b/python/privateMutationRates.py @@ -0,0 +1,141 @@ +import sys +from optparse import OptionParser +from itertools import * +import random + +# a simple script that does: +# 1 -- generates a master set of variants following the neutral expectation from a single big population +# 2 -- randomly generates M individuals with variants and genotypes sampled as expected from the big population of variants +# 3 -- writes out the genotypes of these individuals, and their allele frequency +def main(): + global OPTIONS + usage = "usage: %prog [options] outputFile" + parser = OptionParser(usage=usage) + + parser.add_option("-N", "", dest="bigPopSize", + type='int', default=1000, + help="") + + parser.add_option("-M", "", dest="smallPopSize", + type='int', default=100, + help="") + + parser.add_option("-K", "", dest="nHetsPerSample", + type='int', default=1000, + help="") + + parser.add_option("", "--maxMAF", dest="maxMAF", + type='float', default=None, + help="") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 1: + parser.error("Takes no arguments") + + random.seed(10000) + genotypes = simulateSeqExpt(OPTIONS.bigPopSize, OPTIONS.smallPopSize, OPTIONS.nHetsPerSample) + printGenotypes(genotypes, open(args[0] + ".genotypes", 'w')) + printAFS(genotypes, open(args[0] + ".afs", 'w')) + +class Variant: + def __init__(self, id, trueAC, trueAN): + self.id = "%d.%d" % ( trueAC, id ) + self.trueAC = trueAC + self.trueAN = trueAN + + q = self.af() + p = 1 - q + self.hw = [p * p, 2 * p * q, q * q] + + def __str__(self): + return "[V %s ac=%d an=%d af=%.2f]" % (self.id, self.trueAC, self.trueAN, self.af()) + __repr__ = __str__ + + def af(self): + return self.trueAC / (1.0*self.trueAN) + + def hwe(self): # returns phomref, phet, phomvar + return self.hw + +def simulateSeqExpt(bigPopSize, smallPopSize, nHetsPerSample): + """Master runner function""" + trueAFS = makeAFS(bigPopSize, nHetsPerSample) + + variants = AFStoVariants(trueAFS, bigPopSize) + + # returns a list of variants per sample + genotypes = genotypeSamples(variants, smallPopSize) + + return genotypes + +def makeAFS(nSamples, nHetsPerSample): + """Generates allele frequency spectrum counts for nsamples and nHetsPerSample from neutral expectation""" + nTotalVariants = nHetsPerSample * sum([1 / (1.0*i) for i in range(1, nSamples * 2 + 1)]) + AFSCounts = [int(round(nHetsPerSample / (1.0*i))) for i in range(1, nSamples * 2 + 1)] + print AFSCounts + print nTotalVariants + print sum(AFSCounts) + return AFSCounts + +def AFStoVariants(trueAFS, bigPopSize): + """Converts an allele frequency spectrum to specific named Variant objects""" + variants = [] + + nChromosomes = 2 * bigPopSize + for ac in range(len(trueAFS)): + af = (1.0*ac) / nChromosomes + if OPTIONS.maxMAF == None or af <= OPTIONS.maxMAF: + for j in range(trueAFS[ac]): + v = Variant(j, ac+1, nChromosomes) + #print ac, j, v + variants.append(v) + else: + print 'Skipping AC', ac, ' / ', nChromosomes, 'beyond max MAF', OPTIONS.maxMAF + + return variants + +# returns a list of variants per sample +def genotypeSamples(variants, nSamples): + """Given a list of variants, generates nSamples genotypes""" + return [genotypeSample(samplei, variants) for samplei in range(nSamples)] + +def genotypeSample(id, variants): + """Generate a single set of genotypes for a single using the list of variants""" + print 'Genotyping sample', id + genotypes = [] + for v in variants: + pHomRef, pHet, pHomVar = v.hwe() + r = random.random() + if r > pHomRef: # are we not reference? + if r > pHomRef + pHet: # are we hom var? + count = 2 + else: + count = 1 + #print (r, v.af(), pHomRef, pHet, pHomVar, count) + genotypes.append([v, count]) + + return genotypes + +def printGenotypes(sampleGenotypes, out): + print >> out, "\t".join(["sample", "id", "ac", "an", "g"]) + for sample, i in izip(sampleGenotypes, count(len(sampleGenotypes))): + for v, g in sample: + print >> out, "\t".join(map(str, [i-1, v.id, v.trueAC, v.trueAN, g])) + +def printAFS(sampleGenotypes, out): + print >> out, "\t".join(["id", "true.ac", "true.an", "true.af", "small.ac", "small.an", "small.af"]) + counts = dict() + + smallAN = len(sampleGenotypes) * 2 + for sample in sampleGenotypes: + for v, g in sample: + if v not in counts: counts[v] = 0 + counts[v] = counts[v] + g + + + for v, smallAC in counts.iteritems(): + print >> out, "\t".join(map(str, [v.id, v.trueAC, v.trueAN, v.af(), smallAC, smallAN, smallAC / (1.0*smallAN)])) + + +if __name__ == "__main__": + main()