gatk-3.8/python/picard_utils.py

import farm_commands
import os.path
import sys
from optparse import OptionParser
import string
import re
import glob
import unittest
import itertools

#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"]
#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637']

lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"]
idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"]
ids = dict(zip(lanes, idsList))
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
analysis = "CombineDuplicates"

MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'

def unique(l):
    return list(set(l))

def genotypes2heterozygosity(genotypes, nIndividuals = -1):
    def isHET(genotype):
        return genotype[0] <> genotype[1]

    if nIndividuals == -1:
        n = len(genotypes)
    else:
        n = nIndividuals

    hets = filter( isHET, genotypes )
    nhets = len(hets)
    # print genotypes, ' => hets', hets
    return [nhets / (1.0*n), nhets, n]

def genotypes2allelefrequencies(ref, genotypes, nIndividuals = -1):
    if nIndividuals == -1:
        n = len(genotypes)
    else:
        n = nIndividuals

    alleles = ''.join(genotypes)
    nChroms = 2 * n
    nCalledChroms = 2 * len(genotypes)
    nMissingChroms = nChroms - nCalledChroms
    nRefChroms = alleles.count(ref) + nMissingChroms
    nAltChroms = nChroms - nRefChroms
    p = float(nRefChroms) / nChroms
    q = float(nAltChroms) / nChroms

    assert p + q == 1

    return [p, q, n]

class PicardSNP:
    def __init__( self, loc, ref, polymorphism, heterozygosity, allelefrequencies, genotypes):
        self.loc = loc
        self.ref = ref
        self.polymorphism = polymorphism
        self.heterozygosity = heterozygosity
        self.nIndividuals = allelefrequencies[2]
        self.allelefrequencies = allelefrequencies
        self.genotypes = genotypes

    def refGenotype(self):
        return self.ref + self.ref

    def hetGenotype(self):
        return self.ref + self.alt()

    def homVarGenotype(self):
        return self.alt() + self.alt()

    def alt(self):
        return self.polymorphism[1]

    def het(self):
        return self.heterozygosity[0]

    def p(self):
        return self.allelefrequencies[0]

    def q(self):
        return self.allelefrequencies[1]

    def countGenotype(self, genotype):
        r = len(filter( lambda x: x == genotype, self.genotypes ))
        #print 'countGenotype', genotype, self.genotypes, r
        return r

    def nRefGenotypes(self):
        return self.nIndividuals - self.nHetGenotypes() - self.nHomVarGenotypes()

    def nHetGenotypes(self):
        return self.countGenotype(self.hetGenotype())

    def nHomVarGenotypes(self):
        return self.countGenotype(self.homVarGenotype())


    def __str__(self):
        return '%s %s %s %s %s %s' % ( self.loc, self.ref, str(self.polymorphism), str(self.het()), str(self.allelefrequencies), str(self.genotypes))

def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
    #print 'geliCallsAtSite', geliCallsAtSite
    loc = geliCallsAtSite[0]
    #print loc
    refBases = map( lambda call: call[2], geliCallsAtSite[1] )
    refBase = refBases[0]
    #print 'refBases', refBases

    genotypes = map( lambda call: ''.join(sorted(call[5])), geliCallsAtSite[1] )
    allBases = unique(''.join(genotypes) + refBase)
    #print 'All bases => ', allBases, genotypes

    if len(allBases) > 2:
        print '*** WARNING, tri-state allele [ref=%s, all bases observed = %s] discovered at %s, ignoring the call' % ( refBase, ''.join(allBases), loc )
        return None

    #print 'genotypes', genotypes
    polymorphism = unique(list(refBase + genotypes[0]))
    if polymorphism[0] <> refBase: polymorphism.reverse()

    #print 'polymorphism', polymorphism
    genotype  = list(geliCallsAtSite[1][0][5])

    return PicardSNP(loc, refBase, polymorphism, genotypes2heterozygosity(genotypes, nIndividuals), genotypes2allelefrequencies(refBase, genotypes, nIndividuals), genotypes)

    #return '%s   %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))

def call2loc(call):
    return call[0] + ':' + call[1]

def aggregateGeliCalls( sortedGeliCalls ):
    #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
    return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]

def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
    if type(inputFiles) <> list:
        inputFiles = list(inputFiles)
    return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
    #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))

def getPicardPath(lane, picardRoot = '/seq/picard/'):
    flowcell, laneNo = lane.split('.')
    filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*')
    dirs = glob.glob(filePat)
    print dirs
    if len(dirs) > 1:
        system.exit("Bad lane -- too many directories matching pattern " + filePat)
    return dirs[0]

def getReferenceGenotypeFileFromConcordanceFile(concordFile):
    # REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli
    p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)')
    for line in open(concordFile):
        match = p.search(line)
        print 'Match is', line, match
        if match <> None:
            return match.group(1)
    return None

def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
    return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)

def concord(options, geli, output, genotypeFile):
    return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )

def readPicardConcordance(file):
    p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE')
# CATEGORY    OBSERVATIONS        AGREE   DISAGREE        PCT_CONCORDANCE
# HOMOZYGOUS_REFERENCE    853     853     0       1
# HETEROZYGOUS    416     413     3  0.992788
# HOMOZYGOUS_NON_REFERENCE        235     231     4       0.982979
    types = [str, int, int, int, float]
    def parse1(line):
        return [f(x) for f, x in zip(types, line.split())]
    data = [parse1(line) for line in open(file) if p.match(line) <> None]
    return data

def splitPath(geli):
    root, filename = os.path.split(geli)
    s = filename.split('.')
    flowcellDotlane = '.'.join(s[0:2])
    ext = '.'.join(s[2:])
    return [root, flowcellDotlane, ext]

def read_dbsnp(dbsnp_matches):
    next = False
    for line in open(dbsnp_matches):
        s = line.split()
        if next:
            return s
        if len(s) > 0 and s[0] == "TOTAL_SNPS":
            next = True
    return []

# ------------------------------------------------------------------------------------------
#
# Unit testing!
#
# ------------------------------------------------------------------------------------------
class TestPicardUnils(unittest.TestCase):
    def setUp(self):
        import cStringIO
        dataString = """chr1    1105366 T       52      99      CT      10.559975       10.559975               -117.68 -93.107178      -116.616493     -45.536842      -88.591728      -92.043671      -20.964022      -116.014435     -44.473339      -31.523996
chr1    1105411 G       22      99      AG      12.484722       12.484722               -23.995817      -27.909206      -10.875731      -27.909206      -46.579994      -29.546518      -46.579994      -23.360453      -29.546518      -46.579994
chr1    1105411 G       29      99      AG      12.033216       12.033216               -30.641142      -34.376297      -14.483982      -35.457623      -53.197636      -32.525024      -53.498665      -26.517199      -33.606354      -54.579994
chr1    1105857 G       6       99      AG      7.442399        2.096584                -7.55462        -9.3608 -5.458036       -9.3608 -20.279993      -16.37723       -20.279993      -12.900434      -16.37723       -20.279993
chr1    1105857 G       7       99      AG      10.889011       1.795554                -7.406977       -9.514187       -5.611423       -9.514187       -23.879993      -19.97723       -23.879993      -16.500435      -19.97723       -23.879993
chr1    1106094 T       20      99      CT      6.747106        6.747106                -56.979992      -43.143734      -56.806652      -23.699236      -41.036522      -42.97039       -9.862975       -56.505623      -23.525892      -16.610081
chr1    1110294 G       42      99      AG      21.076984       21.076984               -44.285015      -49.702267      -17.869579      -49.831242      -80.276649      -48.442669      -80.404335      -38.946564      -48.571644      -80.405624
chr1    1111204 C       26      99      CT      11.364679       11.364679               -55.479992      -31.040928      -55.479992      -36.424099      -23.349712      -31.040928      -11.985033      -55.479992      -36.424099      -32.811741
chr1    1111204 C       29      99      TT      34.740601       3.890135                -52.282055      -48.646442      -52.704597      -17.954794      -44.565525      -48.464859      -13.715057      -52.100475      -17.773212      -9.824923
chr1    1111204 C       31      99      CT      18.784479       18.784479               -71.079994      -39.870823      -71.079994      -44.303268      -31.878578      -39.870823      -13.094099      -71.079994      -44.303268      -39.48679
"""
        dataFile = cStringIO.StringIO(dataString)
        self.nIndividuals = 10

        self.genotypesSets = aggregateGeliCalls(map( string.split, dataFile.readlines() ) )
        self.genotypes = map(lambda x: aggregatedGeliCalls2SNP(x, self.nIndividuals), self.genotypesSets )
        self.locs = ["chr1:1105366", "chr1:1105411", "chr1:1105857", "chr1:1106094", "chr1:1110294", "chr1:1111204"]
        self.nhets = [1, 2, 2, 1, 1, 2]
        self.altAlleles = [1, 2, 2, 1, 1, 4]

        self.aaf = map( lambda x: (1.0*x) / (2 * self.nIndividuals), self.altAlleles )
        self.hets = map( lambda x: (1.0*x) / self.nIndividuals, self.nhets )

    def testGenotypesSize(self):
        self.assertEqual(len(self.genotypesSets), 6)

    def testGenotypes2Het(self):
        print 'testGenotypes2Het...'
        self.assertEqual(genotypes2heterozygosity(['AT']), [1, 1, 1])
        self.assertEqual(genotypes2heterozygosity(['AA']), [0, 0, 1])
        self.assertEqual(genotypes2heterozygosity(['TT']), [0, 0, 1])
        self.assertEqual(genotypes2heterozygosity(['AT', 'AT']), [1, 2, 2])
        self.assertEqual(genotypes2heterozygosity(['AA', 'AA']), [0, 0, 2])
        self.assertEqual(genotypes2heterozygosity(['AT', 'AA']), [0.5, 1, 2])
        self.assertEqual(genotypes2heterozygosity(['AT', 'TT']), [0.5, 1, 2])
        self.assertEqual(genotypes2heterozygosity(['AT', 'TT', 'AA']), [1.0/3, 1, 3])
        self.assertEqual(genotypes2heterozygosity(['AT', 'AT', 'AA']), [2.0/3, 2, 3])

        self.assertEqual(genotypes2heterozygosity(['AT', 'AT'], 10), [2.0/10, 2, 10])
        self.assertEqual(genotypes2heterozygosity(['AT', 'AA'], 10), [1.0/10, 1, 10])

    def testAlleleFreqs(self):
        print 'testAlleleFreqs...'
        self.assertEqual(genotypes2allelefrequencies('A', ['AT']), [0.5, 0.5, 1])
        self.assertEqual(genotypes2allelefrequencies('T', ['AT']), [0.5, 0.5, 1])
        self.assertEqual(genotypes2allelefrequencies('A', ['AA']), [1.0, 0.0, 1])
        self.assertEqual(genotypes2allelefrequencies('A', ['TT']), [0.0, 1.0, 1])

        self.assertEqual(genotypes2allelefrequencies('A', ['TT'], 2), [0.5, 0.5, 2])
        self.assertEqual(genotypes2allelefrequencies('A', ['AA'], 2), [1.0, 0.0, 2])
        self.assertEqual(genotypes2allelefrequencies('A', ['AT'], 2), [3.0/4, 1.0/4, 2])
        self.assertEqual(genotypes2allelefrequencies('A', ['AT'], 3), [5.0/6, 1.0/6, 3])
        self.assertEqual(genotypes2allelefrequencies('T', ['AT'], 3), [5.0/6, 1.0/6, 3])

        self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'AT'], 3), [4.0/6, 2.0/6, 3])
        self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'TT'], 3), [3.0/6, 3.0/6, 3])
        self.assertEqual(genotypes2allelefrequencies('A', ['AT', 'TT', 'AA']), [3.0/6, 3.0/6, 3])

    def testGenotypeSetLocs(self):
        for set, loc in zip(self.genotypesSets, self.locs):
            #print loc, set
            self.assertEqual(set[0], loc)

    def testGenotypeLocs(self):
        for genotype, loc in zip(self.genotypes, self.locs):
            self.assertEqual(genotype.loc, loc)

    def testGenotypeHets(self):
        print 'testGenotypeHets:'
        for genotype, het in zip(self.genotypes, self.hets):
            print ' => ', genotype, het
            self.assertEqual(genotype.het(), het)

    def testGenotypeAlleleFreqs(self):
        print 'testGenotypeAlleleFreqs:'
        for genotype, af in zip(self.genotypes, self.aaf):
            print ' => ', genotype, af
            self.assertEqual(genotype.allelefrequencies, [1 - af, af, self.nIndividuals])

    def testSplit(self):
        self.assertEqual(splitPath('/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/1/Solexa-8267/30GA9AAXX.1.observed_genotypes.geli'), ['/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/1/Solexa-8267', '30GA9AAXX.1', 'observed_genotypes.geli'])
        self.assertEqual(splitPath('/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/2/Solexa-8268/30GA9AAXX.2.observed_genotypes.geli'), ['/seq/picard/30GA9AAXX/C1-152_2008-10-23_2009-04-05/2/Solexa-8268', '30GA9AAXX.2', 'observed_genotypes.geli'])

if __name__ == '__main__':
    unittest.main()