Much improved python analysis routines, as well as easier / more correct merging utility. Better R scripts, which now close recalibration data by the confidence of the quality score itself

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1081 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-24 01:12:35 +00:00 · 2009-06-24 01:12:35 +00:00 · caf5aef0f8
parent ef546868bf
commit caf5aef0f8
4 changed files with 63 additions and 45 deletions
--- a/python/MergeBAMBatch.py
+++ b/python/MergeBAMBatch.py
@ -63,7 +63,7 @@ if __name__ == "__main__":
                jobid = None
                if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()):
-                    output = spec.getMergedBase() + '.stdout'
+                    output = spec.getMergedBase()
                    cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools)
                    #print cmd
                    jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
--- a/python/MergeBAMsUtils.py
+++ b/python/MergeBAMsUtils.py
@ -59,7 +59,7 @@ class MergeFilesSpec:
    def filename(self):
        if self.merged_filename_base <> None:
-            return self.merged_filename_base + '.' + self.group()
+            return self.merged_filename_base + '.'.join([self.group()])
        else:
            return self.group()
--- a/python/analyzeRecalQuals.py
+++ b/python/analyzeRecalQuals.py
@ -10,14 +10,17 @@ from itertools import *
 import math
 import operator
 MAX_QUAL_SCORE = 50
 def phredQScore( nMismatches, nBases ):
    #print 'phredQScore', nMismatches, nBases
    if nMismatches == 0:
-        return 40
+        return MAX_QUAL_SCORE
    elif nBases == 0:
        return 0
    else:
-        return -10 * math.log10(float(nMismatches) / nBases)
+        return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE)
        return r
 def phredScore2ErrorProp(qual):
@ -30,16 +33,16 @@ def tryByInt(s):
    except:
        return s        
-expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',')
+expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
-defaultValues = '0,**,0,0,0,0,0'.split(',')
+defaultValues = '0,0,0,**,0,0,0'.split(',')
 class RecalData(dict):
    def __init__(self):
        self.parse(expectedHeader, defaultValues)
    def parse(self, header, data):
-        # rg,dn,Qrep,pos,NBases,MMismatches,Qemp
+        # rg,pos,Qrep,dn,NBases,MMismatches,Qemp
-        types = [str, str, int, tryByInt, int, int, int]
+        types = [str, tryByInt, int, str, int, int, int]
        for head, expected, datum, type in zip(header, expectedHeader, data, types):
            if head <> expected:
                raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum))
@ -59,32 +62,41 @@ class RecalData(dict):
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def qEmpirical(self): return self.Qemp
    def cycle(self): return self.pos
-    def nBases(self): return self.NBases
+    def getNBases(self): return self.nBases
-    def nMismatches(self): return self.MMismatches
+    def getNMismatches(self): return self.nMismatches
-    def nExpectedMismatches(self): return self.nBases() * phredScore2ErrorProp(self.qReported())
+    def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported())
    def qEmpirical(self): 
        #if OPTIONS.raw:
        return self.Qemp
        #else:
        #    r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1) 
        #    #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp
        #    return r
    def combine(self, moreData):
        # grab useful info
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
-            self.NBases += datum.nBases()
+            self.nBases += datum.getNBases()
-            self.MMismatches += datum.nMismatches()
+            self.nMismatches += datum.getNMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
-        self.Qrep = phredQScore(sumErrors, self.nBases())
+        self.Qrep = phredQScore(sumErrors, self.getNBases())
        #print 'self.Qrep is now', self.Qrep
        return self
    def updateQemp(self):
-        newQemp = phredQScore( self.nMismatches(), self.nBases() )
+        newQemp = phredQScore( self.getNMismatches(), self.getNBases() )
        #print 'Updating qEmp', self.Qemp, newQemp
        self.Qemp = newQemp
        return newQemp
    def __str__(self):
-        return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches())
+        return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches())
    def __repr__(self):
        return self.__str__()
@ -157,7 +169,7 @@ def countQsOfMinQuality(thres, data):
    return qDeclared, qDeclaredTrue
 def medianQreported(jaffe, allBases):
-    i, ignore = medianByCounts(map( RecalData.nBases, jaffe ))
+    i, ignore = medianByCounts(map( RecalData.getNBases, jaffe ))
    return jaffe[i].qReported()
 def medianByCounts(counts):
@ -170,7 +182,7 @@ def medianByCounts(counts):
            return i, counts[i]
 def modeQreported(jaffe, allBases):
-    ordered = sorted(jaffe, key=RecalData.nBases, reverse=True )
+    ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True )
    #print ordered
    return ordered[0].qReported()
@ -202,21 +214,21 @@ def rmse(reportedList, empiricalList, counts):
        diff = reported - empirical
        inc = count * diff * diff
        sum += inc
        #print reported, empirical, sum, inc, count, diff
    #print sum, math.sqrt(sum)
    return math.sqrt(sum)
 def stdevQReported(jaffe, allBases):
    mean = averageQreported(jaffe, allBases)
-    return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.nBases, jaffe ), mean)
+    return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean)
 def coeffOfVariationQreported(jaffe, allBases):
    mean = averageQreported(jaffe, allBases)
    stdev = stdevQReported(jaffe, allBases)
    return stdev / mean
 #    o("variance_Qreported                           %2.2f" % varianceQreported(jaffe))
 def rmseJaffe(jaffe):
-    return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.nBases, jaffe ) )
+    return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) )
 def basicQualScoreStats(readGroup, data, output ):
    def o(s):
@ -227,8 +239,8 @@ def basicQualScoreStats(readGroup, data, output ):
    o("read_group                                   %s" % rg)
    #o("number_of_cycles %d" % 0)
    #o("maximum_reported_quality_score %d" % 0)
-    o("number_of_bases                              %d" % allBases.nBases())
+    o("number_of_bases                              %d" % allBases.getNBases())
-    o("number_of_mismatching_bases                  %d" % allBases.nMismatches())
+    o("number_of_mismatching_bases                  %d" % allBases.getNMismatches())
    o("lane_wide_Qreported                          %2.2f" % allBases.qReported())
    o("lane_wide_Qempirical                         %2.2f" % allBases.qEmpirical())
    o("lane_wide_Qempirical_minus_Qreported         %2.2f" % (allBases.qEmpirical()-allBases.qReported()))
@ -241,21 +253,22 @@ def basicQualScoreStats(readGroup, data, output ):
    o("stdev_Qreported                              %2.2f" % stdevQReported(jaffe, allBases))
    o("coeff_of_variation_Qreported                 %2.2f" % coeffOfVariationQreported(jaffe, allBases))
-    o("RMSE(qReported,qEmpirical)                   %2.2f" % rmseJaffe(jaffe))
+    o("RMSE_qReported_qEmpirical                    %2.2f" % rmseJaffe(jaffe))
    for thres in [20, 25, 30]:
        qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe)
-        o("number_of_q%d_bases                          %d" % (thres, qDeclared.nBases()))
+        o("number_of_q%d+_bases                         %d" % (thres, qDeclared.getNBases()))
-        o("percent_of_q%d_bases                         %2.2f" % (thres, 100 * qDeclared.nBases() / float(allBases.nBases())))
+        o("percent_of_q%d+_bases                        %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases())))
-        o("number_of_q%d_bases_with_qemp_above_q%d      %d" % (thres, thres, qDeclaredTrue.nBases()))
+        o("number_of_q%d+_bases_with_qemp_above_q%d     %d" % (thres, thres, qDeclaredTrue.getNBases()))
-        o("percent_of_q%d_bases_with_qemp_above_q%d     %2.2f" % (thres, thres, 100 * qDeclaredTrue.nBases() / float(allBases.nBases())))
+        o("percent_of_q%d+_bases_with_qemp_above_q%d    %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases())))
 def qDiffByCycle(readGroup, allData, output):
    #print '#### qDiffByCycle ####'
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Cycle    Qreported   Qempirical     Qempirical_Qreported     nMismatches     nBases'
    for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
        datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        diff = datum.qEmpirical() - datum.qReported()
-        print >> output, "%s   %2.2f  %2.2f   %2.2f     %12d    %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
+        print >> output, "%s   %2.2f  %2.2f   %2.2f     %12d    %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
 def qDiffByDinuc(readGroup, allData, output):
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
@ -263,7 +276,7 @@ def qDiffByDinuc(readGroup, allData, output):
    for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
        datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        diff = datum.qEmpirical() - datum.qReported()
-        print >> output, "%s   %2.2f  %2.2f   %2.2f     %12d    %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
+        print >> output, "%s   %2.2f  %2.2f   %2.2f     %12d    %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases())
 def qReportedVsqEmpiricalStream(readGroup, data):
    for key, datum in groupRecalData(data, key=RecalData.qReported):
@ -273,7 +286,9 @@ def qReportedVsqEmpiricalStream(readGroup, data):
 def qReportedVsqEmpirical(readGroup, allData, output):
    print >> output, 'Qreported    Qempirical   nMismatches     nBases'
    for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
-        print >> output, "%2.2f  %2.2f   %12d    %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases())
+        #if datum.qReported() > 35:
        #    print datum
        print >> output, "%2.2f  %2.2f   %12d    %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
 def analyzeRawData(rawDataFile):
    for readGroup, data in rawDataByReadGroup(rawDataFile):
@ -303,6 +318,7 @@ def analyzeFiles(files):
    #print 'analyzeFiles', files
    Rscript = config.getOption('R', 'Rscript', 'input_file')
    for file in files:
        print 'Analyzing file', file 
        plotter = getPlotterForFile(file)
        if plotter <> None:
            cmd = ' '.join([Rscript, plotter, file])
@ -328,6 +344,9 @@ def main():
    parser.add_option("", "--dry", dest="dry",
                        action='store_true', default=False,
                        help="If provided, nothing actually gets run, just a dry run")
    #parser.add_option("-r", "--raw", dest="raw",
    #                    action='store_true', default=False,
    #                    help="If provided, analyze data w.r.t. the raw empirical qulaity scores # mmismatches / # bases, as opposed to the Yates correction of +1 to each")
    parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
                        action="append", type="string", default=[],
                        help="If provided, only the provided read groups will be analyzed")                        
--- a/python/easyRecalQuals.py
+++ b/python/easyRecalQuals.py
@ -12,9 +12,9 @@ if __name__ == "__main__":
    parser.add_option("-A", "--args", dest="args",
                        type="string", default="",
                        help="arguments to GATK")
-    parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs",
+    parser.add_option("-m", "--mode", dest="RecalMode",
                        type="string", default="",
-                        help="arguments to GATK")
+                        help="Mode argument to provide to table recalibrator")
    parser.add_option("-q", "--farm", dest="farmQueue",
                        type="string", default=None,
                        help="Farm queue to send processing jobs to")
@ -45,37 +45,36 @@ if __name__ == "__main__":
    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'
-    initDataFile = covariateInitial + '.raw_data.csv'
+    initDataFile = covariateInitial + '.recal_data.csv'
    covariateRecal = covariateRoot +  '.recal'
-    recalDataFile = covariateRecal + '.raw_data.csv'
+    recalDataFile = covariateRecal + '.recal_data.csv'
    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)
-    def covariateCmd(bam, outputDir, ignoreAdds):
+    def covariateCmd(bam, outputDir):
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        if not ignoreAdds:
            add += " " + OPTIONS.CovariateArgs
        return config.gatkCmd('CountCovariates') + add 
    def recalibrateCmd(inputBAM, dataFile, outputBAM):
-        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM)
+        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
-    def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False):
+    def runCovariateCmd(inputBAM, dataFile, dir, jobid):
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
-            cmd = covariateCmd(inputBAM, dir, ignoreAdds)
+            cmd = covariateCmd(inputBAM, dir)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
    #
    # Actually do some work here
    #
    jobid = None
-    jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False)
+    if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): 
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
    if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
        cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
        jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
-    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True)
+    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)