From caf5aef0f81fc4fd33a7af98ebca3c24ea3eac15 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 24 Jun 2009 01:12:35 +0000 Subject: [PATCH] Much improved python analysis routines, as well as easier / more correct merging utility. Better R scripts, which now close recalibration data by the confidence of the quality score itself git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1081 348d0f76-0448-11de-a6fe-93d51630548a --- python/MergeBAMBatch.py | 2 +- python/MergeBAMsUtils.py | 2 +- python/analyzeRecalQuals.py | 81 +++++++++++++++++++++++-------------- python/easyRecalQuals.py | 23 +++++------ 4 files changed, 63 insertions(+), 45 deletions(-) diff --git a/python/MergeBAMBatch.py b/python/MergeBAMBatch.py index 19a49cd54..0bcc3cf64 100755 --- a/python/MergeBAMBatch.py +++ b/python/MergeBAMBatch.py @@ -63,7 +63,7 @@ if __name__ == "__main__": jobid = None if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAM()): - output = spec.getMergedBase() + '.stdout' + output = spec.getMergedBase() cmd = spec.mergeCmd(OPTIONS.mergeBin, useSamtools = OPTIONS.useSamtools) #print cmd jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry) diff --git a/python/MergeBAMsUtils.py b/python/MergeBAMsUtils.py index 6aa5066d7..d9004568b 100755 --- a/python/MergeBAMsUtils.py +++ b/python/MergeBAMsUtils.py @@ -59,7 +59,7 @@ class MergeFilesSpec: def filename(self): if self.merged_filename_base <> None: - return self.merged_filename_base + '.' + self.group() + return self.merged_filename_base + '.'.join([self.group()]) else: return self.group() diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py index 0a2f312d4..f75d8c414 100755 --- a/python/analyzeRecalQuals.py +++ b/python/analyzeRecalQuals.py @@ -10,14 +10,17 @@ from itertools import * import math import operator +MAX_QUAL_SCORE = 50 + def phredQScore( nMismatches, nBases ): #print 'phredQScore', nMismatches, nBases if nMismatches == 0: - return 40 + return MAX_QUAL_SCORE elif nBases == 0: return 0 else: - return -10 * math.log10(float(nMismatches) / nBases) + return min(-10 * math.log10(float(nMismatches) / nBases), MAX_QUAL_SCORE) + return r def phredScore2ErrorProp(qual): @@ -30,16 +33,16 @@ def tryByInt(s): except: return s -expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',') -defaultValues = '0,**,0,0,0,0,0'.split(',') +expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',') +defaultValues = '0,0,0,**,0,0,0'.split(',') class RecalData(dict): def __init__(self): self.parse(expectedHeader, defaultValues) def parse(self, header, data): - # rg,dn,Qrep,pos,NBases,MMismatches,Qemp - types = [str, str, int, tryByInt, int, int, int] + # rg,pos,Qrep,dn,NBases,MMismatches,Qemp + types = [str, tryByInt, int, str, int, int, int] for head, expected, datum, type in zip(header, expectedHeader, data, types): if head <> expected: raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum)) @@ -59,32 +62,41 @@ class RecalData(dict): def readGroup(self): return self.rg def dinuc(self): return self.dn def qReported(self): return self.Qrep - def qEmpirical(self): return self.Qemp def cycle(self): return self.pos - def nBases(self): return self.NBases - def nMismatches(self): return self.MMismatches - def nExpectedMismatches(self): return self.nBases() * phredScore2ErrorProp(self.qReported()) + def getNBases(self): return self.nBases + def getNMismatches(self): return self.nMismatches + def nExpectedMismatches(self): return self.getNBases() * phredScore2ErrorProp(self.qReported()) + + + def qEmpirical(self): + #if OPTIONS.raw: + return self.Qemp + #else: + # r = phredQScore(self.getNMismatches() + 1, self.getNBases() + 1) + # #print 'Using yates corrected Q scores', self.getNMismatches(), self.getNBases(), self.getNMismatches() + 1, self.getNBases() + 1, self.Qemp, r, r - self.Qemp + # return r + def combine(self, moreData): # grab useful info sumErrors = self.nExpectedMismatches() for datum in moreData: - self.NBases += datum.nBases() - self.MMismatches += datum.nMismatches() + self.nBases += datum.getNBases() + self.nMismatches += datum.getNMismatches() sumErrors += datum.nExpectedMismatches() self.updateQemp() - self.Qrep = phredQScore(sumErrors, self.nBases()) + self.Qrep = phredQScore(sumErrors, self.getNBases()) #print 'self.Qrep is now', self.Qrep return self def updateQemp(self): - newQemp = phredQScore( self.nMismatches(), self.nBases() ) + newQemp = phredQScore( self.getNMismatches(), self.getNBases() ) #print 'Updating qEmp', self.Qemp, newQemp self.Qemp = newQemp return newQemp def __str__(self): - return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches()) + return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.getNBases(), self.getNMismatches()) def __repr__(self): return self.__str__() @@ -157,7 +169,7 @@ def countQsOfMinQuality(thres, data): return qDeclared, qDeclaredTrue def medianQreported(jaffe, allBases): - i, ignore = medianByCounts(map( RecalData.nBases, jaffe )) + i, ignore = medianByCounts(map( RecalData.getNBases, jaffe )) return jaffe[i].qReported() def medianByCounts(counts): @@ -170,7 +182,7 @@ def medianByCounts(counts): return i, counts[i] def modeQreported(jaffe, allBases): - ordered = sorted(jaffe, key=RecalData.nBases, reverse=True ) + ordered = sorted(jaffe, key=RecalData.getNBases, reverse=True ) #print ordered return ordered[0].qReported() @@ -202,21 +214,21 @@ def rmse(reportedList, empiricalList, counts): diff = reported - empirical inc = count * diff * diff sum += inc + #print reported, empirical, sum, inc, count, diff + #print sum, math.sqrt(sum) return math.sqrt(sum) def stdevQReported(jaffe, allBases): mean = averageQreported(jaffe, allBases) - return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.nBases, jaffe ), mean) + return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.getNBases, jaffe ), mean) def coeffOfVariationQreported(jaffe, allBases): mean = averageQreported(jaffe, allBases) stdev = stdevQReported(jaffe, allBases) return stdev / mean -# o("variance_Qreported %2.2f" % varianceQreported(jaffe)) - def rmseJaffe(jaffe): - return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.nBases, jaffe ) ) + return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.getNBases, jaffe ) ) def basicQualScoreStats(readGroup, data, output ): def o(s): @@ -227,8 +239,8 @@ def basicQualScoreStats(readGroup, data, output ): o("read_group %s" % rg) #o("number_of_cycles %d" % 0) #o("maximum_reported_quality_score %d" % 0) - o("number_of_bases %d" % allBases.nBases()) - o("number_of_mismatching_bases %d" % allBases.nMismatches()) + o("number_of_bases %d" % allBases.getNBases()) + o("number_of_mismatching_bases %d" % allBases.getNMismatches()) o("lane_wide_Qreported %2.2f" % allBases.qReported()) o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical()) o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported())) @@ -241,21 +253,22 @@ def basicQualScoreStats(readGroup, data, output ): o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases)) o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases)) - o("RMSE(qReported,qEmpirical) %2.2f" % rmseJaffe(jaffe)) + o("RMSE_qReported_qEmpirical %2.2f" % rmseJaffe(jaffe)) for thres in [20, 25, 30]: qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe) - o("number_of_q%d_bases %d" % (thres, qDeclared.nBases())) - o("percent_of_q%d_bases %2.2f" % (thres, 100 * qDeclared.nBases() / float(allBases.nBases()))) - o("number_of_q%d_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.nBases())) - o("percent_of_q%d_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.nBases() / float(allBases.nBases()))) + o("number_of_q%d+_bases %d" % (thres, qDeclared.getNBases())) + o("percent_of_q%d+_bases %2.2f" % (thres, 100 * qDeclared.getNBases() / float(allBases.getNBases()))) + o("number_of_q%d+_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.getNBases())) + o("percent_of_q%d+_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.getNBases() / float(allBases.getNBases()))) def qDiffByCycle(readGroup, allData, output): + #print '#### qDiffByCycle ####' print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases' for cycle, datum in groupRecalData(allData, key=RecalData.cycle): datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle]) diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) + print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) def qDiffByDinuc(readGroup, allData, output): print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' @@ -263,7 +276,7 @@ def qDiffByDinuc(readGroup, allData, output): for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc): datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*']) diff = datum.qEmpirical() - datum.qReported() - print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) + print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.getNMismatches(), datum.getNBases()) def qReportedVsqEmpiricalStream(readGroup, data): for key, datum in groupRecalData(data, key=RecalData.qReported): @@ -273,7 +286,9 @@ def qReportedVsqEmpiricalStream(readGroup, data): def qReportedVsqEmpirical(readGroup, allData, output): print >> output, 'Qreported Qempirical nMismatches nBases' for key, datum in qReportedVsqEmpiricalStream(readGroup, allData): - print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases()) + #if datum.qReported() > 35: + # print datum + print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases()) def analyzeRawData(rawDataFile): for readGroup, data in rawDataByReadGroup(rawDataFile): @@ -303,6 +318,7 @@ def analyzeFiles(files): #print 'analyzeFiles', files Rscript = config.getOption('R', 'Rscript', 'input_file') for file in files: + print 'Analyzing file', file plotter = getPlotterForFile(file) if plotter <> None: cmd = ' '.join([Rscript, plotter, file]) @@ -328,6 +344,9 @@ def main(): parser.add_option("", "--dry", dest="dry", action='store_true', default=False, help="If provided, nothing actually gets run, just a dry run") + #parser.add_option("-r", "--raw", dest="raw", + # action='store_true', default=False, + # help="If provided, analyze data w.r.t. the raw empirical qulaity scores # mmismatches / # bases, as opposed to the Yates correction of +1 to each") parser.add_option("-g", "--readGroup", dest="selectedReadGroups", action="append", type="string", default=[], help="If provided, only the provided read groups will be analyzed") diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py index cba63d1a1..62f7c27c3 100755 --- a/python/easyRecalQuals.py +++ b/python/easyRecalQuals.py @@ -12,9 +12,9 @@ if __name__ == "__main__": parser.add_option("-A", "--args", dest="args", type="string", default="", help="arguments to GATK") - parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs", + parser.add_option("-m", "--mode", dest="RecalMode", type="string", default="", - help="arguments to GATK") + help="Mode argument to provide to table recalibrator") parser.add_option("-q", "--farm", dest="farmQueue", type="string", default=None, help="Farm queue to send processing jobs to") @@ -45,37 +45,36 @@ if __name__ == "__main__": covariateRoot = os.path.join(OPTIONS.scratchDir, rootname) covariateInitial = covariateRoot + '.init' - initDataFile = covariateInitial + '.raw_data.csv' + initDataFile = covariateInitial + '.recal_data.csv' covariateRecal = covariateRoot + '.recal' - recalDataFile = covariateRecal + '.raw_data.csv' + recalDataFile = covariateRecal + '.recal_data.csv' if not os.path.exists(OPTIONS.scratchDir): os.mkdir(OPTIONS.scratchDir) - def covariateCmd(bam, outputDir, ignoreAdds): + def covariateCmd(bam, outputDir): add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir) - if not ignoreAdds: - add += " " + OPTIONS.CovariateArgs return config.gatkCmd('CountCovariates') + add def recalibrateCmd(inputBAM, dataFile, outputBAM): - return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM) + return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode) - def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False): + def runCovariateCmd(inputBAM, dataFile, dir, jobid): if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile): - cmd = covariateCmd(inputBAM, dir, ignoreAdds) + cmd = covariateCmd(inputBAM, dir) return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) # # Actually do some work here # jobid = None - jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False) + if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): + jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid) if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM): cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True) + jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid) \ No newline at end of file