From c9e6cb72e173e0f340d41857b272c00c41eab816 Mon Sep 17 00:00:00 2001 From: depristo Date: Sat, 20 Jun 2009 19:50:37 +0000 Subject: [PATCH] Major improvements to python analysis code -- now computes a host of statistics about quality scores from the recal_data.csv file emitted by countcovariates. Includes average Q scores, medians, modes, stdev, coefficients of variation, RSME, and % bases > q10, q20, q30. Can finally quantify and compare the improvement of quality score recalibration. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1064 348d0f76-0448-11de-a6fe-93d51630548a --- python/analyzeRecalQuals.py | 189 +++++++++++++++++++++++++++++------- python/easyRecalQuals.py | 66 ++++--------- python/gatkConfigParser.py | 10 +- 3 files changed, 179 insertions(+), 86 deletions(-) diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py index 18cd34415..0a2f312d4 100755 --- a/python/analyzeRecalQuals.py +++ b/python/analyzeRecalQuals.py @@ -8,6 +8,7 @@ from gatkConfigParser import * import re from itertools import * import math +import operator def phredQScore( nMismatches, nBases ): #print 'phredQScore', nMismatches, nBases @@ -23,6 +24,12 @@ def phredScore2ErrorProp(qual): #print 'phredScore2ErrorProp', qual return math.pow(10.0, float(qual) / -10.0) +def tryByInt(s): + try: + return int(s) + except: + return s + expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',') defaultValues = '0,**,0,0,0,0,0'.split(',') class RecalData(dict): @@ -32,7 +39,7 @@ class RecalData(dict): def parse(self, header, data): # rg,dn,Qrep,pos,NBases,MMismatches,Qemp - types = [str, str, int, int, int, int, int] + types = [str, str, int, tryByInt, int, int, int] for head, expected, datum, type in zip(header, expectedHeader, data, types): if head <> expected: raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum)) @@ -77,7 +84,9 @@ class RecalData(dict): return newQemp def __str__(self): - return "rg=%s cycle=%d dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d" % ( self.readGroup(), self.cycle(), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches()) + return "[rg=%s cycle=%s dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d]" % ( self.readGroup(), str(self.cycle()), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches()) + def __repr__(self): + return self.__str__() # def __init__(dinuc, Qrep, pos, nbases, nmismatches, qemp ): # self.dinuc = dinuc @@ -117,6 +126,7 @@ def analyzeReadGroup(readGroup, data, outputRoot): files = [] if OPTIONS.toStdout: + basicQualScoreStats(readGroup, data, sys.stdout ) qReportedVsqEmpirical(readGroup, data, sys.stdout ) qDiffByCycle(readGroup, data, sys.stdout) qDiffByDinuc(readGroup, data, sys.stdout) @@ -126,6 +136,8 @@ def analyzeReadGroup(readGroup, data, outputRoot): files.append(file) return file + with open(outputFile(".basic_info.dat"), 'w') as output: + basicQualScoreStats(readGroup, data, output ) with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output: qReportedVsqEmpirical(readGroup, data, output ) with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output: @@ -136,40 +148,114 @@ def analyzeReadGroup(readGroup, data, outputRoot): print 'Files', files return analyzeFiles(files) +def countQsOfMinQuality(thres, data): + """Returns RecalData lists for each of the following: + All quality score bins with qRep > thres, and all quality scores with qRep and qRemp > thres""" + qDeclared = RecalData().combine(filter(lambda x: x.qReported() > thres, data)) + qDeclaredTrue = RecalData().combine(filter(lambda x: x.qReported() > thres and x.qEmpirical() > thres, data)) + #print qDeclared + return qDeclared, qDeclaredTrue + +def medianQreported(jaffe, allBases): + i, ignore = medianByCounts(map( RecalData.nBases, jaffe )) + return jaffe[i].qReported() + +def medianByCounts(counts): + nTotal = lsum(counts) + sum = 0.0 + for i in range(len(counts)): + sum += counts[i] + if sum / nTotal > 0.5: + # The current datum contains the median + return i, counts[i] + +def modeQreported(jaffe, allBases): + ordered = sorted(jaffe, key=RecalData.nBases, reverse=True ) + #print ordered + return ordered[0].qReported() + +def averageQreported(jaffe, allBases): + # the average reported quality score is already calculated and stored as qRep! + return allBases.qReported() + +def lsum(inlist): + return reduce(operator.__add__, inlist, 0) + +def lsamplestdev (inlist, counts, mean): + """ + Returns the variance of the values in the passed list using + N for the denominator (i.e., DESCRIBES the sample variance only). + + Usage: lsamplevar(inlist)""" + n = lsum(counts) + sum = 0.0 + for item, count in zip(inlist, counts): + diff = item - mean + inc = count * diff * diff + #print item, count, mean, diff, diff*diff, inc + sum += inc + return math.sqrt(sum / float(n-1)) + +def rmse(reportedList, empiricalList, counts): + sum = 0.0 + for reported, empirical, count in zip(reportedList, empiricalList, counts): + diff = reported - empirical + inc = count * diff * diff + sum += inc + return math.sqrt(sum) + +def stdevQReported(jaffe, allBases): + mean = averageQreported(jaffe, allBases) + return lsamplestdev(map( RecalData.qReported, jaffe ), map( RecalData.nBases, jaffe ), mean) + +def coeffOfVariationQreported(jaffe, allBases): + mean = averageQreported(jaffe, allBases) + stdev = stdevQReported(jaffe, allBases) + return stdev / mean + +# o("variance_Qreported %2.2f" % varianceQreported(jaffe)) + +def rmseJaffe(jaffe): + return rmse( map( RecalData.qReported, jaffe ), map( RecalData.qEmpirical, jaffe ), map( RecalData.nBases, jaffe ) ) + +def basicQualScoreStats(readGroup, data, output ): + def o(s): + print >> output, s + # aggregate all the data into a single datum + rg, allBases = groupRecalData(data, key=RecalData.readGroup)[0] + #o(allBases) + o("read_group %s" % rg) + #o("number_of_cycles %d" % 0) + #o("maximum_reported_quality_score %d" % 0) + o("number_of_bases %d" % allBases.nBases()) + o("number_of_mismatching_bases %d" % allBases.nMismatches()) + o("lane_wide_Qreported %2.2f" % allBases.qReported()) + o("lane_wide_Qempirical %2.2f" % allBases.qEmpirical()) + o("lane_wide_Qempirical_minus_Qreported %2.2f" % (allBases.qEmpirical()-allBases.qReported())) + + jaffe = [datum for key, datum in qReportedVsqEmpiricalStream(readGroup, data)] + + o("median_Qreported %2.2f" % medianQreported(jaffe, allBases)) + o("mode_Qreported %2.2f" % modeQreported(jaffe, allBases)) + o("average_Qreported %2.2f" % averageQreported(jaffe, allBases)) + o("stdev_Qreported %2.2f" % stdevQReported(jaffe, allBases)) + o("coeff_of_variation_Qreported %2.2f" % coeffOfVariationQreported(jaffe, allBases)) + + o("RMSE(qReported,qEmpirical) %2.2f" % rmseJaffe(jaffe)) + for thres in [20, 25, 30]: + qDeclared, qDeclaredTrue = countQsOfMinQuality(thres, jaffe) + o("number_of_q%d_bases %d" % (thres, qDeclared.nBases())) + o("percent_of_q%d_bases %2.2f" % (thres, 100 * qDeclared.nBases() / float(allBases.nBases()))) + o("number_of_q%d_bases_with_qemp_above_q%d %d" % (thres, thres, qDeclaredTrue.nBases())) + o("percent_of_q%d_bases_with_qemp_above_q%d %2.2f" % (thres, thres, 100 * qDeclaredTrue.nBases() / float(allBases.nBases()))) + def qDiffByCycle(readGroup, allData, output): print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases' for cycle, datum in groupRecalData(allData, key=RecalData.cycle): datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle]) diff = datum.qEmpirical() - datum.qReported() - print >> output, "%d %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) - -# -# public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) { -# ArrayList ByCycle = new ArrayList(); -# ArrayList ByCycleReportedQ = new ArrayList(); -# file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); -# RecalData All = new RecalData(0,0,readGroup,""); -# MeanReportedQuality AllReported = new MeanReportedQuality(); -# for (int c=0; c < RecalData.NDINUCS; c++) { -# ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c))); -# ByCycleReportedQ.add(new MeanReportedQuality()); -# } -# -# for ( RecalData datum: getRecalData(readGroup) ) { -# int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); -# ByCycle.get(dinucIndex).inc(datum.N, datum.B); -# ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); -# All.inc(datum.N, datum.B); -# AllReported.inc(datum.qual, datum.N); -# } -# -# for (int c=0; c < RecalData.NDINUCS; c++) { -# double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); -# double reportedQual = ByCycleReportedQ.get(c).result(); -# file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); -# } -# } + print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) def qDiffByDinuc(readGroup, allData, output): print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' @@ -179,11 +265,15 @@ def qDiffByDinuc(readGroup, allData, output): diff = datum.qEmpirical() - datum.qReported() print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) +def qReportedVsqEmpiricalStream(readGroup, data): + for key, datum in groupRecalData(data, key=RecalData.qReported): + datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*']) + yield key, datum + def qReportedVsqEmpirical(readGroup, allData, output): print >> output, 'Qreported Qempirical nMismatches nBases' - for key, datum in groupRecalData(allData, key=RecalData.qReported): - datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*']) - print >> output, "%2d %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases()) + for key, datum in qReportedVsqEmpiricalStream(readGroup, allData): + print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases()) def analyzeRawData(rawDataFile): for readGroup, data in rawDataByReadGroup(rawDataFile): @@ -218,7 +308,8 @@ def analyzeFiles(files): cmd = ' '.join([Rscript, plotter, file]) farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) -if __name__ == "__main__": +def main(): + global config, OPTIONS usage = """usage: %prog -c config.cfg files*""" parser = OptionParser(usage=usage) @@ -250,4 +341,32 @@ if __name__ == "__main__": config = gatkConfigParser(OPTIONS.configs) - analyzeFiles(args) \ No newline at end of file + analyzeFiles(args) + +import unittest +class TestanalzyeRecalQuals(unittest.TestCase): + def setUp(self): + self.numbers = [0, 1, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6, 6] + self.numbersItems = [0, 1, 2, 3, 4, 5, 6] + self.numbersCounts = [1, 1, 2, 1, 3, 3, 2] + self.numbers_sum = 47 + self.numbers_mean = 3.615385 + self.numbers_mode = 4 + self.numbers_median = 4 + self.numbers_stdev = 1.894662 + self.numbers_var = 3.589744 + self.numbers_cov = self.numbers_stdev / self.numbers_mean + + def testSum(self): + self.assertEquals(self.numbers_sum, lsum(self.numbers)) + self.assertEquals(0, lsum(self.numbers[0:0])) + self.assertEquals(1, lsum(self.numbers[0:2])) + self.assertEquals(3, lsum(self.numbers[0:3])) + + def teststdev(self): + self.assertAlmostEqual(self.numbers_stdev, lsamplestdev(self.numbersItems, self.numbersCounts, self.numbers_mean), 4) + +if __name__ == '__main__': + main() + #unittest.main() + \ No newline at end of file diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py index 8cf7c49bd..cba63d1a1 100755 --- a/python/easyRecalQuals.py +++ b/python/easyRecalQuals.py @@ -1,23 +1,12 @@ -# -# -# -# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.bam --OUTPUT_FILEROOT test/NA07048_test --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.raw_data.csv -# -# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta -T TableRecalibration -I NA07048.bam -params test/NA07048_test.raw_data.csv -outputBAM NA07048.test.bam -l INFO -compress 1 -L chr1:1-10,000,000 -# samtools index NA07048.test.bam -# -# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.test.bam --OUTPUT_FILEROOT test/NA07048_test.recal --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.recal.raw_data.csv -# import farm_commands import os.path import sys from optparse import OptionParser -import picard_utils from gatkConfigParser import * import glob if __name__ == "__main__": - usage = """usage: %prog config.cfg* input.bam output.bam""" + usage = """usage: %prog [-c config.cfg]* input.bam output.bam""" parser = OptionParser(usage=usage) parser.add_option("-A", "--args", dest="args", @@ -29,6 +18,9 @@ if __name__ == "__main__": parser.add_option("-q", "--farm", dest="farmQueue", type="string", default=None, help="Farm queue to send processing jobs to") + parser.add_option("-c", "--config", dest="configs", + action="append", type="string", default=[], + help="Configuration file") parser.add_option("-d", "--dir", dest="scratchDir", type="string", default="test", help="Output directory") @@ -43,13 +35,12 @@ if __name__ == "__main__": help="Ignores already written files, if present") (OPTIONS, args) = parser.parse_args() - #if len(args) != 3: - # parser.error("incorrect number of arguments") + if len(args) != 2: + parser.error("incorrect number of arguments") - configFiles = args[0:len(args)-2] - config = gatkConfigParser(configFiles) - inputBAM = args[len(args)-2] - outputBAM = args[len(args)-1] + config = gatkConfigParser(OPTIONS.configs) + inputBAM = args[0] + outputBAM = args[1] rootname = os.path.split(os.path.splitext(outputBAM)[0])[1] covariateRoot = os.path.join(OPTIONS.scratchDir, rootname) @@ -75,33 +66,16 @@ if __name__ == "__main__": cmd = covariateCmd(inputBAM, dir, ignoreAdds) return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - if OPTIONS.plot: - def plotCmd(cmd): - farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) - - Rscript = config.getOption('R', 'Rscript', 'input_file') - plotEmpStated = config.getOption('R', 'PlotQEmpStated', 'input_file') - plotByCycleDinuc = config.getOption('R', 'PlotQByCycleDinuc', 'input_file') + # + # Actually do some work here + # + jobid = None + jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False) + + if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM): + cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) + jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - empQualFiles = map( lambda x: glob.glob(''.join([x,'.RG_*.empirical_v_reported_quality.csv'])), [covariateInitial, covariateRecal]) - empQualFiles = empQualFiles[0] + empQualFiles[1] - readGroupRoots = map(lambda x: x.replace(".empirical_v_reported_quality.csv", ""), empQualFiles) - print readGroupRoots - - for file in empQualFiles: - plotCmd(' '.join([Rscript, plotEmpStated, file])) - - for root in readGroupRoots: - plotCmd(' '.join([Rscript, plotByCycleDinuc, root])) - - else: - jobid = None - jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False) - - if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM): - cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) - - jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True) + jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True) \ No newline at end of file diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py index 27892f560..8e6367356 100755 --- a/python/gatkConfigParser.py +++ b/python/gatkConfigParser.py @@ -25,7 +25,7 @@ class gatkConfigParser(ConfigParser.SafeConfigParser): ConfigParser.SafeConfigParser.__init__(self) files = filter(None, configFiles) print 'Reading configuration file(s):', files - print self.read(files) + self.read(files) self.validateRequiredOptions() def validateRequiredOptions(self): @@ -34,7 +34,7 @@ class gatkConfigParser(ConfigParser.SafeConfigParser): def validateOption(self, section, name, type = str): v = self.getOption(section, name, type) - print ' => Validated option', name, v + #print ' => Validated option', name, v def getGATKOption(self, name, type = str): return self.getOption(self.GATK, name, type) @@ -85,9 +85,9 @@ class TestMergeBAMsUtils(unittest.TestCase): def testValidate(self): self.config.validateRequiredOptions() - def testCmd(self): - s = "java -ea -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:1-10,000,000 -R /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18.fasta -T CountCovariates --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1" - self.assertEquals(self.config.gatkCmd('CountCovariates'), s) + #def testCmd(self): + # s = "java -ea -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:1-10,000,000 -R /home/radon01/depristo/work/humanref/Homo_sapiens_assembly18.fasta -T CountCovariates --MIN_MAPPING_QUALITY 1" + # self.assertEquals(self.config.gatkCmd('CountCovariates'), s) if __name__ == '__main__': unittest.main() \ No newline at end of file