From f5b00c20d0c3ab7f7ce9c82b5a4ac3697da8c753 Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 7 Jul 2009 14:15:39 +0000 Subject: [PATCH] Updated python files git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1182 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_q_emp_stated_hst.R | 14 +++- doc/ReadQualityRecalibrator/README | 113 ----------------------------- python/Gelis2PopSNPs.py | 15 ++-- python/analyzeRecalQuals.py | 28 ++++--- python/easyRecalQuals.py | 18 ++--- python/farm_commands.py | 2 +- python/gatkConfigParser.py | 9 ++- python/picard_utils.py | 2 +- 8 files changed, 59 insertions(+), 142 deletions(-) delete mode 100644 doc/ReadQualityRecalibrator/README diff --git a/R/plot_q_emp_stated_hst.R b/R/plot_q_emp_stated_hst.R index 468b619d0..f020f944d 100755 --- a/R/plot_q_emp_stated_hst.R +++ b/R/plot_q_emp_stated_hst.R @@ -15,7 +15,7 @@ pdf(outfile, height=7, width=7) d.good <- t[t$nMismatches >= 1000,] d.100 <- t[t$nMismatches < 100,] d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,] -plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores") +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores") points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16) points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16) abline(0,1, lty=2) @@ -26,6 +26,16 @@ dev.off() outfile = paste(input, ".quality_emp_hist.pdf", sep="") pdf(outfile, height=7, width=7) hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0) -plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") +plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0) +plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() diff --git a/doc/ReadQualityRecalibrator/README b/doc/ReadQualityRecalibrator/README deleted file mode 100644 index cbc1605d6..000000000 --- a/doc/ReadQualityRecalibrator/README +++ /dev/null @@ -1,113 +0,0 @@ -Read Quality Recalibrator -------------------------- -The tools in this package recalibrate quality scores of -Illumina reads in an aligned BAM file. After recalibration, -the quality scores in the QUAL field in each Illumina read -in the output BAM are accurate in that the reported quality -score is equal to its actual probability of mismatching. -This is process is accomplished by analyzing the covariation -between machine reported quality scores and - -1) the position within the read, and -2) the preceding nucleotide (sequencing chemistry effect). - -The aligned reads have their dbSNP sites masked out, and the -mismatched bases are used as a metric for the true error rate -of the system. The error rate at different dinucleotides and -positions in the read is then fed into a logistic regression -system which outputs a correction factor for each of those -combinations which are then use to output a recalibrated BAM -file. - -Software Dependencies ---------------------- -The recalibrator currently depends on the following -applications. Please install these before proceeding. - -- Java Runtime Environment 1.6.0_12 or later -- Python 2.4.2 or later -- R 2.6.0 or later -- samtools 0.1.4 or later. - -Please update the file paths at the top of -RecalQual.py to point to the local installations of the -software listed above. - -Running -------- -Before running the tool, please update the file paths -listed at the top of RecalQual.py to point to the -local installations of the tools listed above. - -The recalibrator has two modes: recalibration and -evaluation, which can be run separately or jointly. -By default, the recalibrator will recalibrate only. - -To calibrate a given bam file, run the following -command: - -python RecalQual.py - -After recalibration, performing evaluation will walk -through the source BAM file again, remeasuring the -differences between empirical vs. reported quality. -The results of evaluation are comma-delimited text -files (.csv) and graphs (.png) which can be found -in the output directory (see below) for the given run. -A successful recalibration should produce a plot of -empirical vs. observed qualities that shows little -difference between the two values. - -To both recalibrate and evaluate, execute: - -python RecalQual.py --recalibrate --evaluate - -To (only) evaluate a given bam file after calibrating: - -python RecalQual.py --evaluate - -Platforms ---------- -By default, the recalibrator processes only read groups -originating from Illumina sequencers. To enable calibration -for other platforms, edit the 'platforms' array at the -top of RecalQual.py. Platforms specified here should -case-insensitive match the "PL" attribute of the read -group in the BAM file. - -Output ------- -The recalibration process keeps many incremental -files around for future analysis. By default, all -of these files will be grouped into the directory -'output./'. By default, this directory -will be created within the working directory. This -location can be changed by editing the output_root -variable at the top of RecalQual.py. - -It is safe to delete this supplemental output -directory at any time. - -Known Issues ------------- -- The recalibrator places severe memory demands on - files with large numbers of read groups. -- If running in 'evaluation' mode (see the 'Running' - section above), X11 is required to generate the - graphs. If running on a machine via ssh, be certain - to enable X tunnelling. - -Troubleshooting ---------------- -- The memory requirements of the recalibrator will - vary based on the type of JVM running the application - and the number of read groups in the input bam file. - If the application reports 'java.lang.OutOfMemoryError: - Java heap space', increase the max heap size provided - to the JVM by adding ' -Xmx????m' to the jvm_args - variable in RecalQual.py, where '????' is the maximum - available memory on the processing computer. - -Support -------- -For support, please email gsadevelopers@broad.mit.edu. diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index 89246ddad..b4da5b201 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -13,7 +13,10 @@ ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" def geli2dbsnpFile(geli): root, flowcellDotlane, ext = picard_utils.splitPath(geli) - return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' + if ext == ".calls": + return None + else: + return os.path.join(root, flowcellDotlane) + '.dbsnp_matches' def SingleSampleGenotyperCmd(bam, geli, use2bp): naid = os.path.split(bam)[1].split(".")[0] @@ -49,7 +52,7 @@ def gelis2gelisText( gelis ): if os.path.splitext(maybeGeli)[1] == ".calls" : return maybeGeli else: - return os.path.split(geli)[1] + '.calls' + return os.path.split(maybeGeli)[1] + '.calls' return map( geli2geliText, gelis) @@ -84,7 +87,7 @@ def main(): (OPTIONS, args) = parser.parse_args() if len(args) != 2: parser.error("incorrect number of arguments: " + str(args)) - lines = [line.split() for line in open(args[0])] + lines = [line.strip().split() for line in open(args[0])] nIndividuals = int(args[1]) outputFile = open(OPTIONS.output, 'w') @@ -108,7 +111,7 @@ def main(): for geli, jobid in zip(gelis, jobids): dbsnpFile = geli2dbsnpFile(geli) - if not os.path.exists(dbsnpFile): + if dbsnpFile == None and not os.path.exists(dbsnpFile): dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod) if jobid == 0: jobid = None farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid) @@ -151,14 +154,14 @@ def main(): sortedCalls = [line.split() for line in open(sortedCallFile)] aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls) - print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals' + print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom' for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ): if snp == None: continue # ignore bad calls #print snp #sharedCalls = list(sharedCallsGroup) #genotype = list(sharedCalls[0][5]) - print >> outputFile, '%s %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes()) + print >> outputFile, '%s %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes()) outputFile.close() diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py index d466ba0fa..3c3be4738 100755 --- a/python/analyzeRecalQuals.py +++ b/python/analyzeRecalQuals.py @@ -10,7 +10,7 @@ from itertools import * import math import operator -MAX_QUAL_SCORE = 50 +MAX_QUAL_SCORE = 40 def phredQScore( nMismatches, nBases ): """Calculates a phred-scaled score for nMismatches in nBases""" @@ -293,19 +293,25 @@ def qReportedVsqEmpiricalStream(readGroup, data): yield key, datum def qReportedVsqEmpirical(readGroup, allData, output): - print >> output, 'Qreported Qempirical nMismatches nBases' + print >> output, 'Qreported Qempirical nMismatches nBases PercentBases' + rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0] for key, datum in qReportedVsqEmpiricalStream(readGroup, allData): #if datum.qReported() > 35: # print datum - print >> output, "%2.2f %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases()) + print >> output, "%2.2f %2.2f %12d %12d %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases())) def analyzeRawData(rawDataFile): + nReadGroups = 0 for readGroup, data in rawDataByReadGroup(rawDataFile): if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups: - root, sourceFilename = os.path.split(rawDataFile) - if ( OPTIONS.outputDir ): root = OPTIONS.outputDir - outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' )) - analyzeReadGroup(readGroup, data, outputRoot) + nReadGroups += 1 + if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1: + break + else: + root, sourceFilename = os.path.split(rawDataFile) + if ( OPTIONS.outputDir ): root = OPTIONS.outputDir + outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' )) + analyzeReadGroup(readGroup, data, outputRoot) plottersByFile = { "raw_data.csv$" : analyzeRawData, @@ -344,6 +350,9 @@ def main(): parser.add_option("-d", "--dir", dest="outputDir", type="string", default=None, help="If provided, analysis output files will be written to this directory") + parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups", + type="int", default=-1, + help="Maximum number of read groups to process. The default of -1 indicates that all read groups will be processed") parser.add_option("-c", "--config", dest="configs", action="append", type="string", default=[], help="Configuration file") @@ -371,7 +380,8 @@ def main(): parser.error("Requires at least one configuration file be provided") config = gatkConfigParser(OPTIONS.configs) - + + if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups analyzeFiles(args) import unittest @@ -400,4 +410,4 @@ class TestanalzyeRecalQuals(unittest.TestCase): if __name__ == '__main__': main() #unittest.main() - \ No newline at end of file + diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py index 62f7c27c3..0343744b7 100755 --- a/python/easyRecalQuals.py +++ b/python/easyRecalQuals.py @@ -13,7 +13,7 @@ if __name__ == "__main__": type="string", default="", help="arguments to GATK") parser.add_option("-m", "--mode", dest="RecalMode", - type="string", default="", + type="string", default="SEQUENTIAL", help="Mode argument to provide to table recalibrator") parser.add_option("-q", "--farm", dest="farmQueue", type="string", default=None, @@ -22,14 +22,14 @@ if __name__ == "__main__": action="append", type="string", default=[], help="Configuration file") parser.add_option("-d", "--dir", dest="scratchDir", - type="string", default="test", + type="string", default="./", help="Output directory") + parser.add_option("-w", "--wait", dest="initialWaitID", + type="string", default=None, + help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite") parser.add_option("", "--dry", dest="dry", action='store_true', default=False, help="If provided, nothing actually gets run, just a dry run") - parser.add_option("", "--plot", dest="plot", - action='store_true', default=False, - help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files") parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", action='store_true', default=False, help="Ignores already written files, if present") @@ -54,10 +54,10 @@ if __name__ == "__main__": def covariateCmd(bam, outputDir): add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir) - return config.gatkCmd('CountCovariates') + add + return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add def recalibrateCmd(inputBAM, dataFile, outputBAM): - return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode) + return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode) def runCovariateCmd(inputBAM, dataFile, dir, jobid): if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile): @@ -67,7 +67,7 @@ if __name__ == "__main__": # # Actually do some work here # - jobid = None + jobid = OPTIONS.initialWaitID if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid) @@ -77,4 +77,4 @@ if __name__ == "__main__": jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid) - \ No newline at end of file + diff --git a/python/farm_commands.py b/python/farm_commands.py index 883f7776b..07e60d5a3 100644 --- a/python/farm_commands.py +++ b/python/farm_commands.py @@ -22,7 +22,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman cmd_str += " -o " + farm_stdout if waitID <> None: - cmd_str += " -w \"done(%s)\"" % (str(waitID)) + cmd_str += " -w \"ended(%s)\"" % (str(waitID)) cmd_str += " \""+cmd_str_from_user + "\"" diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py index 8e6367356..5cd3796e1 100755 --- a/python/gatkConfigParser.py +++ b/python/gatkConfigParser.py @@ -71,9 +71,16 @@ class gatkConfigParser(ConfigParser.SafeConfigParser): def gatk_args(self): return self.getOption(self.GATK, 'args') def reference(self): return self.getOption(self.GATK, 'reference') - def gatkCmd(self, mode): + def gatkCmd(self, mode, log = None, stdLogName=False): cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()]) cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)]) + if log <> None: + if stdLogName: + #head, ext = os.path.splitext(log) + logName = log + "." + mode + ".log" + else: + logName = log + cmd += ' ' + ' '.join(['-log', logName]) return cmd import unittest diff --git a/python/picard_utils.py b/python/picard_utils.py index 56fadd162..e42d7a8b5 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, MSDStr = '' if MSD: MSDStr = 'MSD=true' - return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) + return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) def getPicardPath(lane, picardRoot = '/seq/picard/'):