diff --git a/R/plot_q_emp_stated_hst.R b/R/plot_q_emp_stated_hst.R new file mode 100755 index 000000000..ec6f1e0af --- /dev/null +++ b/R/plot_q_emp_stated_hst.R @@ -0,0 +1,26 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) + +input = args[1] + +t=read.table(input, header=T) +#t=read.csv(input) +#par(mfrow=c(2,1), cex=1.2) + +#outfile = paste(input, ".quality_emp_v_stated.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +plot(t$Qreported, t$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores") +abline(0,1) +dev.off() + +#outfile = paste(input, ".quality_emp_hist.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0) +plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/R/plot_qual_diff_v_cycle.R b/R/plot_qual_diff_v_cycle.R new file mode 100755 index 000000000..862569c22 --- /dev/null +++ b/R/plot_qual_diff_v_cycle.R @@ -0,0 +1,16 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] + +#X11(width=7, height=14) +#outfile = paste(input, ".qual_diff_v_cycle.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) +outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10)) + diff --git a/R/plot_qual_diff_v_dinuc.R b/R/plot_qual_diff_v_dinuc.R new file mode 100755 index 000000000..03b70d494 --- /dev/null +++ b/R/plot_qual_diff_v_dinuc.R @@ -0,0 +1,16 @@ +#!/broad/tools/apps/R-2.6.0/bin/Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] + +#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="") +#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680) +outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="") +#d <- read.csv(input) +d <- read.table(input, header=T) +plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10)) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index fc4a6d0ab..da4675b3a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -22,20 +22,11 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false) public int buggyMaxReadLen = 100000; - //@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false) - //public int MAX_QUAL_SCORE = 63; - @Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files") public String OUTPUT_FILEROOT = "output"; - //@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression") - //public boolean CREATE_TRAINING_DATA = true; - - //@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample") - //public float DOWNSAMPLE_FRACTION=1.0f; - @Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score") - public int MIN_MAPPING_QUALITY = 0; + public int MIN_MAPPING_QUALITY = 1; @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)") public String READ_GROUP = "none"; @@ -47,9 +38,6 @@ public class CovariateCounterWalker extends LocusWalker { public List platforms = Collections.singletonList("*"); //public List platforms = Collections.singletonList("ILLUMINA"); - //@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file") - //public boolean outputRawData = true; - @Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations") public boolean writeDetailedAnalysisFiles = false; @@ -59,10 +47,7 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") public boolean collapseDinuc = false; - //ArrayList flattenData = new ArrayList(); - //HashMap data = new HashMap(); HashMap data = new HashMap(); - //RecalData[][][] data; long counted_sites = 0; // number of sites used to count covariates long counted_bases = 0; // number of bases used to count covariates @@ -161,21 +146,6 @@ public class CovariateCounterWalker extends LocusWalker { } public void onTraversalDone(Integer result) { - /* - PrintStream covars_out; - try { - covars_out = new PrintStream(OUTPUT_FILEROOT+".covars.out"); - covars_out.println(RecalData.headerString()); - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - for ( RecalData datum : getRecalData(readGroup.getReadGroupId()) ) { - covars_out.println(datum); - } - } - } catch (FileNotFoundException e) { - System.err.println("FileNotFoundException: " + e.getMessage()); - } - */ - printInfo(out); writeTrainingData(); writeAnalysisData(); @@ -194,7 +164,7 @@ public class CovariateCounterWalker extends LocusWalker { private void writeTrainingData() { out.printf("Writing raw recalibration data%n"); - writeRawTable(); + writeRecalTable(); out.printf("...done%n"); out.printf("Writing logistic recalibration data%n"); @@ -228,10 +198,10 @@ public class CovariateCounterWalker extends LocusWalker { } } - private void writeRawTable() { + private void writeRecalTable() { PrintStream table_out = null; try { - table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv"); + table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv"); printInfo(table_out); table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp"); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 9f8b6179b..b45d6d0fa 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -229,7 +229,7 @@ class CombinatorialRecalMapping implements RecalMapping { byte[][] dataTable = index == -1 ? null : cache.get(index); if ( dataTable == null && prevBase != 'N' && base != 'N' ) - throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base)); + throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base)); byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; diff --git a/python/MergeBAMBatch.py b/python/MergeBAMBatch.py index 8ef1f4fbb..19a49cd54 100755 --- a/python/MergeBAMBatch.py +++ b/python/MergeBAMBatch.py @@ -69,5 +69,5 @@ if __name__ == "__main__": jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry) if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()): - jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, '', waitID = jobid, just_print_commands = OPTIONS.dry) + jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry) diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py new file mode 100755 index 000000000..18cd34415 --- /dev/null +++ b/python/analyzeRecalQuals.py @@ -0,0 +1,253 @@ +from __future__ import with_statement +import farm_commands +import os.path +import sys +from optparse import OptionParser +import picard_utils +from gatkConfigParser import * +import re +from itertools import * +import math + +def phredQScore( nMismatches, nBases ): + #print 'phredQScore', nMismatches, nBases + if nMismatches == 0: + return 40 + elif nBases == 0: + return 0 + else: + return -10 * math.log10(float(nMismatches) / nBases) + + +def phredScore2ErrorProp(qual): + #print 'phredScore2ErrorProp', qual + return math.pow(10.0, float(qual) / -10.0) + +expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',') +defaultValues = '0,**,0,0,0,0,0'.split(',') +class RecalData(dict): + + def __init__(self): + self.parse(expectedHeader, defaultValues) + + def parse(self, header, data): + # rg,dn,Qrep,pos,NBases,MMismatches,Qemp + types = [str, str, int, int, int, int, int] + for head, expected, datum, type in zip(header, expectedHeader, data, types): + if head <> expected: + raise ("Unexpected header in rawData %s %s %s" % (head, expected, datum)) + #print 'Binding => ', head, type(datum) + self[head] = type(datum) + #print self + return self + + def set(self, header, values): + for head, val in zip(header, values): + self[head] = val + + def __getattr__(self, name): + return self[name] + + # rg,dn,Qrep,pos,NBases,MMismatches,Qemp + def readGroup(self): return self.rg + def dinuc(self): return self.dn + def qReported(self): return self.Qrep + def qEmpirical(self): return self.Qemp + def cycle(self): return self.pos + def nBases(self): return self.NBases + def nMismatches(self): return self.MMismatches + def nExpectedMismatches(self): return self.nBases() * phredScore2ErrorProp(self.qReported()) + + def combine(self, moreData): + # grab useful info + sumErrors = self.nExpectedMismatches() + for datum in moreData: + self.NBases += datum.nBases() + self.MMismatches += datum.nMismatches() + sumErrors += datum.nExpectedMismatches() + self.updateQemp() + self.Qrep = phredQScore(sumErrors, self.nBases()) + #print 'self.Qrep is now', self.Qrep + return self + + def updateQemp(self): + newQemp = phredQScore( self.nMismatches(), self.nBases() ) + #print 'Updating qEmp', self.Qemp, newQemp + self.Qemp = newQemp + return newQemp + + def __str__(self): + return "rg=%s cycle=%d dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d" % ( self.readGroup(), self.cycle(), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches()) + +# def __init__(dinuc, Qrep, pos, nbases, nmismatches, qemp ): +# self.dinuc = dinuc +# self.Qrep = Qrep + +def rawDataStream(file): + """Yields successive lists containing the CSVs in the data file; excludes headers""" + header = None + for line in open(file): + if line.find("#") <> -1: continue + else: + data = line.strip().split(',') + if line.find("rg,") <> -1: + header = data + else: + yield RecalData().parse(header, data) + +def rawDataByReadGroup(rawDataFile): + """Yields a stream of the data in rawDataFile, grouped by readGroup""" + for readGroup, generator in groupby(rawDataStream(rawDataFile), key=RecalData.readGroup): + yield (readGroup, list(generator)) + +def combineRecalData(separateData): + return RecalData().combine(separateData) + +def groupRecalData(allData, key=None): + s = sorted(allData, key=key) + values = [ [key, combineRecalData(vals)] for key, vals in groupby(s, key=key) ] + return sorted( values, key=lambda x: x[0]) + +# +# let's actually analyze the data! +# +def analyzeReadGroup(readGroup, data, outputRoot): + print 'Read group => ', readGroup + print 'Number of elements => ', len(data) + + files = [] + if OPTIONS.toStdout: + qReportedVsqEmpirical(readGroup, data, sys.stdout ) + qDiffByCycle(readGroup, data, sys.stdout) + qDiffByDinuc(readGroup, data, sys.stdout) + else: + def outputFile(tail): + file = outputRoot + tail + files.append(file) + return file + + with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output: + qReportedVsqEmpirical(readGroup, data, output ) + with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output: + qDiffByCycle(readGroup, data, output) + with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output: + qDiffByDinuc(readGroup, data, output) + + print 'Files', files + return analyzeFiles(files) + +def qDiffByCycle(readGroup, allData, output): + print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' + print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases' + for cycle, datum in groupRecalData(allData, key=RecalData.cycle): + datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle]) + diff = datum.qEmpirical() - datum.qReported() + print >> output, "%d %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) + +# +# public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) { +# ArrayList ByCycle = new ArrayList(); +# ArrayList ByCycleReportedQ = new ArrayList(); +# file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n"); +# RecalData All = new RecalData(0,0,readGroup,""); +# MeanReportedQuality AllReported = new MeanReportedQuality(); +# for (int c=0; c < RecalData.NDINUCS; c++) { +# ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c))); +# ByCycleReportedQ.add(new MeanReportedQuality()); +# } +# +# for ( RecalData datum: getRecalData(readGroup) ) { +# int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false); +# ByCycle.get(dinucIndex).inc(datum.N, datum.B); +# ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N); +# All.inc(datum.N, datum.B); +# AllReported.inc(datum.qual, datum.N); +# } +# +# for (int c=0; c < RecalData.NDINUCS; c++) { +# double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N); +# double reportedQual = ByCycleReportedQ.get(c).result(); +# file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N); +# } +# } + +def qDiffByDinuc(readGroup, allData, output): + print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases' + print >> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases' + for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc): + datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*']) + diff = datum.qEmpirical() - datum.qReported() + print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases()) + +def qReportedVsqEmpirical(readGroup, allData, output): + print >> output, 'Qreported Qempirical nMismatches nBases' + for key, datum in groupRecalData(allData, key=RecalData.qReported): + datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*']) + print >> output, "%2d %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases()) + +def analyzeRawData(rawDataFile): + for readGroup, data in rawDataByReadGroup(rawDataFile): + if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups: + root, sourceFilename = os.path.split(rawDataFile) + if ( OPTIONS.outputDir ): root = OPTIONS.outputDir + outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' )) + analyzeReadGroup(readGroup, data, outputRoot) + +plottersByFile = { + "raw_data.csv$" : analyzeRawData, + "recal_data.csv$" : analyzeRawData, + "empirical_v_reported_quality" : 'PlotQEmpStated', + "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc', + "quality_difference_v_cycle" : 'PlotQDiffByCycle' } + +def getPlotterForFile(file): + for pat, analysis in plottersByFile.iteritems(): + if re.search(pat, file): + if type(analysis) == str: + return config.getOption('R', analysis, 'input_file') + else: + analysis(file) + return None + +def analyzeFiles(files): + #print 'analyzeFiles', files + Rscript = config.getOption('R', 'Rscript', 'input_file') + for file in files: + plotter = getPlotterForFile(file) + if plotter <> None: + cmd = ' '.join([Rscript, plotter, file]) + farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) + +if __name__ == "__main__": + usage = """usage: %prog -c config.cfg files*""" + + parser = OptionParser(usage=usage) + parser.add_option("-q", "--farm", dest="farmQueue", + type="string", default=None, + help="Farm queue to send processing jobs to") + parser.add_option("-d", "--dir", dest="outputDir", + type="string", default=None, + help="If provided, analysis output files will be written to this directory") + parser.add_option("-c", "--config", dest="configs", + action="append", type="string", default=[], + help="Configuration file") + parser.add_option("-s", "--stdout", dest="toStdout", + action='store_true', default=False, + help="If provided, writes output to standard output, not to files") + parser.add_option("", "--dry", dest="dry", + action='store_true', default=False, + help="If provided, nothing actually gets run, just a dry run") + parser.add_option("-g", "--readGroup", dest="selectedReadGroups", + action="append", type="string", default=[], + help="If provided, only the provided read groups will be analyzed") + + (OPTIONS, args) = parser.parse_args() + #if len(args) != 3: + # parser.error("incorrect number of arguments") + + if len(OPTIONS.configs) == 0: + parser.error("Requires at least one configuration file be provided") + + config = gatkConfigParser(OPTIONS.configs) + + analyzeFiles(args) \ No newline at end of file diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py new file mode 100755 index 000000000..8cf7c49bd --- /dev/null +++ b/python/easyRecalQuals.py @@ -0,0 +1,107 @@ +# +# +# +# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.bam --OUTPUT_FILEROOT test/NA07048_test --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.raw_data.csv +# +# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta -T TableRecalibration -I NA07048.bam -params test/NA07048_test.raw_data.csv -outputBAM NA07048.test.bam -l INFO -compress 1 -L chr1:1-10,000,000 +# samtools index NA07048.test.bam +# +# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.test.bam --OUTPUT_FILEROOT test/NA07048_test.recal --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.recal.raw_data.csv +# +import farm_commands +import os.path +import sys +from optparse import OptionParser +import picard_utils +from gatkConfigParser import * +import glob + +if __name__ == "__main__": + usage = """usage: %prog config.cfg* input.bam output.bam""" + + parser = OptionParser(usage=usage) + parser.add_option("-A", "--args", dest="args", + type="string", default="", + help="arguments to GATK") + parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs", + type="string", default="", + help="arguments to GATK") + parser.add_option("-q", "--farm", dest="farmQueue", + type="string", default=None, + help="Farm queue to send processing jobs to") + parser.add_option("-d", "--dir", dest="scratchDir", + type="string", default="test", + help="Output directory") + parser.add_option("", "--dry", dest="dry", + action='store_true', default=False, + help="If provided, nothing actually gets run, just a dry run") + parser.add_option("", "--plot", dest="plot", + action='store_true', default=False, + help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files") + parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles", + action='store_true', default=False, + help="Ignores already written files, if present") + + (OPTIONS, args) = parser.parse_args() + #if len(args) != 3: + # parser.error("incorrect number of arguments") + + configFiles = args[0:len(args)-2] + config = gatkConfigParser(configFiles) + inputBAM = args[len(args)-2] + outputBAM = args[len(args)-1] + rootname = os.path.split(os.path.splitext(outputBAM)[0])[1] + + covariateRoot = os.path.join(OPTIONS.scratchDir, rootname) + covariateInitial = covariateRoot + '.init' + initDataFile = covariateInitial + '.raw_data.csv' + covariateRecal = covariateRoot + '.recal' + recalDataFile = covariateRecal + '.raw_data.csv' + + if not os.path.exists(OPTIONS.scratchDir): + os.mkdir(OPTIONS.scratchDir) + + def covariateCmd(bam, outputDir, ignoreAdds): + add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir) + if not ignoreAdds: + add += " " + OPTIONS.CovariateArgs + return config.gatkCmd('CountCovariates') + add + + def recalibrateCmd(inputBAM, dataFile, outputBAM): + return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM) + + def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False): + if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile): + cmd = covariateCmd(inputBAM, dir, ignoreAdds) + return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) + + if OPTIONS.plot: + def plotCmd(cmd): + farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) + + Rscript = config.getOption('R', 'Rscript', 'input_file') + plotEmpStated = config.getOption('R', 'PlotQEmpStated', 'input_file') + plotByCycleDinuc = config.getOption('R', 'PlotQByCycleDinuc', 'input_file') + + empQualFiles = map( lambda x: glob.glob(''.join([x,'.RG_*.empirical_v_reported_quality.csv'])), [covariateInitial, covariateRecal]) + empQualFiles = empQualFiles[0] + empQualFiles[1] + readGroupRoots = map(lambda x: x.replace(".empirical_v_reported_quality.csv", ""), empQualFiles) + print readGroupRoots + + for file in empQualFiles: + plotCmd(' '.join([Rscript, plotEmpStated, file])) + + for root in readGroupRoots: + plotCmd(' '.join([Rscript, plotByCycleDinuc, root])) + + else: + jobid = None + jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False) + + if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM): + cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) + jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid) + + jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True) + \ No newline at end of file diff --git a/python/recal_qual.py b/python/recal_qual.py deleted file mode 100755 index d46640fb8..000000000 --- a/python/recal_qual.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python - -# Hg18 ref and rods -reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" -dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out" - -## 1KG ref and rods -reference = "/broad/1KG/reference/human_b36_both.fasta" -dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out" - -# Flags - many of these are not necessary now but were used for debugging -training_range_str = "" #" -L chr1" -gatk_flags = "" -downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01" -read_group = "" #" --READ_GROUP SRR005814" -min_mapping_quality = " --MIN_MAPPING_QUALITY 1" -recal_limit = 0 - -javaexe = "java -ea" - -justPrintCommands = False - -import os, sys -sys.path.append(".") -if len(sys.argv) > 1: - #from recal_args import * - #print "Using args imported from recal_args.py" -#else: - #from default_broad_recal_args import * - #print "Using default Broad recalibration arguments" - file = sys.argv[1] - fileroot = file - if file.endswith(".bam"): - fileroot = fileroot.replace(".bam", "") - fileroot = os.path.basename(fileroot) - print "fileroot: "+fileroot - #training_range_str = " -L chr1:1-20000000" - -plot = False -recal_only = False -for arg in sys.argv: - if arg == "-plot": - plot = True - if arg == "-recal_only": - recal_only = True - -def cmd(cmdstr): - print ">>> Running: "+cmdstr - if (not justPrintCommands): - ret_val = str(os.system(cmdstr)) - print "<<< Returned: "+str(ret_val) - - if int(ret_val) != 0: - sys.exit("Last command failed: "+cmdstr) - -# Assure that BAM index is available -def assert_bam_index(bam_filename): - if not os.path.exists(bam_filename+".bai"): - cmd("samtools index "+bam_filename) - -def ensure_dir(dir): - if not os.path.exists(dir): - os.makedirs(dir) - -# Process arguments -if recal_limit == 0: - recal_limit_str = "" -else: - recal_limit_str = "_"+str(recal_limit) - -# This was the central directory where I put the files, I've changed it to "." -# for simplicity now even though it probably needs to be read group aware -#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work" -covar_path = "." -central_fileroot= covar_path+"/"+fileroot - -bam = central_fileroot+".bam" -if not os.path.lexists(bam): - os.symlink(file, bam) - -correction_str = ".corrected"+recal_limit_str -corrected_fileroot = central_fileroot+correction_str -corrected_bam = corrected_fileroot+".bam" - -assert_bam_index(bam) -cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality) - -# Make plots -if plot: - cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv") - cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot) - -cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot) - -if recal_only: # Stop if we are only making recalibration tables - sys.exit() - -cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags) -cmd("samtools index "+corrected_bam) - -cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality) - -# Make plots -if plot: - cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv") - cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot) - -# Merge corrected and uncorrected files -output_dir = "comparison_png" -ensure_dir(output_dir) -corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f] -print "\n".join(corrected_pngs) - -for corrected_png in corrected_pngs: - raw_png = corrected_png.replace(correction_str, "") - both_png = corrected_png.replace(correction_str, "raw_and_corrected") - cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png])) - - - diff --git a/testdata/defaultGATKConfig.cfg b/testdata/defaultGATKConfig.cfg index a45525fa2..9f87170ed 100755 --- a/testdata/defaultGATKConfig.cfg +++ b/testdata/defaultGATKConfig.cfg @@ -10,14 +10,17 @@ gatkData: /humgen/gsa-scr1/GATK_Data dbsnp: %(gatkData)s/dbsnp_129_hg18.rod tmp: /tmp/ args: -l INFO +Sting: ~/dev/GenomeAnalysisTK/trunk #args: -l INFO -L chr1:1-1,000,000 [CountCovariates] -args: --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s +args: --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s [TableRecalibration] args: -compress 1 [R] Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript -PlotQEmpStated: ~andrewk/recalQual_resources/plot_q_emp_stated_hst.R +PlotQEmpStated: %(Sting)s/R/plot_q_emp_stated_hst.R +PlotQDiffByCycle: %(Sting)s/R/plot_qual_diff_v_cycle.R +PlotQDiffByDinuc: %(Sting)s/R/plot_qual_diff_v_dinuc.R