Approach v2. Added a Python analysis script, so Java no longer needs to be used to analyze quality score data. About to refactor out lots of unneeded code

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1063 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-20 16:00:23 +00:00
parent dde52e33eb
commit 9e26550b0d
10 changed files with 429 additions and 158 deletions

View File

@ -0,0 +1,26 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots recalibration quality data: a scatter of reported vs. empirical
# quality scores, and a histogram of base counts per empirical quality bin.
# Usage: <script> <table-file>; writes two PDFs next to the input file.
args <- commandArgs(TRUE)
input <- args[1]
qualTable <- read.table(input, header=TRUE)

# Scatter plot: reported quality (x) vs. empirical quality (y), with the
# ideal y = x line for reference.
outfile <- paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
plot(qualTable$Qreported, qualTable$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
abline(0, 1)
dev.off()

# Histogram of base counts per empirical quality score, skipping empty bins.
outfile <- paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
nonEmpty <- subset(data.frame(t.Qempirical=qualTable$Qempirical, t.nBases=qualTable$nBases), t.nBases != 0)
plot(nonEmpty$t.Qempirical, nonEmpty$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
# Force plain (non-scientific) tick labels on the y axis.
axis(2, axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -0,0 +1,16 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots (empirical - reported) quality score as a function of machine cycle.
# Usage: <script> <table-file>; writes a PDF next to the input file.
# NOTE(review): this diff hunk claims 16 lines but fewer are shown; the
# closing dev.off() is presumably below the visible region.
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#X11(width=7, height=14)
#outfile = paste(input, ".qual_diff_v_cycle.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
# Input is the .quality_difference_v_cycle.dat table written by the
# Python analysis script (qDiffByCycle).
c <- read.table(input, header=T)
plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10))

View File

@ -0,0 +1,16 @@
#!/broad/tools/apps/R-2.6.0/bin/Rscript
# Plots (empirical - reported) quality score per dinucleotide context.
# Usage: <script> <table-file>; writes a PDF next to the input file.
# NOTE(review): this diff hunk claims 16 lines but fewer are shown; the
# closing dev.off() is presumably below the visible region.
args <- commandArgs(TRUE)
verbose = TRUE
input = args[1]
#outfile = paste(input, ".qual_diff_v_dinuc.png", sep="")
#png(outfile, height=7, width=7, units="in", res=72) #height=1000, width=680)
outfile = paste(input, ".qual_diff_v_dinuc.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
#in_dinuc = paste(input, ".quality_difference_v_dinucleotide.csv", sep="")
#d <- read.csv(input)
# Input is the .quality_difference_v_dinucleotide.dat table written by the
# Python analysis script (qDiffByDinuc).
d <- read.table(input, header=T)
plot(d$Dinuc, d$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Dinucleotide", ylim=c(-10,10))

View File

@ -22,20 +22,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
public int buggyMaxReadLen = 100000;
//@Argument(fullName="MAX_QUAL_SCORE", shortName="mqs", doc="max quality score", required=false)
//public int MAX_QUAL_SCORE = 63;
@Argument(fullName="OUTPUT_FILEROOT", shortName="outroot", required=false, doc="Filename root for the outputted logistic regression training files")
public String OUTPUT_FILEROOT = "output";
//@Argument(fullName="CREATE_TRAINING_DATA", shortName="trainingdata", required=false, doc="Create training data files for logistic regression")
//public boolean CREATE_TRAINING_DATA = true;
//@Argument(fullName="DOWNSAMPLE_FRACTION", shortName="sample", required=false, doc="Fraction of bases to randomly sample")
//public float DOWNSAMPLE_FRACTION=1.0f;
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
public int MIN_MAPPING_QUALITY = 0;
public int MIN_MAPPING_QUALITY = 1;
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
public String READ_GROUP = "none";
@ -47,9 +38,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public List<String> platforms = Collections.singletonList("*");
//public List<String> platforms = Collections.singletonList("ILLUMINA");
//@Argument(fullName="rawData", shortName="raw", required=false, doc="If true, raw mismatch observations will be output to a file")
//public boolean outputRawData = true;
@Argument(fullName="writeDetailedAnalysisFiles", required=false, doc="If true, we'll write detailed analysis files out at the end of covariate calcuilations")
public boolean writeDetailedAnalysisFiles = false;
@ -59,10 +47,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
//ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
//HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
//RecalData[][][] data;
long counted_sites = 0; // number of sites used to count covariates
long counted_bases = 0; // number of bases used to count covariates
@ -161,21 +146,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
public void onTraversalDone(Integer result) {
/*
PrintStream covars_out;
try {
covars_out = new PrintStream(OUTPUT_FILEROOT+".covars.out");
covars_out.println(RecalData.headerString());
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for ( RecalData datum : getRecalData(readGroup.getReadGroupId()) ) {
covars_out.println(datum);
}
}
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException: " + e.getMessage());
}
*/
printInfo(out);
writeTrainingData();
writeAnalysisData();
@ -194,7 +164,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
private void writeTrainingData() {
out.printf("Writing raw recalibration data%n");
writeRawTable();
writeRecalTable();
out.printf("...done%n");
out.printf("Writing logistic recalibration data%n");
@ -228,10 +198,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
}
private void writeRawTable() {
private void writeRecalTable() {
PrintStream table_out = null;
try {
table_out = new PrintStream( OUTPUT_FILEROOT+".raw_data.csv");
table_out = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv");
printInfo(table_out);
table_out.println("rg,dn,Qrep,pos,NBases,MMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {

View File

@ -229,7 +229,7 @@ class CombinatorialRecalMapping implements RecalMapping {
byte[][] dataTable = index == -1 ? null : cache.get(index);
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, prevBase, base));
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;

View File

@ -69,5 +69,5 @@ if __name__ == "__main__":
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, output, just_print_commands = OPTIONS.dry)
if OPTIONS.ignoreExistingFiles or not os.path.exists(spec.getMergedBAMIndex()):
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, '', waitID = jobid, just_print_commands = OPTIONS.dry)
jobid = farm_commands.cmd(spec.getIndexCmd(), OPTIONS.farmQueue, None, waitID = jobid, just_print_commands = OPTIONS.dry)

View File

@ -0,0 +1,253 @@
from __future__ import with_statement
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import re
from itertools import *
import math
def phredQScore( nMismatches, nBases ):
    """Empirical Phred quality score: -10 * log10(mismatch rate).

    nMismatches may be a float (expected-mismatch sums from
    RecalData.combine()).  Returns 0 for an empty bin (nBases == 0) and
    caps mismatch-free bins at Q40, since log10(0) is undefined.
    """
    if nBases == 0:
        # No observations at all.  Checked first so the 0/0 case yields 0;
        # previously it fell into the nMismatches == 0 branch and returned 40.
        return 0
    elif nMismatches == 0:
        return 40
    else:
        return -10 * math.log10(float(nMismatches) / nBases)
def phredScore2ErrorProp(qual):
    """Convert a Phred quality score into its error probability, 10^(-Q/10)."""
    return 10.0 ** (float(qual) / -10.0)
# Expected column order of the raw/recal CSV written by CountCovariates.
expectedHeader = 'rg,dn,Qrep,pos,NBases,MMismatches,Qemp'.split(',')
# Values used to initialize an empty RecalData record.
defaultValues = '0,**,0,0,0,0,0'.split(',')

class RecalData(dict):
    """One row of recalibration covariate data, keyed by CSV column name.

    Stored as a dict so rows can be parsed generically from the CSV
    header; __getattr__ exposes the columns as attributes.  NOTE(review):
    attribute *assignment* (as done in combine/updateQemp) writes to the
    instance __dict__, not the dict entry, so the two stores can diverge
    after combine(); the accessors below still see the updated values.
    """

    def __init__(self):
        # Start from the all-zero defaults; parse() overwrites with real data.
        self.parse(expectedHeader, defaultValues)

    def parse(self, header, data):
        """Bind one CSV row (rg,dn,Qrep,pos,NBases,MMismatches,Qemp) to self.

        Raises ValueError if the file's header does not match expectedHeader.
        (Previously raised a plain string, which is not a legal exception.)
        """
        types = [str, str, int, int, int, int, int]
        for head, expected, datum, conv in zip(header, expectedHeader, data, types):
            if head != expected:
                raise ValueError("Unexpected header in rawData %s %s %s" % (head, expected, datum))
            self[head] = conv(datum)
        return self

    def set(self, header, values):
        """Overwrite the named fields with the given values (no conversion)."""
        for head, val in zip(header, values):
            self[head] = val

    def __getattr__(self, name):
        # Expose dict entries as attributes; translate a miss into
        # AttributeError (not KeyError) so hasattr() and friends behave.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    # Named accessors for the CSV columns: rg,dn,Qrep,pos,NBases,MMismatches,Qemp
    def readGroup(self): return self.rg
    def dinuc(self): return self.dn
    def qReported(self): return self.Qrep
    def qEmpirical(self): return self.Qemp
    def cycle(self): return self.pos
    def nBases(self): return self.NBases
    def nMismatches(self): return self.MMismatches

    def nExpectedMismatches(self):
        # Mismatches we would expect at the reported quality, given nBases.
        return self.nBases() * phredScore2ErrorProp(self.qReported())

    def combine(self, moreData):
        """Fold the records in moreData into self and return self.

        Sums bases and mismatches, recomputes Qemp from the summed counts,
        and sets Qrep to the expected quality across all combined bins
        (weighted by nBases via the expected-mismatch sums).
        """
        sumErrors = self.nExpectedMismatches()
        for datum in moreData:
            self.NBases += datum.nBases()
            self.MMismatches += datum.nMismatches()
            sumErrors += datum.nExpectedMismatches()
        self.updateQemp()
        self.Qrep = phredQScore(sumErrors, self.nBases())
        return self

    def updateQemp(self):
        """Recompute and store the empirical quality from the current counts."""
        newQemp = phredQScore( self.nMismatches(), self.nBases() )
        self.Qemp = newQemp
        return newQemp

    def __str__(self):
        return "rg=%s cycle=%d dinuc=%s qrep=%.1f qemp=%.1f nbases=%d nmismatchs=%d" % ( self.readGroup(), self.cycle(), self.dinuc(), self.qReported(), self.qEmpirical(), self.nBases(), self.nMismatches())
def rawDataStream(file):
    """Yields successive RecalData records parsed from the CSV data file.

    Lines containing '#' are skipped as comments; the header line
    (containing 'rg,') is remembered and used to parse every subsequent
    data line.  (The removed-in-Python-3 '<>' operator is replaced with
    the equivalent '!='.)
    """
    header = None
    for line in open(file):
        if line.find("#") != -1: continue
        else:
            data = line.strip().split(',')
            if line.find("rg,") != -1:
                header = data
            else:
                yield RecalData().parse(header, data)
def rawDataByReadGroup(rawDataFile):
    """Yields (readGroup, records) pairs from rawDataFile, one per read group.

    Groups consecutive rows of the stream by their read group; records is a
    fully materialized list so callers may iterate it more than once.
    """
    grouped = groupby(rawDataStream(rawDataFile), key=RecalData.readGroup)
    for readGroup, records in grouped:
        yield (readGroup, list(records))
def combineRecalData(separateData):
    """Collapse an iterable of RecalData records into one combined record."""
    merged = RecalData()
    return merged.combine(separateData)
def groupRecalData(allData, key=None):
    """Group allData by key and collapse each group into one combined record.

    Returns a list of [groupKey, combinedRecalData] pairs, sorted by group
    key.  The data is sorted before grouping, as itertools.groupby requires.
    """
    ordered = sorted(allData, key=key)
    pairs = []
    for groupKey, members in groupby(ordered, key=key):
        pairs.append([groupKey, combineRecalData(members)])
    return sorted(pairs, key=lambda pair: pair[0])
#
# let's actually analyze the data!
#
def analyzeReadGroup(readGroup, data, outputRoot):
    """Summarize one read group's recalibration data and plot the results.

    Writes three .dat tables rooted at outputRoot (empirical vs. reported
    quality, quality difference by cycle, and by dinucleotide) and hands
    them to analyzeFiles() for R plotting.  With --stdout, the tables are
    printed to standard output instead and no files are produced.
    """
    print 'Read group => ', readGroup
    print 'Number of elements => ', len(data)
    files = []  # paths of the .dat files written below
    if OPTIONS.toStdout:
        # Debug mode: dump all three tables to stdout; nothing to plot.
        qReportedVsqEmpirical(readGroup, data, sys.stdout )
        qDiffByCycle(readGroup, data, sys.stdout)
        qDiffByDinuc(readGroup, data, sys.stdout)
    else:
        def outputFile(tail):
            # Build an output path from the root, remembering it for plotting.
            file = outputRoot + tail
            files.append(file)
            return file
        with open(outputFile(".empirical_v_reported_quality.dat"), 'w') as output:
            qReportedVsqEmpirical(readGroup, data, output )
        with open(outputFile(".quality_difference_v_cycle.dat"), 'w') as output:
            qDiffByCycle(readGroup, data, output)
        with open(outputFile(".quality_difference_v_dinucleotide.dat"), 'w') as output:
            qDiffByDinuc(readGroup, data, output)
    print 'Files', files
    return analyzeFiles(files)
def qDiffByCycle(readGroup, allData, output):
    """Write a table of (empirical - reported) quality per machine cycle."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Cycle Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for cycle, datum in groupRecalData(allData, key=RecalData.cycle):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'pos'], [readGroup, '**', cycle])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%d %2.2f %2.2f %2.2f %12d %12d" % (datum.cycle(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
#
# public void qualityDiffVsDinucleotide(PrintStream file, final String readGroup) {
# ArrayList<RecalData> ByCycle = new ArrayList<RecalData>();
# ArrayList<MeanReportedQuality> ByCycleReportedQ = new ArrayList<MeanReportedQuality>();
# file.printf("dinuc,Qemp-obs,Qemp,Qobs,B,N%n");
# RecalData All = new RecalData(0,0,readGroup,"");
# MeanReportedQuality AllReported = new MeanReportedQuality();
# for (int c=0; c < RecalData.NDINUCS; c++) {
# ByCycle.add(new RecalData(-1, -1,readGroup,RecalData.dinucIndex2bases(c)));
# ByCycleReportedQ.add(new MeanReportedQuality());
# }
#
# for ( RecalData datum: getRecalData(readGroup) ) {
# int dinucIndex = RecalData.dinucIndex(datum.dinuc); //bases2dinucIndex(datum.dinuc.charAt(0), datum.dinuc.charAt(1), false);
# ByCycle.get(dinucIndex).inc(datum.N, datum.B);
# ByCycleReportedQ.get(dinucIndex).inc(datum.qual, datum.N);
# All.inc(datum.N, datum.B);
# AllReported.inc(datum.qual, datum.N);
# }
#
# for (int c=0; c < RecalData.NDINUCS; c++) {
# double empiricalQual = -10 * Math.log10((double)ByCycle.get(c).B / ByCycle.get(c).N);
# double reportedQual = ByCycleReportedQ.get(c).result();
# file.printf("%s, %f, %f, %f, %d, %d%n", ByCycle.get(c).dinuc, empiricalQual-reportedQual, empiricalQual, reportedQual, ByCycle.get(c).B, ByCycle.get(c).N);
# }
# }
def qDiffByDinuc(readGroup, allData, output):
    """Write a table of (empirical - reported) quality per dinucleotide context."""
    print >> output, '# Note Qreported is a float here due to combining Qreported across quality bins -- Qreported is the expected Q across all Q bins, weighted by nBases'
    print >> output, 'Dinuc Qreported Qempirical Qempirical_Qreported nMismatches nBases'
    for dinuc, datum in groupRecalData(allData, key=RecalData.dinuc):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'pos'], [readGroup, dinuc, '*'])
        diff = datum.qEmpirical() - datum.qReported()
        print >> output, "%s %2.2f %2.2f %2.2f %12d %12d" % (datum.dinuc(), datum.qReported(), datum.qEmpirical(), diff, datum.nMismatches(), datum.nBases())
def qReportedVsqEmpirical(readGroup, allData, output):
    """Write a table comparing each reported-quality bin to its empirical quality."""
    print >> output, 'Qreported Qempirical nMismatches nBases'
    for key, datum in groupRecalData(allData, key=RecalData.qReported):
        # Restamp identifying fields that were lost when bins were combined.
        datum.set(['rg', 'dn', 'Qrep', 'pos'], [readGroup, '**', key, '*'])
        print >> output, "%2d %2.2f %12d %12d" % (datum.qReported(), datum.qEmpirical(), datum.nMismatches(), datum.nBases())
def analyzeRawData(rawDataFile):
    """Analyze each read group found in rawDataFile, honoring --readGroup filters.

    Output files are named <source>.<readGroup>.analysis* and written next to
    the input file, or to --dir when given.
    """
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        # An empty selectedReadGroups list means "analyze everything".
        if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
            root, sourceFilename = os.path.split(rawDataFile)
            if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
            outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
            analyzeReadGroup(readGroup, data, outputRoot)
# Maps a filename regex to its analysis: either a callable run directly on
# the matching file, or the name of an [R] config option naming a plot script.
plottersByFile = {
    "raw_data.csv$" : analyzeRawData,
    "recal_data.csv$" : analyzeRawData,
    "empirical_v_reported_quality" : 'PlotQEmpStated',
    "quality_difference_v_dinucleotide" : 'PlotQDiffByDinuc',
    "quality_difference_v_cycle" : 'PlotQDiffByCycle' }

def getPlotterForFile(file):
    """Return the R plotting script configured for file, or None.

    For patterns mapped to a string, the script path is looked up in the
    [R] section of the config.  For patterns mapped to a callable, the
    callable is invoked on the file as a side effect and the scan
    continues, ultimately returning None (callables write their own
    output files).  Uses items() instead of the Python-3-removed
    iteritems(); behavior is identical.
    """
    for pat, analysis in plottersByFile.items():
        if re.search(pat, file):
            if isinstance(analysis, str):
                return config.getOption('R', analysis, 'input_file')
            else:
                analysis(file)
    return None
def analyzeFiles(files):
    """Dispatch each file to its analyzer; R plotters run via farm_commands.

    Files with no matching plotter (getPlotterForFile returns None) are
    skipped.  Uses 'is not None' in place of the Python-3-removed '<>'
    comparison; behavior is identical.
    """
    Rscript = config.getOption('R', 'Rscript', 'input_file')
    for file in files:
        plotter = getPlotterForFile(file)
        if plotter is not None:
            cmd = ' '.join([Rscript, plotter, file])
            farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
if __name__ == "__main__":
    # Command-line driver: parse options, load the GATK config file(s), and
    # analyze every data file listed on the command line.
    usage = """usage: %prog -c config.cfg files*"""
    parser = OptionParser(usage=usage)
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="outputDir",
                      type="string", default=None,
                      help="If provided, analysis output files will be written to this directory")
    parser.add_option("-c", "--config", dest="configs",
                      action="append", type="string", default=[],
                      help="Configuration file")
    parser.add_option("-s", "--stdout", dest="toStdout",
                      action='store_true', default=False,
                      help="If provided, writes output to standard output, not to files")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-g", "--readGroup", dest="selectedReadGroups",
                      action="append", type="string", default=[],
                      help="If provided, only the provided read groups will be analyzed")
    # OPTIONS is read as a global by the analysis functions above.
    (OPTIONS, args) = parser.parse_args()
    #if len(args) != 3:
    #    parser.error("incorrect number of arguments")
    if len(OPTIONS.configs) == 0:
        parser.error("Requires at least one configuration file be provided")
    # config is read as a global by getPlotterForFile/analyzeFiles.
    config = gatkConfigParser(OPTIONS.configs)
    analyzeFiles(args)

View File

@ -0,0 +1,107 @@
#
#
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.bam --OUTPUT_FILEROOT test/NA07048_test --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.raw_data.csv
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta -T TableRecalibration -I NA07048.bam -params test/NA07048_test.raw_data.csv -outputBAM NA07048.test.bam -l INFO -compress 1 -L chr1:1-10,000,000
# samtools index NA07048.test.bam
#
# java -Xmx2048m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -R ~/work/humanref/Homo_sapiens_assembly18.fasta --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -l INFO -T CountCovariates -I NA07048.test.bam --OUTPUT_FILEROOT test/NA07048_test.recal --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -L chr1:1-10,000,000 -collapsePos -collapseDinuc && cat test/NA07048_test.recal.raw_data.csv
#
import farm_commands
import os.path
import sys
from optparse import OptionParser
import picard_utils
from gatkConfigParser import *
import glob
if __name__ == "__main__":
    # Pipeline driver: count covariates on the input BAM, recalibrate it,
    # re-count covariates on the recalibrated BAM, and (with --plot) render
    # before/after quality plots with R.
    # NOTE(review): indentation was lost in the diff rendering; nesting below
    # is reconstructed from the control flow and should be confirmed against
    # the original file.
    usage = """usage: %prog config.cfg* input.bam output.bam"""
    parser = OptionParser(usage=usage)
    parser.add_option("-A", "--args", dest="args",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-C", "--CovariateArgs", dest="CovariateArgs",
                      type="string", default="",
                      help="arguments to GATK")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-d", "--dir", dest="scratchDir",
                      type="string", default="test",
                      help="Output directory")
    parser.add_option("", "--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("", "--plot", dest="plot",
                      action='store_true', default=False,
                      help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                      action='store_true', default=False,
                      help="Ignores already written files, if present")
    (OPTIONS, args) = parser.parse_args()
    #if len(args) != 3:
    #    parser.error("incorrect number of arguments")
    # Positional args: one or more config files, then input.bam output.bam.
    configFiles = args[0:len(args)-2]
    config = gatkConfigParser(configFiles)
    inputBAM = args[len(args)-2]
    outputBAM = args[len(args)-1]
    # Derive scratch-file roots from the output BAM's basename:
    # <scratchDir>/<root>.init.* for pre-recalibration covariates,
    # <scratchDir>/<root>.recal.* for post-recalibration covariates.
    rootname = os.path.split(os.path.splitext(outputBAM)[0])[1]
    covariateRoot = os.path.join(OPTIONS.scratchDir, rootname)
    covariateInitial = covariateRoot + '.init'
    initDataFile = covariateInitial + '.raw_data.csv'
    covariateRecal = covariateRoot + '.recal'
    recalDataFile = covariateRecal + '.raw_data.csv'
    if not os.path.exists(OPTIONS.scratchDir):
        os.mkdir(OPTIONS.scratchDir)

    def covariateCmd(bam, outputDir, ignoreAdds):
        # Build the CountCovariates command line; ignoreAdds suppresses the
        # user's extra -C arguments (used for the post-recalibration pass).
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
        if not ignoreAdds:
            add += " " + OPTIONS.CovariateArgs
        return config.gatkCmd('CountCovariates') + add

    def recalibrateCmd(inputBAM, dataFile, outputBAM):
        # Build the TableRecalibration command line from the covariate table.
        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s" % (inputBAM, dataFile, outputBAM)

    def runCovariateCmd(inputBAM, dataFile, dir, jobid, ignoreAdds = False):
        # Submit CountCovariates to the farm unless its output already exists;
        # returns the new farm job id (or None if the step was skipped).
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
            cmd = covariateCmd(inputBAM, dir, ignoreAdds)
            return farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)

    if OPTIONS.plot:
        # Plot-only mode: run the R scripts over covariate files produced by
        # an earlier pipeline run; nothing is recomputed.
        def plotCmd(cmd):
            farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
        Rscript = config.getOption('R', 'Rscript', 'input_file')
        plotEmpStated = config.getOption('R', 'PlotQEmpStated', 'input_file')
        plotByCycleDinuc = config.getOption('R', 'PlotQByCycleDinuc', 'input_file')
        # Gather per-read-group quality files for both the initial and
        # recalibrated covariate passes.
        empQualFiles = map( lambda x: glob.glob(''.join([x,'.RG_*.empirical_v_reported_quality.csv'])), [covariateInitial, covariateRecal])
        empQualFiles = empQualFiles[0] + empQualFiles[1]
        readGroupRoots = map(lambda x: x.replace(".empirical_v_reported_quality.csv", ""), empQualFiles)
        print readGroupRoots
        for file in empQualFiles:
            plotCmd(' '.join([Rscript, plotEmpStated, file]))
        for root in readGroupRoots:
            plotCmd(' '.join([Rscript, plotByCycleDinuc, root]))
    else:
        # Pipeline mode: chain the farm jobs via waitID so each step waits
        # for its predecessor.
        jobid = None
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid, False)
        if OPTIONS.ignoreExistingFiles or not os.path.exists(outputBAM):
            cmd = recalibrateCmd(inputBAM, initDataFile, outputBAM)
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
            jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        # Re-count covariates on the recalibrated BAM for before/after plots.
        jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid, True)

View File

@ -1,120 +0,0 @@
#!/usr/bin/env python
# Hg18 ref and rods
reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.assembled_and_random_contigs_only.rod.out"
## 1KG ref and rods
reference = "/broad/1KG/reference/human_b36_both.fasta"
dbsnp = "/humgen/gsa-scr1/GATK_Data/dbsnp.1kg.rod.out"
# Flags - many of these are not necessary now but were used for debugging
training_range_str = "" #" -L chr1"
gatk_flags = ""
downsample_fraction = "" #" --DOWNSAMPLE_FRACTION 0.01"
read_group = "" #" --READ_GROUP SRR005814"
min_mapping_quality = " --MIN_MAPPING_QUALITY 1"
recal_limit = 0
javaexe = "java -ea"
justPrintCommands = False
import os, sys
sys.path.append(".")
if len(sys.argv) > 1:
#from recal_args import *
#print "Using args imported from recal_args.py"
#else:
#from default_broad_recal_args import *
#print "Using default Broad recalibration arguments"
file = sys.argv[1]
fileroot = file
if file.endswith(".bam"):
fileroot = fileroot.replace(".bam", "")
fileroot = os.path.basename(fileroot)
print "fileroot: "+fileroot
#training_range_str = " -L chr1:1-20000000"
plot = False
recal_only = False
for arg in sys.argv:
if arg == "-plot":
plot = True
if arg == "-recal_only":
recal_only = True
def cmd(cmdstr):
print ">>> Running: "+cmdstr
if (not justPrintCommands):
ret_val = str(os.system(cmdstr))
print "<<< Returned: "+str(ret_val)
if int(ret_val) != 0:
sys.exit("Last command failed: "+cmdstr)
# Assure that BAM index is available
def assert_bam_index(bam_filename):
if not os.path.exists(bam_filename+".bai"):
cmd("samtools index "+bam_filename)
def ensure_dir(dir):
if not os.path.exists(dir):
os.makedirs(dir)
# Process arguments
if recal_limit == 0:
recal_limit_str = ""
else:
recal_limit_str = "_"+str(recal_limit)
# This was the central directory where I put the files, I've changed it to "."
# for simplicity now even though it probably needs to be read group aware
#covar_path = "/humgen/gsa-scr1/andrewk/recalibration_work"
covar_path = "."
central_fileroot= covar_path+"/"+fileroot
bam = central_fileroot+".bam"
if not os.path.lexists(bam):
os.symlink(file, bam)
correction_str = ".corrected"+recal_limit_str
corrected_fileroot = central_fileroot+correction_str
corrected_bam = corrected_fileroot+".bam"
assert_bam_index(bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+bam+" --OUTPUT_FILEROOT "+central_fileroot+" -l INFO --CREATE_TRAINING_DATA"+training_range_str+downsample_fraction+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+central_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+central_fileroot)
cmd("python ~andrewk/dev/Sting/trunk/python/LogRegression.py "+central_fileroot)
if recal_only: # Stop if we are only making recalibration tables
sys.exit()
cmd(javaexe+" -jar ~/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T LogisticRecalibration -I "+bam+" -R "+reference+" --logisticparamsfile "+central_fileroot+".recalibration_table --outputbamfile "+corrected_bam+" -l DEBUG -M "+str(recal_limit)+gatk_flags)
cmd("samtools index "+corrected_bam)
cmd(javaexe+" -jar ~andrewk/dev/Sting/trunk/dist/GenomeAnalysisTK.jar -T CountCovariates -R "+reference+" --DBSNP "+dbsnp+" -mqs 40 -I "+corrected_bam+" --OUTPUT_FILEROOT "+corrected_fileroot+" -l INFO"+gatk_flags+read_group+min_mapping_quality)
# Make plots
if plot:
cmd("~andrewk/covariates/plot_q_emp_stated_hst.R "+corrected_fileroot+".empirical_v_reported_quality.csv")
cmd("~andrewk/covariates/plot_qual_diff_v_cycle_dinuc.R "+corrected_fileroot)
# Merge corrected and uncorrected files
output_dir = "comparison_png"
ensure_dir(output_dir)
corrected_pngs = [f for f in os.listdir(".") if f.endswith(".png") and "corrected" in f]
print "\n".join(corrected_pngs)
for corrected_png in corrected_pngs:
raw_png = corrected_png.replace(correction_str, "")
both_png = corrected_png.replace(correction_str, "raw_and_corrected")
cmd(" ".join(["montage", raw_png, corrected_png, "-geometry +0+0", output_dir+"/"+both_png]))

View File

@ -10,14 +10,17 @@ gatkData: /humgen/gsa-scr1/GATK_Data
dbsnp: %(gatkData)s/dbsnp_129_hg18.rod
tmp: /tmp/
args: -l INFO
Sting: ~/dev/GenomeAnalysisTK/trunk
#args: -l INFO -L chr1:1-1,000,000
[CountCovariates]
args: --CREATE_TRAINING_DATA --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s
args: --MIN_MAPPING_QUALITY 1 -D %(dbsnp)s
[TableRecalibration]
args: -compress 1
[R]
Rscript: /broad/tools/apps/R-2.6.0/bin/Rscript
PlotQEmpStated: ~andrewk/recalQual_resources/plot_q_emp_stated_hst.R
PlotQEmpStated: %(Sting)s/R/plot_q_emp_stated_hst.R
PlotQDiffByCycle: %(Sting)s/R/plot_qual_diff_v_cycle.R
PlotQDiffByDinuc: %(Sting)s/R/plot_qual_diff_v_dinuc.R