From f5b00c20d0c3ab7f7ce9c82b5a4ac3697da8c753 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Tue, 7 Jul 2009 14:15:39 +0000
Subject: [PATCH] Updated python files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1182 348d0f76-0448-11de-a6fe-93d51630548a
---
 R/plot_q_emp_stated_hst.R          |  14 +++-
 doc/ReadQualityRecalibrator/README | 113 -----------------------------
 python/Gelis2PopSNPs.py            |  15 ++--
 python/analyzeRecalQuals.py        |  28 ++++---
 python/easyRecalQuals.py           |  18 ++---
 python/farm_commands.py            |   2 +-
 python/gatkConfigParser.py         |   9 ++-
 python/picard_utils.py             |   2 +-
 8 files changed, 59 insertions(+), 142 deletions(-)
 delete mode 100644 doc/ReadQualityRecalibrator/README

diff --git a/R/plot_q_emp_stated_hst.R b/R/plot_q_emp_stated_hst.R
index 468b619d0..f020f944d 100755
--- a/R/plot_q_emp_stated_hst.R
+++ b/R/plot_q_emp_stated_hst.R
@@ -15,7 +15,7 @@ pdf(outfile, height=7, width=7)
 d.good <- t[t$nMismatches >= 1000,]
 d.100 <- t[t$nMismatches < 100,]
 d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
-plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
+plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
 points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
 points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
 abline(0,1, lty=2)
@@ -26,6 +26,16 @@ dev.off()
 outfile = paste(input, ".quality_emp_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
-plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
+plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
+
+#
+# Plot Q reported histogram
+#
+outfile = paste(input, ".quality_rep_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0)
+plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n")
 axis(2,axTicks(2), format(axTicks(2), scientific=F))
 dev.off()
diff --git a/doc/ReadQualityRecalibrator/README b/doc/ReadQualityRecalibrator/README
deleted file mode 100644
index cbc1605d6..000000000
--- a/doc/ReadQualityRecalibrator/README
+++ /dev/null
@@ -1,113 +0,0 @@
-Read Quality Recalibrator
--------------------------
-The tools in this package recalibrate quality scores of 
-Illumina reads in an aligned BAM file. After recalibration, 
-the quality scores in the QUAL field in each Illumina read 
-in the output BAM are accurate in that the reported quality 
-score is equal to its actual probability of mismatching.  
-This is process is accomplished by analyzing the covariation 
-between machine reported quality scores and 
-
-1) the position within the read, and 
-2) the preceding nucleotide (sequencing chemistry effect).  
-
-The aligned reads have their dbSNP sites masked out, and the 
-mismatched bases are used as a metric for the true error rate 
-of the system.  The error rate at different dinucleotides and 
-positions in the read is then fed into a logistic regression 
-system which outputs a correction factor for each of those 
-combinations which are then use to output a recalibrated BAM 
-file. 
-
-Software Dependencies
----------------------
-The recalibrator currently depends on the following
-applications.  Please install these before proceeding.
-
-- Java Runtime Environment 1.6.0_12 or later
-- Python 2.4.2 or later
-- R 2.6.0 or later
-- samtools 0.1.4 or later.
-
-Please update the file paths at the top of 
-RecalQual.py to point to the local installations of the
-software listed above.
-
-Running
--------
-Before running the tool, please update the file paths
-listed at the top of RecalQual.py to point to the
-local installations of the tools listed above.
-
-The recalibrator has two modes: recalibration and
-evaluation, which can be run separately or jointly.
-By default, the recalibrator will recalibrate only.
-
-To calibrate a given bam file, run the following 
-command:
-
-python RecalQual.py <source bam> <recalibrated bam>
-
-After recalibration, performing evaluation will walk 
-through the source BAM file again, remeasuring the 
-differences between empirical vs. reported quality.  
-The results of evaluation are comma-delimited text 
-files (.csv) and graphs (.png) which can be found 
-in the output directory (see below) for the given run.
-A successful recalibration should produce a plot of 
-empirical vs. observed qualities that shows little
-difference between the two values.
-
-To both recalibrate and evaluate, execute:
-
-python RecalQual.py --recalibrate --evaluate <source bam> <recalibrated bam>
-
-To (only) evaluate a given bam file after calibrating:
-
-python RecalQual.py --evaluate <source bam> <recalibrated bam>
-
-Platforms
----------
-By default, the recalibrator processes only read groups
-originating from Illumina sequencers.  To enable calibration
-for other platforms, edit the 'platforms' array at the
-top of RecalQual.py.  Platforms specified here should
-case-insensitive match the "PL" attribute of the read
-group in the BAM file.
-
-Output
-------
-The recalibration process keeps many incremental
-files around for future analysis.  By default, all 
-of these files will be grouped into the directory 
-'output.<source bam>/'.  By default, this directory
-will be created within the working directory.  This
-location can be changed by editing the output_root
-variable at the top of RecalQual.py.
-
-It is safe to delete this supplemental output 
-directory at any time.
-
-Known Issues
-------------
-- The recalibrator places severe memory demands on
-  files with large numbers of read groups.
-- If running in 'evaluation' mode (see the 'Running'
-  section above), X11 is required to generate the 
-  graphs.  If running on a machine via ssh, be certain
-  to enable X tunnelling.
-
-Troubleshooting
----------------
-- The memory requirements of the recalibrator will 
-  vary based on the type of JVM running the application 
-  and the number of read groups in the input bam file.  
-  If the application reports 'java.lang.OutOfMemoryError:
-  Java heap space', increase the max heap size provided
-  to the JVM by adding ' -Xmx????m' to the jvm_args
-  variable in RecalQual.py, where '????' is the maximum
-  available memory on the processing computer.
-
-Support
--------
-For support, please email gsadevelopers@broad.mit.edu.
diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py
index 89246ddad..b4da5b201 100755
--- a/python/Gelis2PopSNPs.py
+++ b/python/Gelis2PopSNPs.py
@@ -13,7 +13,10 @@ ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
    
 def geli2dbsnpFile(geli):
     root, flowcellDotlane, ext = picard_utils.splitPath(geli)
-    return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
+    if ext == ".calls":
+        return None
+    else:
+        return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
 
 def SingleSampleGenotyperCmd(bam, geli, use2bp):
     naid = os.path.split(bam)[1].split(".")[0]
@@ -49,7 +52,7 @@ def gelis2gelisText( gelis ):
         if os.path.splitext(maybeGeli)[1] == ".calls" :
             return maybeGeli
         else:
-            return os.path.split(geli)[1] + '.calls'
+            return os.path.split(maybeGeli)[1] + '.calls'
             
     return map( geli2geliText, gelis)
 
@@ -84,7 +87,7 @@ def main():
     (OPTIONS, args) = parser.parse_args()
     if len(args) != 2:
         parser.error("incorrect number of arguments: " + str(args))
-    lines = [line.split() for line in open(args[0])]
+    lines = [line.strip().split() for line in open(args[0])]
     nIndividuals = int(args[1])
     
     outputFile = open(OPTIONS.output, 'w')
@@ -108,7 +111,7 @@ def main():
 
     for geli, jobid in zip(gelis, jobids):
         dbsnpFile = geli2dbsnpFile(geli)
-        if not os.path.exists(dbsnpFile):
+        if dbsnpFile == None and not os.path.exists(dbsnpFile):
             dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
             if jobid == 0: jobid = None
             farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
@@ -151,14 +154,14 @@ def main():
     sortedCalls = [line.split() for line in open(sortedCallFile)]
     aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
     
-    print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
+    print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'
 
     for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
         if snp == None: continue    # ignore bad calls
         #print snp
         #sharedCalls = list(sharedCallsGroup)
         #genotype = list(sharedCalls[0][5])
-        print >> outputFile, '%s   %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
+        print >> outputFile, '%s   %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
     outputFile.close()
     
                 
diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py
index d466ba0fa..3c3be4738 100755
--- a/python/analyzeRecalQuals.py
+++ b/python/analyzeRecalQuals.py
@@ -10,7 +10,7 @@ from itertools import *
 import math
 import operator
 
-MAX_QUAL_SCORE = 50
+MAX_QUAL_SCORE = 40
 
 def phredQScore( nMismatches, nBases ):
     """Calculates a phred-scaled score for nMismatches in nBases"""
@@ -293,19 +293,25 @@ def qReportedVsqEmpiricalStream(readGroup, data):
         yield key, datum
 
 def qReportedVsqEmpirical(readGroup, allData, output):
-    print >> output, 'Qreported    Qempirical   nMismatches     nBases'
+    print >> output, 'Qreported    Qempirical   nMismatches     nBases      PercentBases'
+    rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0]
     for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
         #if datum.qReported() > 35:
         #    print datum
-        print >> output, "%2.2f  %2.2f   %12d    %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
+        print >> output, "%2.2f  %2.2f   %12d    %12d       %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases()))
 
 def analyzeRawData(rawDataFile):
+    nReadGroups = 0
     for readGroup, data in rawDataByReadGroup(rawDataFile):
         if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
-            root, sourceFilename = os.path.split(rawDataFile)
-            if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
-            outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
-            analyzeReadGroup(readGroup, data, outputRoot)
+            nReadGroups += 1
+            if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1:
+                break
+            else:
+                root, sourceFilename = os.path.split(rawDataFile)
+                if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
+                outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
+                analyzeReadGroup(readGroup, data, outputRoot)
 
 plottersByFile = {
     "raw_data.csv$" : analyzeRawData,
@@ -344,6 +350,9 @@ def main():
     parser.add_option("-d", "--dir", dest="outputDir",
                         type="string", default=None,
                         help="If provided, analysis output files will be written to this directory")
+    parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups",
+                        type="int", default=-1,
+                        help="Maximum number of read groups to process.  The default of -1 indicates that all read groups will be processed")
     parser.add_option("-c", "--config", dest="configs",
                         action="append", type="string", default=[],
                         help="Configuration file")                        
@@ -371,7 +380,8 @@ def main():
         parser.error("Requires at least one configuration file be provided")
 
     config = gatkConfigParser(OPTIONS.configs)
-  
+
+    if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups
     analyzeFiles(args)
 
 import unittest
@@ -400,4 +410,4 @@ class TestanalzyeRecalQuals(unittest.TestCase):
 if __name__ == '__main__':
     main()    
     #unittest.main()    
-    
\ No newline at end of file
+    
diff --git a/python/easyRecalQuals.py b/python/easyRecalQuals.py
index 62f7c27c3..0343744b7 100755
--- a/python/easyRecalQuals.py
+++ b/python/easyRecalQuals.py
@@ -13,7 +13,7 @@ if __name__ == "__main__":
                         type="string", default="",
                         help="arguments to GATK")
     parser.add_option("-m", "--mode", dest="RecalMode",
-                        type="string", default="",
+                        type="string", default="SEQUENTIAL",
                         help="Mode argument to provide to table recalibrator")
     parser.add_option("-q", "--farm", dest="farmQueue",
                         type="string", default=None,
@@ -22,14 +22,14 @@ if __name__ == "__main__":
                         action="append", type="string", default=[],
                         help="Configuration file")                        
     parser.add_option("-d", "--dir", dest="scratchDir",
-                        type="string", default="test",
+                        type="string", default="./",
                         help="Output directory")
+    parser.add_option("-w", "--wait", dest="initialWaitID",
+                        type="string", default=None,
+                        help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
     parser.add_option("", "--dry", dest="dry",
                         action='store_true', default=False,
                         help="If provided, nothing actually gets run, just a dry run")
-    parser.add_option("", "--plot", dest="plot",
-                        action='store_true', default=False,
-                        help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files")
     parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                         action='store_true', default=False,
                         help="Ignores already written files, if present")
@@ -54,10 +54,10 @@ if __name__ == "__main__":
     
     def covariateCmd(bam, outputDir):
         add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
-        return config.gatkCmd('CountCovariates') + add 
+        return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add 
 
     def recalibrateCmd(inputBAM, dataFile, outputBAM):
-        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
+        return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
 
     def runCovariateCmd(inputBAM, dataFile, dir, jobid):
         if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
@@ -67,7 +67,7 @@ if __name__ == "__main__":
     #
     # Actually do some work here
     #
-    jobid = None
+    jobid = OPTIONS.initialWaitID
     if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): 
         jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
     
@@ -77,4 +77,4 @@ if __name__ == "__main__":
         jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
         
     jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)
-        
\ No newline at end of file
+        
diff --git a/python/farm_commands.py b/python/farm_commands.py
index 883f7776b..07e60d5a3 100644
--- a/python/farm_commands.py
+++ b/python/farm_commands.py
@@ -22,7 +22,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
             cmd_str += " -o " + farm_stdout
 
         if waitID <> None:
-            cmd_str += " -w \"done(%s)\"" % (str(waitID))
+            cmd_str += " -w \"ended(%s)\"" % (str(waitID))
 
         cmd_str += " \""+cmd_str_from_user + "\""
         
diff --git a/python/gatkConfigParser.py b/python/gatkConfigParser.py
index 8e6367356..5cd3796e1 100755
--- a/python/gatkConfigParser.py
+++ b/python/gatkConfigParser.py
@@ -71,9 +71,16 @@ class gatkConfigParser(ConfigParser.SafeConfigParser):
     def gatk_args(self): return self.getOption(self.GATK, 'args')
     def reference(self): return self.getOption(self.GATK, 'reference')
     
-    def gatkCmd(self, mode):
+    def gatkCmd(self, mode, log = None, stdLogName=False):
         cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()])
         cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)])
+        if log <> None:
+            if stdLogName:
+                #head, ext = os.path.splitext(log)
+                logName = log + "." + mode + ".log"
+            else:
+                logName = log
+            cmd += ' ' + ' '.join(['-log', logName])
         return cmd
         
 import unittest
diff --git a/python/picard_utils.py b/python/picard_utils.py
index 56fadd162..e42d7a8b5 100755
--- a/python/picard_utils.py
+++ b/python/picard_utils.py
@@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
         MSDStr = ''
         if MSD: MSDStr = 'MSD=true'
     
-        return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
+        return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
         #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
 
 def getPicardPath(lane, picardRoot = '/seq/picard/'):