Updated python files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1182 348d0f76-0448-11de-a6fe-93d51630548a
2009-07-07 14:15:39 +00:00 · 2009-07-07 14:15:39 +00:00 · f5b00c20d0
parent f6a273a537
commit f5b00c20d0
8 changed files with 59 additions and 142 deletions
--- a/R/plot_q_emp_stated_hst.R
+++ b/R/plot_q_emp_stated_hst.R
@ -15,7 +15,7 @@ pdf(outfile, height=7, width=7)
 d.good <- t[t$nMismatches >= 1000,]
 d.100 <- t[t$nMismatches < 100,]
 d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
-plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
+plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,63), ylim=c(0,63), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
 points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
 points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
 abline(0,1, lty=2)
@ -26,6 +26,16 @@ dev.off()
 outfile = paste(input, ".quality_emp_hist.pdf", sep="")
 pdf(outfile, height=7, width=7)
 hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
-plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
+plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Empirical quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
+axis(2,axTicks(2), format(axTicks(2), scientific=F))
+dev.off()
+
+#
+# Plot Q reported histogram
+#
+outfile = paste(input, ".quality_rep_hist.pdf", sep="")
+pdf(outfile, height=7, width=7)
+hst=subset(data.frame(t$Qreported, t$nBases), t.nBases != 0)
+plot(hst$t.Qreported, hst$t.nBases, type="h", lwd=3, xlim=c(0,63), main="Reported quality score histogram", xlab="Qreported quality score", ylab="Count", yaxt="n")
 axis(2,axTicks(2), format(axTicks(2), scientific=F))
 dev.off()
--- a/doc/ReadQualityRecalibrator/README
+++ b/doc/ReadQualityRecalibrator/README
@ -1,113 +0,0 @@
-Read Quality Recalibrator
-------------------------
-The tools in this package recalibrate quality scores of 
-Illumina reads in an aligned BAM file. After recalibration, 
-the quality scores in the QUAL field in each Illumina read 
-in the output BAM are accurate in that the reported quality 
-score is equal to its actual probability of mismatching.  
-This is process is accomplished by analyzing the covariation 
-between machine reported quality scores and 
-
-1) the position within the read, and 
-2) the preceding nucleotide (sequencing chemistry effect).  
-
-The aligned reads have their dbSNP sites masked out, and the 
-mismatched bases are used as a metric for the true error rate 
-of the system.  The error rate at different dinucleotides and 
-positions in the read is then fed into a logistic regression 
-system which outputs a correction factor for each of those 
-combinations which are then use to output a recalibrated BAM 
-file. 
-
-Software Dependencies
---------------------
-The recalibrator currently depends on the following
-applications.  Please install these before proceeding.
-
- Java Runtime Environment 1.6.0_12 or later
- Python 2.4.2 or later
- R 2.6.0 or later
- samtools 0.1.4 or later.
-
-Please update the file paths at the top of 
-RecalQual.py to point to the local installations of the
-software listed above.
-
-Running
-------
-Before running the tool, please update the file paths
-listed at the top of RecalQual.py to point to the
-local installations of the tools listed above.
-
-The recalibrator has two modes: recalibration and
-evaluation, which can be run separately or jointly.
-By default, the recalibrator will recalibrate only.
-
-To calibrate a given bam file, run the following 
-command:
-
-python RecalQual.py <source bam> <recalibrated bam>
-
-After recalibration, performing evaluation will walk 
-through the source BAM file again, remeasuring the 
-differences between empirical vs. reported quality.  
-The results of evaluation are comma-delimited text 
-files (.csv) and graphs (.png) which can be found 
-in the output directory (see below) for the given run.
-A successful recalibration should produce a plot of 
-empirical vs. observed qualities that shows little
-difference between the two values.
-
-To both recalibrate and evaluate, execute:
-
-python RecalQual.py --recalibrate --evaluate <source bam> <recalibrated bam>
-
-To (only) evaluate a given bam file after calibrating:
-
-python RecalQual.py --evaluate <source bam> <recalibrated bam>
-
-Platforms
---------
-By default, the recalibrator processes only read groups
-originating from Illumina sequencers.  To enable calibration
-for other platforms, edit the 'platforms' array at the
-top of RecalQual.py.  Platforms specified here should
-case-insensitive match the "PL" attribute of the read
-group in the BAM file.
-
-Output
------
-The recalibration process keeps many incremental
-files around for future analysis.  By default, all 
-of these files will be grouped into the directory 
-'output.<source bam>/'.  By default, this directory
-will be created within the working directory.  This
-location can be changed by editing the output_root
-variable at the top of RecalQual.py.
-
-It is safe to delete this supplemental output 
-directory at any time.
-
-Known Issues
------------
- The recalibrator places severe memory demands on
-  files with large numbers of read groups.
- If running in 'evaluation' mode (see the 'Running'
-  section above), X11 is required to generate the 
-  graphs.  If running on a machine via ssh, be certain
-  to enable X tunnelling.
-
-Troubleshooting
---------------
- The memory requirements of the recalibrator will 
-  vary based on the type of JVM running the application 
-  and the number of read groups in the input bam file.  
-  If the application reports 'java.lang.OutOfMemoryError:
-  Java heap space', increase the max heap size provided
-  to the JVM by adding ' -Xmx????m' to the jvm_args
-  variable in RecalQual.py, where '????' is the maximum
-  available memory on the processing computer.
-
-Support
-------
-For support, please email gsadevelopers@broad.mit.edu.
--- a/python/Gelis2PopSNPs.py
+++ b/python/Gelis2PopSNPs.py
@ -13,7 +13,10 @@ ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
   
 def geli2dbsnpFile(geli):
    root, flowcellDotlane, ext = picard_utils.splitPath(geli)
-    return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'
+    if ext == ".calls":
+        return None
+    else:
+        return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'

 def SingleSampleGenotyperCmd(bam, geli, use2bp):
    naid = os.path.split(bam)[1].split(".")[0]
@ -49,7 +52,7 @@ def gelis2gelisText( gelis ):
        if os.path.splitext(maybeGeli)[1] == ".calls" :
            return maybeGeli
        else:
-            return os.path.split(geli)[1] + '.calls'
+            return os.path.split(maybeGeli)[1] + '.calls'
            
    return map( geli2geliText, gelis)

@ -84,7 +87,7 @@ def main():
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 2:
        parser.error("incorrect number of arguments: " + str(args))
-    lines = [line.split() for line in open(args[0])]
+    lines = [line.strip().split() for line in open(args[0])]
    nIndividuals = int(args[1])
    
    outputFile = open(OPTIONS.output, 'w')
@ -108,7 +111,7 @@ def main():

    for geli, jobid in zip(gelis, jobids):
        dbsnpFile = geli2dbsnpFile(geli)
-        if not os.path.exists(dbsnpFile):
+        if dbsnpFile == None and not os.path.exists(dbsnpFile):
            dbsnpCmd = picard_utils.CollectDbSnpMatchesCmd(geli, dbsnpFile, OPTIONS.lod)
            if jobid == 0: jobid = None
            farm_commands.cmd(dbsnpCmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry, waitID = jobid)
@ -151,14 +154,14 @@ def main():
    sortedCalls = [line.split() for line in open(sortedCallFile)]
    aggregratedCalls = picard_utils.aggregateGeliCalls(sortedCalls)
    
-    print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom individuals'
+    print >> outputFile, 'loc ref alt EM_alt_freq discovery_likelihood discovery_null discovery_prior discovery_lod EM_N n_ref n_het n_hom'

    for snp in map( lambda x: picard_utils.aggregatedGeliCalls2SNP(x, nIndividuals), aggregratedCalls ):
        if snp == None: continue    # ignore bad calls
        #print snp
        #sharedCalls = list(sharedCallsGroup)
        #genotype = list(sharedCalls[0][5])
-        print >> outputFile, '%s   %s %s %.6f -420.0 -420.0 0.000000 100.0 %d %d %d %d NA' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
+        print >> outputFile, '%s   %s %s %.6f -0.0 -0.0 0.000000 100.0 %d %d %d %d' % (snp.loc, snp.ref, snp.alt(), snp.q(), nIndividuals, snp.nRefGenotypes(), snp.nHetGenotypes(), snp.nHomVarGenotypes())
    outputFile.close()
    
                
--- a/python/analyzeRecalQuals.py
+++ b/python/analyzeRecalQuals.py
@ -10,7 +10,7 @@ from itertools import *
 import math
 import operator

-MAX_QUAL_SCORE = 50
+MAX_QUAL_SCORE = 40

 def phredQScore( nMismatches, nBases ):
    """Calculates a phred-scaled score for nMismatches in nBases"""
@ -293,19 +293,25 @@ def qReportedVsqEmpiricalStream(readGroup, data):
        yield key, datum

 def qReportedVsqEmpirical(readGroup, allData, output):
-    print >> output, 'Qreported    Qempirical   nMismatches     nBases'
+    print >> output, 'Qreported    Qempirical   nMismatches     nBases      PercentBases'
+    rg, allBases = groupRecalData(allData, key=RecalData.readGroup)[0]
    for key, datum in qReportedVsqEmpiricalStream(readGroup, allData):
        #if datum.qReported() > 35:
        #    print datum
-        print >> output, "%2.2f  %2.2f   %12d    %12d" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases())
+        print >> output, "%2.2f  %2.2f   %12d    %12d       %.2f" % (datum.qReported(), datum.qEmpirical(), datum.getNMismatches(), datum.getNBases(), 100.0*datum.getNBases() / float(allBases.getNBases()))

 def analyzeRawData(rawDataFile):
+    nReadGroups = 0
    for readGroup, data in rawDataByReadGroup(rawDataFile):
        if OPTIONS.selectedReadGroups == [] or readGroup in OPTIONS.selectedReadGroups:
-            root, sourceFilename = os.path.split(rawDataFile)
-            if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
-            outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
-            analyzeReadGroup(readGroup, data, outputRoot)
+            nReadGroups += 1
+            if nReadGroups > OPTIONS.maxReadGroups and OPTIONS.maxReadGroups <> -1:
+                break
+            else:
+                root, sourceFilename = os.path.split(rawDataFile)
+                if ( OPTIONS.outputDir ): root = OPTIONS.outputDir
+                outputRoot = os.path.join(root, "%s.%s.%s" % ( sourceFilename, readGroup, 'analysis' ))
+                analyzeReadGroup(readGroup, data, outputRoot)

 plottersByFile = {
    "raw_data.csv$" : analyzeRawData,
@ -344,6 +350,9 @@ def main():
    parser.add_option("-d", "--dir", dest="outputDir",
                        type="string", default=None,
                        help="If provided, analysis output files will be written to this directory")
+    parser.add_option("-m", "--maxReadGroups", dest="maxReadGroups",
+                        type="int", default=-1,
+                        help="Maximum number of read groups to process.  The default of -1 indicates that all read groups will be processed")
    parser.add_option("-c", "--config", dest="configs",
                        action="append", type="string", default=[],
                        help="Configuration file")                        
@ -371,7 +380,8 @@ def main():
        parser.error("Requires at least one configuration file be provided")

    config = gatkConfigParser(OPTIONS.configs)
-  
+
+    if OPTIONS.selectedReadGroups <> []: print 'Analyzing only the following read groups', OPTIONS.selectedReadGroups
    analyzeFiles(args)

 import unittest
@ -400,4 +410,4 @@ class TestanalzyeRecalQuals(unittest.TestCase):
 if __name__ == '__main__':
    main()    
    #unittest.main()    
-    
+    
--- a/python/easyRecalQuals.py
+++ b/python/easyRecalQuals.py
@ -13,7 +13,7 @@ if __name__ == "__main__":
                        type="string", default="",
                        help="arguments to GATK")
    parser.add_option("-m", "--mode", dest="RecalMode",
-                        type="string", default="",
+                        type="string", default="SEQUENTIAL",
                        help="Mode argument to provide to table recalibrator")
    parser.add_option("-q", "--farm", dest="farmQueue",
                        type="string", default=None,
@ -22,14 +22,14 @@ if __name__ == "__main__":
                        action="append", type="string", default=[],
                        help="Configuration file")                        
    parser.add_option("-d", "--dir", dest="scratchDir",
-                        type="string", default="test",
+                        type="string", default="./",
                        help="Output directory")
+    parser.add_option("-w", "--wait", dest="initialWaitID",
+                        type="string", default=None,
+                        help="If providedm the first job dispatched to LSF will use this id as it ended() prerequisite")
    parser.add_option("", "--dry", dest="dry",
                        action='store_true', default=False,
                        help="If provided, nothing actually gets run, just a dry run")
-    parser.add_option("", "--plot", dest="plot",
-                        action='store_true', default=False,
-                        help="If provided, will call R to generate convenient plots about the Q scores of the pre and post calibrated files")
    parser.add_option("-i", "--ignoreExistingFiles", dest="ignoreExistingFiles",
                        action='store_true', default=False,
                        help="Ignores already written files, if present")
@ -54,10 +54,10 @@ if __name__ == "__main__":
    
    def covariateCmd(bam, outputDir):
        add = " -I %s --OUTPUT_FILEROOT %s" % (bam, outputDir)
-        return config.gatkCmd('CountCovariates') + add 
+        return config.gatkCmd('CountCovariates', log=outputDir, stdLogName=True) + add 

    def recalibrateCmd(inputBAM, dataFile, outputBAM):
-        return config.gatkCmd('TableRecalibration') + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)
+        return config.gatkCmd('TableRecalibration', log=outputBAM, stdLogName=True) + " -I %s -params %s -outputBAM %s -mode %s" % (inputBAM, dataFile, outputBAM, OPTIONS.RecalMode)

    def runCovariateCmd(inputBAM, dataFile, dir, jobid):
        if OPTIONS.ignoreExistingFiles or not os.path.exists(dataFile):
@ -67,7 +67,7 @@ if __name__ == "__main__":
    #
    # Actually do some work here
    #
-    jobid = None
+    jobid = OPTIONS.initialWaitID
    if OPTIONS.ignoreExistingFiles or not os.path.exists(initDataFile): 
        jobid = runCovariateCmd(inputBAM, initDataFile, covariateInitial, jobid)
    
@ -77,4 +77,4 @@ if __name__ == "__main__":
        jobid = farm_commands.cmd('samtools index ' + outputBAM, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry, waitID = jobid)
        
    jobid = runCovariateCmd(outputBAM, recalDataFile, covariateRecal, jobid)
-        
+        
--- a/python/farm_commands.py
+++ b/python/farm_commands.py
@ -22,7 +22,7 @@ def cmd(cmd_str_from_user, farm_queue=False, output_head=None, just_print_comman
            cmd_str += " -o " + farm_stdout

        if waitID <> None:
-            cmd_str += " -w \"done(%s)\"" % (str(waitID))
+            cmd_str += " -w \"ended(%s)\"" % (str(waitID))

        cmd_str += " \""+cmd_str_from_user + "\""
        
--- a/python/gatkConfigParser.py
+++ b/python/gatkConfigParser.py
@ -71,9 +71,16 @@ class gatkConfigParser(ConfigParser.SafeConfigParser):
    def gatk_args(self): return self.getOption(self.GATK, 'args')
    def reference(self): return self.getOption(self.GATK, 'reference')
    
-    def gatkCmd(self, mode):
+    def gatkCmd(self, mode, log = None, stdLogName=False):
        cmd = ' '.join([self.java(), self.jvm_args(), '-jar', self.jar(), self.gatk_args(), '-R', self.reference()])
        cmd += ' ' + ' '.join(['-T', mode, self.getGATKModeOption('args', mode)])
+        if log <> None:
+            if stdLogName:
+                #head, ext = os.path.splitext(log)
+                logName = log + "." + mode + ".log"
+            else:
+                logName = log
+            cmd += ' ' + ' '.join(['-log', logName])
        return cmd
        
 import unittest
--- a/python/picard_utils.py
+++ b/python/picard_utils.py
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
        MSDStr = ''
        if MSD: MSDStr = 'MSD=true'
    
-        return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
+        return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + str(compression_level) + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
        #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))

 def getPicardPath(lane, picardRoot = '/seq/picard/'):