Now supports resume and dry runningRecalQual.py

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@996 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-11 23:31:59 +00:00 · 2009-06-11 23:31:59 +00:00 · 1fb241a8b8
parent 4eda040e0f
commit 1fb241a8b8
2 changed files with 96 additions and 38 deletions
--- a/python/Gelis2PopSNPs.py
+++ b/python/Gelis2PopSNPs.py
@ -15,13 +15,25 @@ def geli2dbsnpFile(geli):
    root, flowcellDotlane, ext = picard_utils.splitPath(geli)
    return os.path.join(root, flowcellDotlane) + '.dbsnp_matches'

-   
+def SingleSampleGenotyperCmd(bam, geli, use2bp):
+    naid = bam.split(".")[0]
+    metrics = geli + '.metrics'
+    gatkPath = '/humgen/gsa-scr1/kiran/repositories/Sting/trunk/dist/GenomeAnalysisTK.jar'
+    hapmapChip = '/home/radon01/andrewk/hapmap_1kg/gffs/' + naid + '.gff'
+    targetList = '/home/radon01/depristo/work/1kg_pilot_evaluation/data/thousand_genomes_alpha_redesign.targets.interval_list'
+    cmd = "java -ea -jar " + gatkPath + ' ' + ' '.join(['-T SingleSampleGenotyper', '-I', bam, '-L', targetList, '-R', ref, '-D', '/humgen/gsa-scr1/GATK_Data/dbsnp.rod.out', '--hapmap_chip', hapmapChip, '-calls', geli, '-met', metrics, '-geli -fb -l INFO'])
+    return cmd
+ 
 def bams2geli(bams):
    def call1(bam):
        geli = os.path.splitext(bam)[0] + '.geli'
        jobid = 0
-        if not os.path.exists(geli):
-            cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
+        if OPTIONS.useSSG:
+            if not os.path.exists(geli + '.calls'):
+                cmd = SingleSampleGenotyperCmd(bam, geli + '.calls', OPTIONS.useSSG2b)
+        else:
+            if not os.path.exists(geli):
+                cmd = picard_utils.callGenotypesCmd( bam, geli, options = picard_utils.hybridSelectionExtraArgsForCalling())
            jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, just_print_commands = OPTIONS.dry )
        return geli, jobid
    calls = map(call1, bams)
@ -44,6 +56,12 @@ def main():
    parser.add_option("", "--dry", dest="dry",
                        action='store_true', default=False,
                        help="If provided, nothing actually gets run, just a dry run")
+    parser.add_option("", "--ssg", dest="useSSG",
+                        action='store_true', default=False,
+                        help="If provided, we'll use the GATK SSG for genotyping")
+    parser.add_option("", "--ssg2b", dest="useSSG2b",
+                        action='store_true', default=False,
+                        help="If provided, we'll use 2bp enabled GATK SSG ")
    parser.add_option("-o", "--output", dest="output",
                        type="string", default='/dev/stdout',
                        help="x")
--- a/python/RecalQual.py
+++ b/python/RecalQual.py
@ -18,6 +18,7 @@ output_root = './'
 # Location of the resource files distributed with the recalibration tool.  
 # If editing, please end this variable with a trailing slash.
 resources='resources/'
+#resources='/broad/1KG/DCC_recal/ReadQualityRecalibrator/resources/'

 import getopt,glob,os,sys
 import LogisticRegressionByReadGroup
@ -39,12 +40,29 @@ dbsnp = resources + 'dbsnp.1kg.rod.out'

 # Where are the application files required to run the recalibration?
 gatk = resources + 'gatk/GenomeAnalysisTK.jar'
+#gatk = '/home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar'
+
 logistic_regression_script = resources + 'logistic_regression.R'
 empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'

 # Assemble the platform list into command-line arguments.
 platform_args = ' '.join(['-pl %s' % platform for platform in platforms])

+def output_file_needs_update(output_file, input_files = []):
+    if reprocessFiles:                          # we are always reprocessing files 
+        return True
+    if not os.path.exists(output_file):         # the file doesn't exist
+        return True
+    else: 
+        return False 
+
+def executeCommand(cmd):
+    if not dryRun:
+        return os.system(cmd)
+    else:
+        print '  => Would execute', cmd
+        return 0
+
 def exit(msg,errorcode):
    print msg
    sys.exit(errorcode)
@ -57,7 +75,7 @@ def graph_file(graph_script,graph_data):
    'Graph the given data using the given script.  Leave the data in the output directory.'
    check_input_file_available(graph_script,'%s R graphing script' % graph_script)
    check_input_file_available(graph_data,'%s graphing data' % graph_data)
-    result = os.system(' '.join((R_exe,graph_script,graph_data)))
+    result = executeCommand(' '.join((R_exe,graph_script,graph_data)))
    if result != 0:
        exit('Unable to graph data: %s' % graph_data,1)

@ -65,54 +83,70 @@ def recalibrate():
    'Recalibrate the given bam file'
    # generate the covariates
    print 'generating covariates'
-    generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
-    returncode = os.system(generate_covariates)
-    if returncode != 0:
-        exit('Unable to generate covariates',1)
-
-    # compute the logistic regression
-    print 'computing the logistic regression'
-    LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',output_dir + 'linear_regression_results.out',R_exe,logistic_regression_script)
-
-    # apply the logistic regression, writing the output data to calibrated_bam
-    print 'applying the correction to the reads'
-    apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
-    returncode = os.system(apply_logistic_regression)
-    if returncode != 0:
-        exit('Unable to apply logistic regression',1)
+    linear_regression_results = output_dir + 'linear_regression_results.out'
+    if output_file_needs_update(linear_regression_results):
+        generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
+        returncode = executeCommand(generate_covariates)
+        if returncode != 0:
+            exit('Unable to generate covariates',1)
+    
+        # compute the logistic regression
+        if not dryRun:
+            print 'computing the logistic regression'
+            LogisticRegressionByReadGroup.compute_logistic_regression(output_dir + 'initial.covariate_counts.csv',linear_regression_results,R_exe,logistic_regression_script)
+    else:
+        print 'Logistic recalibration data files already generated', linear_regression_results
+  
+    if output_file_needs_update(calibrated_bam):
+        # apply the logistic regression, writing the output data to calibrated_bam
+        print 'applying the correction to the reads'
+        apply_logistic_regression = ' '.join((gatk_base_cmdline,'-T LogisticRecalibration','-I',bam,'-logisticParams',output_dir+'linear_regression_results.out','-outputBAM',calibrated_bam))
+        returncode = executeCommand(apply_logistic_regression)
+        if returncode != 0:
+            exit('Unable to apply logistic regression',1)
+    else:
+        print 'Recalibrated BAM file already generated', calibrated_bam

    # index the calibrated bam
-    print 'indexing the calibrated bam'
-    index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
-    returncode = os.system(index_calibrated_bamfile)
-    if returncode != 0:
-        exit('Unable to index calibrated bamfile',1)
+    if output_file_needs_update(calibrated_bam + '.bai'):
+        print 'indexing the calibrated bam'
+        index_calibrated_bamfile = ' '.join((samtools_exe,'index',calibrated_bam))
+        returncode = executeCommand(index_calibrated_bamfile)
+        if returncode != 0:
+            exit('Unable to index calibrated bamfile',1)
+    else:
+        print 'Recalibrated BAM index file already generated', calibrated_bam + '.bai'

    print 'Recalibration complete!  Calibrated bam is available here: ' + calibrated_bam    

 def evaluate():
    'Evaluate recalibration results.'
    print 'Evaluating recalibration results'
-    # regenerate the covariates
-    regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
-    print 'regenerating covariates'
-    returncode = os.system(regenerate_covariates)
-    if returncode != 0:
-        exit('Unable to regenerate covariates',1)

-    print 'graphing initial results'
-    for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
-        graph_file(empirical_vs_reported_grapher,filename)
-    print 'graphing final results'
-    for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
-        graph_file(empirical_vs_reported_grapher,filename)
+    recalibrated_regression_results = output_dir + 'recalibrated' + 'linear_regression_results.out'
+    if output_file_needs_update(recalibrated_regression_results):
+       # regenerate the covariates
+       regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
+       print 'regenerating covariates'
+       returncode = executeCommand(regenerate_covariates)
+       if returncode != 0:
+           exit('Unable to regenerate covariates',1)
+   
+       print 'graphing initial results'
+       for filename in glob.glob(output_dir+'initial.*.empirical_v_reported_quality.csv'):
+           graph_file(empirical_vs_reported_grapher,filename)
+       print 'graphing final results'
+       for filename in glob.glob(output_dir+'recalibrated.*.empirical_v_reported_quality.csv'):
+           graph_file(empirical_vs_reported_grapher,filename)
+    else:
+        print 'Evaluated files already generated', recalibrated_regression_results

 def usage():
    exit('Usage: python RecalQual.py [--recalibrate] [--evaluate] <input bam file> <calibrated output bam file>',1)

 # Try to parse the given command-line arguments.
 try:
-    opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate'])
+    opts, args = getopt.gnu_getopt(sys.argv[1:],'',['recalibrate','evaluate', 'reprocess', 'dry'])
 except getopt.GetoptError, err:
    usage()

@ -123,12 +157,18 @@ if len(args) < 2:
 # Determine whether to evaluate / recalibrate.
 recalibrate_requested = False
 evaluate_requested = False
+reprocessFiles = False
+dryRun = False
 for opt,arg in opts:
    if opt == '--recalibrate':
        recalibrate_requested = True
    if opt == '--evaluate':
        evaluate_requested = True
-
+    if opt == '--reprocess':
+        reprocessFiles = True
+    if opt == '--dry':
+        dryRun = True
+        
 # Default to 'recalibrate' unless the user specified only evaluate.
 do_recalibration = not (evaluate_requested and not recalibrate_requested)
 # Only evaluate if the user specifically requested evaluation.