Allow script to be easily modified to support different platforms.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@955 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-09 16:06:57 +00:00 · 2009-06-09 16:06:57 +00:00 · e77dfe9983
parent 7fa84ea157
commit e77dfe9983
3 changed files with 37 additions and 7 deletions
--- a/doc/ReadQualityRecalibrator/README
+++ b/doc/ReadQualityRecalibrator/README
@ -66,6 +66,15 @@ To (only) evaluate a given bam file after calibrating:

 python RecalQual.py --evaluate <source bam> <recalibrated bam>

+Platforms
+---------
+By default, the recalibrator processes only read groups
+originating from Illumina sequencers.  To enable calibration
+for other platforms, edit the 'platforms' array at the
+top of RecalQual.py.  Platforms specified here should
+case-insensitive match the "PL" attribute of the read
+group in the BAM file.
+
 Output
 ------
 The recalibration process keeps many incremental
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java
@ -15,6 +15,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
 import java.util.HashMap;
+import java.util.Collections;
 import java.io.PrintStream;
 import java.io.FileNotFoundException;

@ -44,6 +45,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
    @Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
    public int MAX_READ_GROUPS = 100;

+    @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = Illumina)")
+    public List<String> platforms = Collections.singletonList("ILLUMINA");
+
    int NDINUCS = 16;
    ArrayList<RecalData> flattenData = new ArrayList<RecalData>();
    HashMap<String, RecalData[][][]> data = new HashMap<String, RecalData[][][]>();
@ -101,8 +105,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {

        for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
            if( readGroup.getAttribute("PL") == null )
-                Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId()));
-            if( !isIlluminaReadGroup(readGroup) )
+                Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
+            if( !isSupportedReadGroup(readGroup) )
                continue;
            data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]);
            for ( int i = 0; i < MAX_READ_LENGTH+1; i++) {
@ -126,7 +130,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
            for (int i =0; i < reads.size(); i++ ) {
                SAMRecord read = reads.get(i);
                SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
-                if ( isIlluminaReadGroup(readGroup) &&
+                if ( isSupportedReadGroup(readGroup) &&
                    !read.getReadNegativeStrandFlag() &&
                    (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
                    (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
@ -394,7 +398,18 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
        random_genrator = new Random(123454321); // keep same random seed while debugging
    }

-    private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) {
-        return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString()));
+    /**
+     * Check to see whether this read group should be processed.
+     * @param readGroup
+     * @return
+     */
+    private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
+        for( String platform: platforms ) {
+            platform = platform.trim();
+            if( readGroup.getAttribute("PL") == null || readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
+                return true;
+        }
+
+        return false;
    }
 }
--- a/python/RecalQual.py
+++ b/python/RecalQual.py
@ -8,6 +8,9 @@ R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript"
 # Any special site-specific arguments to pass the JVM.
 jvm_args='-ea -Xmx4096m'

+# Which platforms should the calibration tool be run over?
+platforms=['illumina']
+
 # Where to put the output created as part of recalibration.  
 # If editing, please end this variable with a trailing slash.
 output_root = './'
@ -39,6 +42,9 @@ gatk = resources + 'gatk/GenomeAnalysisTK.jar'
 logistic_regression_script = resources + 'logistic_regression.R'
 empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R'

+# Assemble the platform list into command-line arguments.
+platform_args = ' '.join(['-pl %s' % platform for platform in platforms])
+
 def exit(msg,errorcode):
    print msg
    sys.exit(errorcode)
@ -59,7 +65,7 @@ def recalibrate():
    'Recalibrate the given bam file'
    # generate the covariates
    print 'generating covariates'
-    generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
+    generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
    returncode = os.system(generate_covariates)
    if returncode != 0:
        exit('Unable to generate covariates',1)
@ -88,7 +94,7 @@ def evaluate():
    'Evaluate recalibration results.'
    print 'Evaluating recalibration results'
    # regenerate the covariates
-    regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1'))
+    regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args))
    print 'regenerating covariates'
    returncode = os.system(regenerate_covariates)
    if returncode != 0: