From e77dfe9983d1bd64295a3e81dd97aec99b416002 Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 9 Jun 2009 16:06:57 +0000 Subject: [PATCH] Allow script to be easily modified to support different platforms. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@955 348d0f76-0448-11de-a6fe-93d51630548a --- doc/ReadQualityRecalibrator/README | 9 +++++++ .../gatk/walkers/CovariateCounterWalker.java | 25 +++++++++++++++---- python/RecalQual.py | 10 ++++++-- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/doc/ReadQualityRecalibrator/README b/doc/ReadQualityRecalibrator/README index 3a1620e46..a786df1a5 100644 --- a/doc/ReadQualityRecalibrator/README +++ b/doc/ReadQualityRecalibrator/README @@ -66,6 +66,15 @@ To (only) evaluate a given bam file after calibrating: python RecalQual.py --evaluate +Platforms +--------- +By default, the recalibrator processes only read groups +originating from Illumina sequencers. To enable calibration +for other platforms, edit the 'platforms' array at the +top of RecalQual.py. Platforms specified here should +case-insensitive match the "PL" attribute of the read +group in the BAM file. + Output ------ The recalibration process keeps many incremental diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java index 47180658d..93d4b2afa 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CovariateCounterWalker.java @@ -15,6 +15,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Random; import java.util.HashMap; +import java.util.Collections; import java.io.PrintStream; import java.io.FileNotFoundException; @@ -44,6 +45,9 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.") public int MAX_READ_GROUPS = 100; + @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = Illumina)") + public List platforms = Collections.singletonList("ILLUMINA"); + int NDINUCS = 16; ArrayList flattenData = new ArrayList(); HashMap data = new HashMap(); @@ -101,8 +105,8 @@ public class CovariateCounterWalker extends LocusWalker { for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { if( readGroup.getAttribute("PL") == null ) - Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are illumina",readGroup.getReadGroupId())); - if( !isIlluminaReadGroup(readGroup) ) + Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId())); + if( !isSupportedReadGroup(readGroup) ) continue; data.put(readGroup.getReadGroupId(), new RecalData[MAX_READ_LENGTH+1][MAX_QUAL_SCORE+1][NDINUCS]); for ( int i = 0; i < MAX_READ_LENGTH+1; i++) { @@ -126,7 +130,7 @@ public class CovariateCounterWalker extends LocusWalker { for (int i =0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG")); - if ( isIlluminaReadGroup(readGroup) && + if ( isSupportedReadGroup(readGroup) && !read.getReadNegativeStrandFlag() && (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) { @@ -394,7 +398,18 @@ public class CovariateCounterWalker extends LocusWalker { random_genrator = new Random(123454321); // keep same random seed while debugging } - private boolean isIlluminaReadGroup( SAMReadGroupRecord readGroup ) { - return (readGroup.getAttribute("PL") == null || "ILLUMINA".equalsIgnoreCase(readGroup.getAttribute("PL").toString())); + /** + * Check to see whether this read group should be processed. + * @param readGroup + * @return + */ + private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) { + for( String platform: platforms ) { + platform = platform.trim(); + if( readGroup.getAttribute("PL") == null || readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) ) + return true; + } + + return false; } } diff --git a/python/RecalQual.py b/python/RecalQual.py index 987c5d7b2..85cfc4be8 100755 --- a/python/RecalQual.py +++ b/python/RecalQual.py @@ -8,6 +8,9 @@ R_exe="/broad/tools/apps/R-2.6.0/bin/Rscript" # Any special site-specific arguments to pass the JVM. jvm_args='-ea -Xmx4096m' +# Which platforms should the calibration tool be run over? +platforms=['illumina'] + # Where to put the output created as part of recalibration. # If editing, please end this variable with a trailing slash. output_root = './' @@ -39,6 +42,9 @@ gatk = resources + 'gatk/GenomeAnalysisTK.jar' logistic_regression_script = resources + 'logistic_regression.R' empirical_vs_reported_grapher = resources + 'plot_q_emp_stated_hst.R' +# Assemble the platform list into command-line arguments. +platform_args = ' '.join(['-pl %s' % platform for platform in platforms]) + def exit(msg,errorcode): print msg sys.exit(errorcode) @@ -59,7 +65,7 @@ def recalibrate(): 'Recalibrate the given bam file' # generate the covariates print 'generating covariates' - generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) + generate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'initial','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) returncode = os.system(generate_covariates) if returncode != 0: exit('Unable to generate covariates',1) @@ -88,7 +94,7 @@ def evaluate(): 'Evaluate recalibration results.' print 'Evaluating recalibration results' # regenerate the covariates - regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1')) + regenerate_covariates = ' '.join((gatk_base_cmdline,'-T CountCovariates','-I',calibrated_bam,'-mqs 40','--OUTPUT_FILEROOT',output_dir+'recalibrated','--CREATE_TRAINING_DATA','--MIN_MAPPING_QUALITY 1',platform_args)) print 'regenerating covariates' returncode = os.system(regenerate_covariates) if returncode != 0: