From d1298dda133686afe25e98a1276d210fb146f526 Mon Sep 17 00:00:00 2001 From: rpoplin Date: Fri, 27 Nov 2009 19:54:10 +0000 Subject: [PATCH] Encapsulated the sections of code that were shared by the two Recalibration walkers. This includes both the shared command line arguments and the section of code in the map methods which pull out data from the SAMRecord and stuff it into the ReadHashDatum. Command line arguments are now passed to the Covariates using a new initialize method that all Covariates must implement. Updated the dbsnp sanity check warning message to be less cryptic. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2170 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/recalibration/Covariate.java | 5 +- .../recalibration/CovariateCounterWalker.java | 130 +++++------------- .../walkers/recalibration/CycleCovariate.java | 45 ++++-- .../gatk/walkers/recalibration/Dinuc.java | 25 ++++ .../walkers/recalibration/DinucCovariate.java | 8 +- .../MappingQualityCovariate.java | 7 +- .../recalibration/MinimumNQSCovariate.java | 15 +- .../recalibration/PositionCovariate.java | 7 +- .../recalibration/PrimerRoundCovariate.java | 7 +- .../recalibration/QualityScoreCovariate.java | 7 +- .../recalibration/ReadGroupCovariate.java | 9 +- .../walkers/recalibration/ReadHashDatum.java | 93 +++++++++++++ .../RecalibrationArgumentCollection.java | 68 +++++++++ .../TableRecalibrationWalker.java | 110 ++++++--------- .../walkers/recalibration/TileCovariate.java | 8 ++ 15 files changed, 320 insertions(+), 224 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 331a3cea6..717bc7c64 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -36,7 +36,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; */ public interface Covariate { - public Comparable getValue(ReadHashDatum readDatum, int offset); // Used to pick out the covariate's value from attributes of the read - public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers + public Comparable getValue( ReadHashDatum readDatum, int offset ); // Used to pick out the covariate's value from attributes of the read + public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker public int estimatedNumberOfBins(); // Used to estimate the amount space required for the full data HashMap } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 6aa325a25..e6c19004b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -9,6 +9,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.Variation; @@ -69,6 +70,11 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil; @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { + ///////////////////////////// + // Shared Arguments + ///////////////////////////// + @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + ///////////////////////////// // Command Line Arguments ///////////////////////////// @@ -76,31 +82,14 @@ public class CovariateCounterWalker extends LocusWalker { private Boolean LIST_ONLY = false; @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) private String[] COVARIATES = null; - @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - private boolean USE_ORIGINAL_QUALS = false; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) - private int WINDOW_SIZE = 5; - @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") - private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.") private int PROCESS_EVERY_NTH_LOCUS = 1; - @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") - private String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup; - @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") - private String DEFAULT_PLATFORM = "Illumina"; - @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") - private String FORCE_READ_GROUP = null; - @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") - private String FORCE_PLATFORM = null; ///////////////////////////// // Debugging-only Arguments ///////////////////////////// @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. FOR DEBUGGING PURPOSES ONLY.") private boolean NO_PRINT_HEADER = false; - @Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.") - private boolean VALIDATE_OLD_RECALIBRATOR = false; - // BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't. @Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") private boolean SORTED_OUTPUT = false; @@ -115,13 +104,12 @@ public class CovariateCounterWalker extends LocusWalker { private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private final String versionString = "v2.0.6"; // Major version, minor version, and build number + private static final String versionString = "v2.0.7"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen) private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation - private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet? //--------------------------------------------------------------------------------------------------------------- // @@ -136,8 +124,8 @@ public class CovariateCounterWalker extends LocusWalker { public void initialize() { logger.info( "CovariateCounterWalker version: " + versionString ); - if( FORCE_READ_GROUP != null ) { DEFAULT_READ_GROUP = FORCE_READ_GROUP; } - if( FORCE_PLATFORM != null ) { DEFAULT_PLATFORM = FORCE_PLATFORM; } + if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } + if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } // Get a list of all available covariates final List> classes = PackageUtils.getClassesImplementingInterface( Covariate.class ); @@ -169,11 +157,12 @@ public class CovariateCounterWalker extends LocusWalker { // BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed. requestedCovariates = new ArrayList(); int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one - if( VALIDATE_OLD_RECALIBRATOR ) { + if( RAC.VALIDATE_OLD_RECALIBRATOR ) { requestedCovariates.add( new ReadGroupCovariate() ); requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new DinucCovariate() ); + estimatedCapacity = 60 * 100 * 40 * 16; } else if( COVARIATES != null ) { if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found @@ -181,10 +170,8 @@ public class CovariateCounterWalker extends LocusWalker { for( Class covClass : classes ) { try { Covariate covariate = (Covariate)covClass.newInstance(); + estimatedCapacity *= covariate.estimatedNumberOfBins(); - // Some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof CycleCovariate && DEFAULT_PLATFORM != null ) { covariate = new CycleCovariate( DEFAULT_PLATFORM ); } - if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again requestedCovariates.add( covariate ); } @@ -211,9 +198,6 @@ public class CovariateCounterWalker extends LocusWalker { // Now that we've found a matching class, try to instantiate it Covariate covariate = (Covariate)covClass.newInstance(); estimatedCapacity *= covariate.estimatedNumberOfBins(); - // Some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof CycleCovariate && DEFAULT_PLATFORM != null ) { covariate = new CycleCovariate( DEFAULT_PLATFORM ); } - if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } requestedCovariates.add( covariate ); } catch ( InstantiationException e ) { throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); @@ -232,14 +216,19 @@ public class CovariateCounterWalker extends LocusWalker { Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." ); requestedCovariates.add( new ReadGroupCovariate() ); requestedCovariates.add( new QualityScoreCovariate() ); - estimatedCapacity = 300 * 40; + estimatedCapacity = 60 * 40; } logger.info( "The covariates being used here: " ); - logger.info( requestedCovariates ); + for( Covariate cov : requestedCovariates ) { + logger.info( "\t" + cov.getClass().getSimpleName() ); + cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection + } - if(estimatedCapacity > 300 * 40 * 200 * 16 || estimatedCapacity < 0) // could be negative if overflowed - { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + // Don't want to crash with out of heap space exception + if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed + estimatedCapacity = 300 * 40 * 200; + } dataManager = new RecalDataManager( estimatedCapacity ); readDatumHashMap = new IdentityHashMap(); } @@ -283,17 +272,10 @@ public class CovariateCounterWalker extends LocusWalker { final List offsets = context.getOffsets(); SAMRecord read; int offset; - String readGroupId; - byte[] quals; - byte[] bases; byte refBase; byte prevBase; - String platform; byte[] colorSpaceQuals; ReadHashDatum readDatum; - boolean isNegStrand; - int mappingQuality; - int length; final int numReads = reads.size(); // For each read at this locus @@ -312,45 +294,7 @@ public class CovariateCounterWalker extends LocusWalker { } // This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again - // Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about - // These lines should be exactly the same in TableRecalibrationWalker - quals = read.getBaseQualities(); - // Check if we need to use the original quality scores instead - if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. - isNegStrand = read.getReadNegativeStrandFlag(); - final SAMReadGroupRecord readGroup = read.getReadGroup(); - if( readGroup == null ) { - if( !warnUserNullReadGroup && FORCE_READ_GROUP == null ) { - Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + DEFAULT_READ_GROUP + " and platform = " + DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName() ); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroupId = DEFAULT_READ_GROUP; - platform = DEFAULT_PLATFORM; - } else { - readGroupId = readGroup.getReadGroupId(); - platform = readGroup.getPlatform(); - } - mappingQuality = read.getMappingQuality(); - length = bases.length; - if( FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user - readGroupId = FORCE_READ_GROUP; - } - if( FORCE_PLATFORM != null ) { - platform = FORCE_PLATFORM; - } - Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName()); - readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile ); + readDatum = ReadHashDatum.parseSAMRecord( read, RAC ); readDatumHashMap.put( read, readDatum ); sizeOfReadDatumHashMap++; } @@ -384,7 +328,7 @@ public class CovariateCounterWalker extends LocusWalker { updateDataFromRead( readDatum, offset, refBase ); } } else { - if( VALIDATE_OLD_RECALIBRATOR ) { + if( RAC.VALIDATE_OLD_RECALIBRATOR ) { countedBases++; // Replicating a small bug in the old recalibrator } } @@ -406,7 +350,7 @@ public class CovariateCounterWalker extends LocusWalker { } // Do a dbSNP sanity check every so often - if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) { + if( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) { lociSinceLastDbsnpCheck = 0; validateDbsnpMismatchRate(); } @@ -424,13 +368,13 @@ public class CovariateCounterWalker extends LocusWalker { private static void updateMismatchCounts(Pair counts, AlignmentContext context, char ref) { List reads = context.getReads(); List offsets = context.getOffsets(); - for (int iii = 0; iii < reads.size(); iii++ ) { + for(int iii = 0; iii < reads.size(); iii++ ) { char readChar = (char)(reads.get(iii).getReadBases()[offsets.get(iii)]); int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar); int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref); - if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) { - if ( readCharBaseIndex != refCharBaseIndex ) { + if( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) { + if( readCharBaseIndex != refCharBaseIndex ) { counts.first++; } counts.second++; @@ -442,17 +386,17 @@ public class CovariateCounterWalker extends LocusWalker { * Validate the dbSNP mismatch rates. */ private void validateDbsnpMismatchRate() { - if ( novel_counts.second == 0 || dbSNP_counts.second == 0 ) { + if( novel_counts.second == 0 || dbSNP_counts.second == 0 ) { return; } double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second; double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second; - //System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel)); - - if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { - Utils.warnUser("The variation rate at supplied known variant sites seems suspiciously low. Please double-check that the correct ROD is being used."); - DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't output the message every megabase of a large file + + if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); + DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file } } @@ -480,7 +424,7 @@ public class CovariateCounterWalker extends LocusWalker { RecalDatum datum = dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - if( VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) { + if( RAC.VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) { dataManager.data.sortedPut( key, datum ); } else { dataManager.data.put( key, datum ); @@ -508,7 +452,7 @@ public class CovariateCounterWalker extends LocusWalker { */ public PrintStream reduceInit() { try { - return new PrintStream( RECAL_FILE ); + return new PrintStream( RAC.RECAL_FILE ); } catch ( FileNotFoundException e ) { throw new RuntimeException( "Couldn't open output file: ", e ); } @@ -543,7 +487,7 @@ public class CovariateCounterWalker extends LocusWalker { private void outputToCSV( final PrintStream recalTableStream ) { //BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options. - if( VALIDATE_OLD_RECALIBRATOR ) { + if( RAC.VALIDATE_OLD_RECALIBRATOR ) { // Output the old header as well as output the data in sorted order recalTableStream.printf("# collapsed_pos false%n"); recalTableStream.printf("# collapsed_dinuc false%n"); @@ -574,7 +518,7 @@ public class CovariateCounterWalker extends LocusWalker { } } - if( SORTED_OUTPUT ) + if( SORTED_OUTPUT && requestedCovariates.size() == 4 ) { for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here for( Comparable comp : entry.first ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 3819ce354..d359bf41d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -46,28 +46,35 @@ public class CycleCovariate implements Covariate { private static boolean warnedUserBadPlatform = false; private static String defaultPlatform; - public CycleCovariate() { // Empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - defaultPlatform = null; - } - - public CycleCovariate( final String _defaultPlatform ) { - if( _defaultPlatform.equalsIgnoreCase( "ILLUMINA" ) || _defaultPlatform.contains( "454" ) || _defaultPlatform.equalsIgnoreCase( "SOLID" ) ) { - defaultPlatform = _defaultPlatform; + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) || RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) ) { + defaultPlatform = RAC.DEFAULT_PLATFORM; } else { - throw new StingException( "The requested default platform (" + _defaultPlatform +") is not a recognized platform. Implemented options are illumina, 454, and solid"); + throw new StingException( "The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid"); } } // Used to pick out the covariate's value from attributes of the read public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) { + //----------------------------- + // ILLUMINA + //----------------------------- + if( readDatum.platform.equalsIgnoreCase( "ILLUMINA" ) ) { int cycle = offset; if( readDatum.isNegStrand ) { cycle = readDatum.bases.length - (offset + 1); } return cycle; - } else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454" + } + + //----------------------------- + // 454 + //----------------------------- + + else if( readDatum.platform.contains( "454" ) ) { // Some bams have "LS454" and others have just "454" int cycle = 0; //BUGBUG: should reverse directions on negative strand reads! byte prevBase = readDatum.bases[0]; @@ -78,11 +85,23 @@ public class CycleCovariate implements Covariate { } } return cycle; - } else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) { + } + + //----------------------------- + // SOLID + //----------------------------- + + else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) { // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf //BUGBUG: should reverse directions on negative strand reads! return offset / 5; // integer division - } else { // Platform is unrecognized so revert to the default platform but warn the user first + } + + //----------------------------- + // UNRECOGNIZED PLATFORM + //----------------------------- + + else { // Platform is unrecognized so revert to the default platform but warn the user first if( !warnedUserBadPlatform ) { if( defaultPlatform != null) { // the user set a default platform Utils.warnUser( "Platform string (" + readDatum.platform + ") unrecognized in CycleCovariate. " + @@ -110,8 +129,4 @@ public class CycleCovariate implements Covariate { public final int estimatedNumberOfBins() { return 100; } - - public String toString() { - return "Machine Cycle"; - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java index ace61c6ff..f2428e808 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java @@ -1,5 +1,30 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index 16407ac57..636f5eb43 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -43,8 +43,8 @@ public class DinucCovariate implements Covariate { HashMap dinucHashMap; - public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; dinucHashMap = new HashMap(); for(byte byte1 : BASES) { @@ -83,8 +83,4 @@ public class DinucCovariate implements Covariate { public final int estimatedNumberOfBins() { return 16; } - - public String toString() { - return "Dinucleotide"; - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index 9552b7842..ae6fc552f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -35,7 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class MappingQualityCovariate implements Covariate { - public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { } // Used to pick out the covariate's value from attributes of the read @@ -53,8 +54,4 @@ public class MappingQualityCovariate implements Covariate { public final int estimatedNumberOfBins() { return 100; } - - public String toString() { - return "Mapping Quality Score"; - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 505f6be8d..d0f9699f1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -36,14 +36,11 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class MinimumNQSCovariate implements Covariate { - private int windowReach; // how far in each direction from the current base to look + private int windowReach; // How far in each direction from the current base to look - public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - windowReach = 2; // window size = 5 was the best covariate according to Chris's analysis - } - - public MinimumNQSCovariate(final int windowSize) { - windowReach = windowSize / 2; // integer division + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + windowReach = RAC.WINDOW_SIZE / 2; // integer division } // Used to pick out the covariate's value from attributes of the read @@ -70,8 +67,4 @@ public class MinimumNQSCovariate implements Covariate { public final int estimatedNumberOfBins() { return 40; } - - public String toString() { - return "Minimum Neighborhood Quality Score (window size = " + (windowReach * 2 + 1) + ")"; - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 0742f2a8c..6f4d6f787 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -36,7 +36,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class PositionCovariate implements Covariate { - public PositionCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { } // Used to pick out the covariate's value from attributes of the read @@ -57,8 +58,4 @@ public class PositionCovariate implements Covariate { public final int estimatedNumberOfBins() { return 100; } - - public String toString() { - return "Position in Read"; - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 0a668c810..5445deb18 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -37,7 +37,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class PrimerRoundCovariate implements Covariate { - public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { } // Used to pick out the covariate's value from attributes of the read @@ -59,8 +60,4 @@ public class PrimerRoundCovariate implements Covariate { public final int estimatedNumberOfBins() { return 5; } - - public String toString() { - return "Primer Round"; - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index c9123e4ab..ab083fefc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -35,7 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; public class QualityScoreCovariate implements Covariate { - public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { } // Used to pick out the covariate's value from attributes of the read @@ -53,8 +54,4 @@ public class QualityScoreCovariate implements Covariate { public final int estimatedNumberOfBins() { return 40; } - - public String toString() { - return "Reported Quality Score"; - } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 7da89ff81..400571864 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -37,7 +37,8 @@ public class ReadGroupCovariate implements Covariate{ public static final String defaultReadGroup = "DefaultReadGroup"; - public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { } // Used to pick out the covariate's value from attributes of the read @@ -52,11 +53,7 @@ public class ReadGroupCovariate implements Covariate{ // Used to estimate the amount space required for the full data HashMap public final int estimatedNumberOfBins() { - return 300; - } - - public String toString() { - return "Read Group"; + return 60; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java index 79af1ae7e..3af1339c1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadHashDatum.java @@ -1,10 +1,42 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import edu.mit.broad.picard.illumina.parser.IlluminaUtil; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 14, 2009 */ + public class ReadHashDatum { public String readGroup; public String platform; @@ -14,6 +46,7 @@ public class ReadHashDatum { public int mappingQuality; public int length; public Integer tile; + private static boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet? public ReadHashDatum(String _readGroup, String _platform, byte[] _quals, byte[] _bases, boolean _isNegStrand, int _mappingQuality, int _length, Integer _tile) { @@ -37,4 +70,64 @@ public class ReadHashDatum { this.length = that.length; this.tile = that.tile; } + + /** + * Pulls the important bits out of a SAMRecord and stuffs them into a leaner Object to be used by the Covariates and cached by the Recalibration walkers + * @param read The SAMRecord whose data we want to get at + * @param RAC The collection of arguments that are shared between CovariateCounterWalker and TableRecalibrationWalker + * @return A new ReadHashDatum holding all the important parts of the read + */ + public static ReadHashDatum parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { + + // Lots of lines just pulling things out of the SAMRecord and stuffing into local variables, many edge cases to worry about + byte[] quals = read.getBaseQualities(); + + // Check if we need to use the original quality scores instead + if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + quals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + + final byte[] bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. + final boolean isNegStrand = read.getReadNegativeStrandFlag(); + final SAMReadGroupRecord readGroup = read.getReadGroup(); + String readGroupId; + String platform; + + // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments + if( readGroup == null ) { + if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) { + Utils.warnUser("The input .bam file contains reads with no read group. " + + "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName() ); + warnUserNullReadGroup = true; + } + // There is no readGroup so defaulting to these values + readGroupId = RAC.DEFAULT_READ_GROUP; + platform = RAC.DEFAULT_PLATFORM; + } else { + readGroupId = readGroup.getReadGroupId(); + platform = readGroup.getPlatform(); + } + + final int mappingQuality = read.getMappingQuality(); + final int length = bases.length; + + if( RAC.FORCE_READ_GROUP != null ) { // Collapse all the read groups into a single common String provided by the user + readGroupId = RAC.FORCE_READ_GROUP; + } + + if( RAC.FORCE_PLATFORM != null ) { + platform = RAC.FORCE_PLATFORM; + } + + Integer tile = IlluminaUtil.getTileFromReadName(read.getReadName()); + + return new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length, tile ); + + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java new file mode 100755 index 000000000..1a80f458e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -0,0 +1,68 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import org.broadinstitute.sting.utils.cmdLine.Argument; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 27, 2009 + * + * A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker. + * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. + */ + +public class RecalibrationArgumentCollection { + + ////////////////////////////////// + // Shared Command Line Arguments + ////////////////////////////////// + @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") + public String RECAL_FILE = "output.recal_data.csv"; + @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) + public boolean USE_ORIGINAL_QUALS = false; + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) + public int WINDOW_SIZE = 5; + @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") + public String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup; + @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + public String DEFAULT_PLATFORM = "Illumina"; + @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") + public String FORCE_READ_GROUP = null; + @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + public String FORCE_PLATFORM = null; + + ////////////////////////////////// + // Debugging-only Arguments + ////////////////////////////////// + @Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.") + public boolean VALIDATE_OLD_RECALIBRATOR = false; + // BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't. + +} + + diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 6c8b39068..c0ab680f0 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; import org.broadinstitute.sting.utils.*; import java.util.ArrayList; @@ -62,36 +63,25 @@ import edu.mit.broad.picard.illumina.parser.IlluminaUtil; // GATK to not actually spend time giving me the refBase array since I don't use it public class TableRecalibrationWalker extends ReadWalker { + ///////////////////////////// + // Shared Arguments + ///////////////////////////// + @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + ///////////////////////////// // Command Line Arguments ///////////////////////////// - @Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true) - private String RECAL_FILE = null; @Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=true) private String OUTPUT_BAM_FILE = null; @Argument(fullName="preserve_qscores_less_than", shortName="pQ", - doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) + doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) private int PRESERVE_QSCORES_LESS_THAN = 5; - @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - private boolean USE_ORIGINAL_QUALS = false; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) - private int WINDOW_SIZE = 5; @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") private int SMOOTHING = 1; - @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") - private String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup; - @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") - private String DEFAULT_PLATFORM = "Illumina"; - @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String.") - private String FORCE_READ_GROUP = null; - @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") - private String FORCE_PLATFORM = null; ///////////////////////////// // Debugging-only Arguments ///////////////////////////// - @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.") - private boolean VALIDATE_OLD_RECALIBRATOR = false; @Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") private boolean NO_PG_TAG = false; @@ -102,11 +92,10 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; // List of covariates to be used in this calculation private ArrayList fullCovariateKey; // The list that will be used over and over again to hold the full set of covariate values private ArrayList collapsedTableKey; // The key that will be used over and over again to query the collapsed tables - private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); - private final String versionString = "v2.0.4"; // Major version, minor version, and build number - private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet? + private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + private static final Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); + private static final String versionString = "v2.0.5"; // Major version, minor version, and build number private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam //--------------------------------------------------------------------------------------------------------------- @@ -123,8 +112,8 @@ public class TableRecalibrationWalker extends ReadWalker> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); @@ -154,14 +143,14 @@ public class TableRecalibrationWalker extends ReadWalker covClass : classes ) { @@ -170,9 +159,6 @@ public class TableRecalibrationWalker extends ReadWalker 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + // Don't want to crash with out of heap space exception + if(estimatedCapacity > 300 * 40 * 200 || estimatedCapacity < 0) { // Could be negative if overflowed + estimatedCapacity = 300 * 40 * 200; + } final boolean createCollapsedTables = true; + + // Initialize any covariate member variables using the shared argument collection + for( Covariate cov : requestedCovariates ) { + cov.initialize( RAC ); + } // Initialize the data hashMaps dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() ); @@ -210,14 +204,16 @@ public class TableRecalibrationWalker extends ReadWalker