From 0d3ea0401c0afb82487a1c2018750350fad790ca Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 7 Feb 2012 13:22:46 -0500 Subject: [PATCH] BQSR Parameter cleanup * get rid of 320C argument that nobody uses. * get rid of DEFAULT_READ_GROUP parameter and functionality (later to become an engine argument). --- .../recalibration/CountCovariatesWalker.java | 278 +++++++++--------- .../recalibration/RecalDataManager.java | 26 -- .../RecalibrationArgumentCollection.java | 22 +- .../TableRecalibrationWalker.java | 247 ++++++++-------- 4 files changed, 280 insertions(+), 293 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 4e3d4048b..626460be6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -77,20 +77,20 @@ import java.util.Map; *

Output

*

* A recalibration table file in CSV format that is used by the TableRecalibration walker. - * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. + * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. * - * The first 20 lines of such a file is shown below. + * The first 20 lines of such a file is shown below. * * The file begins with a series of comment lines describing: * ** The number of counted loci * ** The number of counted bases * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases - * - * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. + * + * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. * * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change - * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of + * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. - * + * *

  * # Counted Sites    19451059
  * # Counted Bases    56582018
@@ -129,13 +129,14 @@ import java.util.Map;
  *   -cov DinucCovariate \
  *   -recalFile my_reads.recal_data.csv
  * 
- * */ @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file -@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality -@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta +@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) +// Filter out all reads with zero or unavailable mapping quality +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires both -I input.bam and -R reference.fasta @PartitionBy(PartitionType.LOCUS) public class CountCovariatesWalker extends LocusWalker implements TreeReducible { @@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); /** @@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker> covariateClasses = new PluginManager( Covariate.class ).getPlugins(); - final List> requiredClasses = new PluginManager( RequiredCovariate.class ).getPlugins(); - final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins(); + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); // Print and exit if that's what was requested - if ( LIST_ONLY ) { - logger.info( "Available covariates:" ); - for( Class covClass : covariateClasses ) { - logger.info( covClass.getSimpleName() ); + if (LIST_ONLY) { + logger.info("Available covariates:"); + for (Class covClass : covariateClasses) { + logger.info(covClass.getSimpleName()); } logger.info(""); - System.exit( 0 ); // Early exit here because user requested it + System.exit(0); // Early exit here because user requested it } // Warn the user if no dbSNP file or other variant mask was specified - if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) { + if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) { throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."); } // Initialize the requested covariates by parsing the -cov argument // First add the required covariates - if( requiredClasses.size() == 2) { // readGroup and reported quality score - requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here - requestedCovariates.add( new QualityScoreCovariate() ); - } else { + if (requiredClasses.size() == 2) { // readGroup and reported quality score + requestedCovariates.add(new ReadGroupCovariate()); // Order is important here + requestedCovariates.add(new QualityScoreCovariate()); + } + else { throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order."); } // Next add the standard covariates if -standard was specified by the user - if( USE_STANDARD_COVARIATES ) { + if (USE_STANDARD_COVARIATES) { // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order // A list of Classes can't be sorted, but a list of Class names can be final List standardClassNames = new ArrayList(); - for( Class covClass : standardClasses ) { - standardClassNames.add( covClass.getName() ); + for (Class covClass : standardClasses) { + standardClassNames.add(covClass.getName()); } Collections.sort(standardClassNames); // Sort the list of class names - for( String className : standardClassNames ) { - for( Class covClass : standardClasses ) { // Find the class that matches this class name - if( covClass.getName().equals( className ) ) { + for (String className : standardClassNames) { + for (Class covClass : standardClasses) { // Find the class that matches this class name + if (covClass.getName().equals(className)) { try { - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class foundClass = true; - if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { + if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { try { // Now that we've found a matching class, try to instantiate it - final Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + final Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker 0 ) { + if (gatkRead.getBaseQualities()[offset] > 0) { byte[] bases = gatkRead.getReadBases(); byte refBase = ref.getBase(); // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( bases[offset] ) ) { + if (BaseUtils.isRegularBase(bases[offset])) { // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || - !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || + !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) { // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( counter, gatkRead, offset, refBase ); + updateDataFromRead(counter, gatkRead, offset, refBase); - } else { // calculate SOLID reference insertion rate - if( refBase == bases[offset] ) { + } + else { // calculate SOLID reference insertion rate + if (refBase == bases[offset]) { counter.solidInsertedReferenceBases++; - } else { + } + else { counter.otherColorSpaceInconsistency++; } } @@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker= DBSNP_VALIDATION_CHECK_FREQUENCY ) { + if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) { counter.lociSinceLastDbsnpCheck = 0; - if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { - final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; - final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; + if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) { + final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases; + final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases; - if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { - Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + - String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); + if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) { + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel)); DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file } } @@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker keyList = new ArrayList(); - for( Object comp : data.keySet() ) { + for (Object comp : data.keySet()) { keyList.add((Comparable) comp); } Collections.sort(keyList); - for( Comparable comp : keyList ) { + for (Comparable comp : keyList) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val); } } } - private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { - for( Object comp : data.keySet() ) { + private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { + for (Object comp : data.keySet()) { key[curPos] = comp; final Object val = data.get(comp); - if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps + if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps // For each Covariate in the key - for( Object compToPrint : key ) { + for (Object compToPrint : key) { // Output the Covariate's value - recalTableStream.print( compToPrint + "," ); + recalTableStream.print(compToPrint + ","); } // Output the RecalDatum entry - recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); - } else { // Another layer in the nested hash map - printMappings( recalTableStream, curPos + 1, key, (Map) val ); + recalTableStream.println(((RecalDatumOptimized) val).outputToCSV()); + } + else { // Another layer in the nested hash map + printMappings(recalTableStream, curPos + 1, key, (Map) val); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 18b33c0e8..72c2b2829 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -256,32 +256,6 @@ public class RecalDataManager { public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); - // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments - if (readGroup == null) { - if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) { - if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) { - Utils.warnUser("The input .bam file contains reads with no read group. " + - "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName()); - warnUserNullReadGroup = true; - } - // There is no readGroup so defaulting to these values - readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP); - readGroup.setPlatform(RAC.DEFAULT_PLATFORM); - ((GATKSAMRecord) read).setReadGroup(readGroup); - } - else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName()); - } - } - - if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user - final String oldPlatform = readGroup.getPlatform(); - readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP); - readGroup.setPlatform(oldPlatform); - ((GATKSAMRecord) read).setReadGroup(readGroup); - } - if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { readGroup.setPlatform(RAC.FORCE_PLATFORM); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 7f3035f1e..9752b1dee 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection { // Shared Command Line Arguments ////////////////////////////////// @Hidden - @Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.") - public String DEFAULT_READ_GROUP = null; - @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; @Hidden - @Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") - public String FORCE_READ_GROUP = null; - @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; @Hidden @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) public int WINDOW_SIZE = 5; - /** - * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. - */ - @Hidden - @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) - public int HOMOPOLYMER_NBACK = 7; - @Hidden - @Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false) - public boolean EXCEPTION_IF_NO_TILE = false; - /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. @@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) public int CONTEXT_SIZE = 8; + /** + * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. + */ + @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false) + public int HOMOPOLYMER_NBACK = 7; + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index a8006d506..cd848cd9e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -86,12 +86,12 @@ import java.util.regex.Pattern; * -o my_reads.recal.bam \ * -recalFile my_reads.recal_data.csv * - * */ @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @WalkerName("TableRecalibration") -@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta +@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) +// This walker requires -I input.bam, it also requires -R reference.fasta public class TableRecalibrationWalker extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; @@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker> classes = new PluginManager(Covariate.class).getPlugins(); @@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { + for (Class covClass : classes) { + if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) { foundClass = true; try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); + Covariate covariate = (Covariate) covClass.newInstance(); + requestedCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); } @@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker oldRecords = header.getProgramRecords(); - List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) + List newRecords = new ArrayList(oldRecords.size() + 1); + for (SAMProgramRecord record : oldRecords) { + if (!record.getId().startsWith(PROGRAM_RECORD_NAME)) newRecords.add(record); } newRecords.add(programRecord); header.setProgramRecords(newRecords); // Write out the new header - OUTPUT_BAM.writeHeader( header ); + OUTPUT_BAM.writeHeader(header); } } /** * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) + * * @param line A line of CSV data read from the recalibration table data file */ private void addCSVData(final File file, final String line) { final String[] vals = line.split(","); // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical + if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + " --Perhaps the read group string contains a comma and isn't being parsed correctly."); } @@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker