BQSR Parameter cleanup

* get rid of 320C argument that nobody uses.
* get rid of DEFAULT_READ_GROUP parameter and functionality (later to become an engine argument).
This commit is contained in:
Mauricio Carneiro 2012-02-07 13:22:46 -05:00
parent 717cd4b912
commit 0d3ea0401c
4 changed files with 280 additions and 293 deletions

View File

@ -77,20 +77,20 @@ import java.util.Map;
* <h2>Output</h2> * <h2>Output</h2>
* <p> * <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker. * A recalibration table file in CSV format that is used by the TableRecalibration walker.
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score. * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
* *
* The first 20 lines of such a file are shown below. * The first 20 lines of such a file are shown below.
* * The file begins with a series of comment lines describing: * * The file begins with a series of comment lines describing:
* ** The number of counted loci * ** The number of counted loci
* ** The number of counted bases * ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases * ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
* *
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records. * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
* *
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change * * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate. * reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
* *
* <pre> * <pre>
* # Counted Sites 19451059 * # Counted Sites 19451059
* # Counted Bases 56582018 * # Counted Bases 56582018
@ -129,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \ * -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv * -recalFile my_reads.recal_data.csv
* </pre> * </pre>
*
*/ */
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file @By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta // Filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS) @PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> { public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
///////////////////////////// /////////////////////////////
// Shared Arguments // Shared Arguments
///////////////////////////// /////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); @ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
@ -160,7 +162,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument. * Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
*/ */
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false) @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList(); public List<RodBinding<Feature>> knownSites = Collections.emptyList();
/** /**
@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate. * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/ */
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file") @Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class) @Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE; public PrintStream RECAL_FILE;
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
private boolean LIST_ONLY = false; private boolean LIST_ONLY = false;
/** /**
* See the -list argument to view available covariates. * See the -list argument to view available covariates.
*/ */
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false) @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
private String[] COVARIATES = null; private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false) @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
private boolean USE_STANDARD_COVARIATES = false; private boolean USE_STANDARD_COVARIATES = false;
///////////////////////////// /////////////////////////////
// Debugging-only Arguments // Debugging-only Arguments
///////////////////////////// /////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.") @Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false; private boolean DONT_SORT_OUTPUT = false;
/** /**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/ */
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false; private boolean RUN_WITHOUT_DBSNP = false;
///////////////////////////// /////////////////////////////
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* Adds the values of other to this, returning this * Adds the values of other to this, returning this
*
* @param other * @param other
* @return this object * @return this object
*/ */
@ -247,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/ */
public void initialize() { public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } if (RAC.FORCE_PLATFORM != null) {
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates // Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins(); final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins(); final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins(); final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
// Print and exit if that's what was requested // Print and exit if that's what was requested
if ( LIST_ONLY ) { if (LIST_ONLY) {
logger.info( "Available covariates:" ); logger.info("Available covariates:");
for( Class<?> covClass : covariateClasses ) { for (Class<?> covClass : covariateClasses) {
logger.info( covClass.getSimpleName() ); logger.info(covClass.getSimpleName());
} }
logger.info(""); logger.info("");
System.exit( 0 ); // Early exit here because user requested it System.exit(0); // Early exit here because user requested it
} }
// Warn the user if no dbSNP file or other variant mask was specified // Warn the user if no dbSNP file or other variant mask was specified
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) { if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."); throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
} }
// Initialize the requested covariates by parsing the -cov argument // Initialize the requested covariates by parsing the -cov argument
// First add the required covariates // First add the required covariates
if( requiredClasses.size() == 2) { // readGroup and reported quality score if (requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add(new QualityScoreCovariate());
} else { }
else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order."); throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
} }
// Next add the standard covariates if -standard was specified by the user // Next add the standard covariates if -standard was specified by the user
if( USE_STANDARD_COVARIATES ) { if (USE_STANDARD_COVARIATES) {
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order // We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
// A list of Classes can't be sorted, but a list of Class names can be // A list of Classes can't be sorted, but a list of Class names can be
final List<String> standardClassNames = new ArrayList<String>(); final List<String> standardClassNames = new ArrayList<String>();
for( Class<?> covClass : standardClasses ) { for (Class<?> covClass : standardClasses) {
standardClassNames.add( covClass.getName() ); standardClassNames.add(covClass.getName());
} }
Collections.sort(standardClassNames); // Sort the list of class names Collections.sort(standardClassNames); // Sort the list of class names
for( String className : standardClassNames ) { for (String className : standardClassNames) {
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
if( covClass.getName().equals( className ) ) { if (covClass.getName().equals(className)) {
try { try {
final Covariate covariate = (Covariate)covClass.newInstance(); final Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add( covariate ); requestedCovariates.add(covariate);
} catch (Exception e) { } catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e); throw new DynamicClassResolutionException(covClass, e);
} }
@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
} }
// Finally parse the -cov arguments that were provided, skipping over the ones already specified // Finally parse the -cov arguments that were provided, skipping over the ones already specified
if( COVARIATES != null ) { if (COVARIATES != null) {
for( String requestedCovariateString : COVARIATES ) { for (String requestedCovariateString : COVARIATES) {
boolean foundClass = false; boolean foundClass = false;
for( Class<?> covClass : covariateClasses ) { for (Class<?> covClass : covariateClasses) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
foundClass = true; foundClass = true;
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) { if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
try { try {
// Now that we've found a matching class, try to instantiate it // Now that we've found a matching class, try to instantiate it
final Covariate covariate = (Covariate)covClass.newInstance(); final Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add( covariate ); requestedCovariates.add(covariate);
} catch (Exception e) { } catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e); throw new DynamicClassResolutionException(covClass, e);
} }
@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
} }
if( !foundClass ) { if (!foundClass) {
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
} }
} }
} }
logger.info( "The covariates being used here: " ); logger.info("The covariates being used here: ");
for( Covariate cov : requestedCovariates ) { for (Covariate cov : requestedCovariates) {
logger.info( "\t" + cov.getClass().getSimpleName() ); logger.info("\t" + cov.getClass().getSimpleName());
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
} }
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// map // map
@ -342,62 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* For each read at this locus get the various covariate values and increment that location in the map based on * For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location * whether or not the base matches the reference at this particular location
*
* @param tracker The reference metadata tracker * @param tracker The reference metadata tracker
* @param ref The reference context * @param ref The reference context
* @param context The alignment context * @param context The alignment context
* @return Returns 1, but this value isn't used in the reduce step * @return Returns 1, but this value isn't used in the reduce step
*/ */
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
// Only use data from non-dbsnp sites // Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicative of poor quality // Assume every mismatch at a non-dbsnp site is indicative of poor quality
CountedData counter = new CountedData(); CountedData counter = new CountedData();
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
// For each read at this locus // For each read at this locus
for( final PileupElement p : context.getBasePileup() ) { for (final PileupElement p : context.getBasePileup()) {
final GATKSAMRecord gatkRead = p.getRead(); final GATKSAMRecord gatkRead = p.getRead();
int offset = p.getOffset(); int offset = p.getOffset();
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
continue; continue;
} }
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
{ gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); RecalDataManager.parseSAMRecord(gatkRead, RAC);
RecalDataManager.parseSAMRecord( gatkRead, RAC );
// Skip over reads with no calls in the color space if the user requested it // Skip over reads with no calls in the color space if the user requested it
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
continue; continue;
} }
RecalDataManager.parseColorSpace( gatkRead ); RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
} }
// Skip this position if base quality is zero // Skip this position if base quality is zero
if( gatkRead.getBaseQualities()[offset] > 0 ) { if (gatkRead.getBaseQualities()[offset] > 0) {
byte[] bases = gatkRead.getReadBases(); byte[] bases = gatkRead.getReadBases();
byte refBase = ref.getBase(); byte refBase = ref.getBase();
// Skip if this base is an 'N' or etc. // Skip if this base is an 'N' or etc.
if( BaseUtils.isRegularBase( bases[offset] ) ) { if (BaseUtils.isRegularBase(bases[offset])) {
// SOLID bams have inserted the reference base into the read if the color space is inconsistent with the read base so skip it // SOLID bams have inserted the reference base into the read if the color space is inconsistent with the read base so skip it
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap // This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( counter, gatkRead, offset, refBase ); updateDataFromRead(counter, gatkRead, offset, refBase);
} else { // calculate SOLID reference insertion rate }
if( refBase == bases[offset] ) { else { // calculate SOLID reference insertion rate
if (refBase == bases[offset]) {
counter.solidInsertedReferenceBases++; counter.solidInsertedReferenceBases++;
} else { }
else {
counter.otherColorSpaceInconsistency++; counter.otherColorSpaceInconsistency++;
} }
} }
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
} }
} }
counter.countedSites++; counter.countedSites++;
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus }
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
counter.skippedSites++; counter.skippedSites++;
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
} }
@ -413,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
return counter; return counter;
} }
/** /**
* Update the mismatch / total_base counts for a given class of loci. * Update the mismatch / total_base counts for a given class of loci.
* *
* @param counter The CountedData to be updated * @param counter The CountedData to be updated
@ -421,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* @param refBase The reference base * @param refBase The reference base
*/ */
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) { private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
for( PileupElement p : context.getBasePileup() ) { for (PileupElement p : context.getBasePileup()) {
final byte readBase = p.getBase(); final byte readBase = p.getBase();
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase); final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase); final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
if( readBaseIndex != -1 && refBaseIndex != -1 ) { if (readBaseIndex != -1 && refBaseIndex != -1) {
if( readBaseIndex != refBaseIndex ) { if (readBaseIndex != refBaseIndex) {
counter.novelCountsMM++; counter.novelCountsMM++;
} }
counter.novelCountsBases++; counter.novelCountsBases++;
@ -439,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* Major workhorse routine for this walker. * Major workhorse routine for this walker.
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment, * Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches * adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation. * because pulling things out of the SAMRecord is an expensive operation.
* @param counter Data structure which holds the counted bases *
* @param counter Data structure which holds the counted bases
* @param gatkRead The SAMRecord holding all the data for this read * @param gatkRead The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus * @param offset The offset in the read for this locus
* @param refBase The reference base at this locus * @param refBase The reference base at this locus
*/ */
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) { private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
@ -453,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
final NestedHashMap data = dataManager.data; //optimization - create local reference final NestedHashMap data = dataManager.data; //optimization - create local reference
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
// initialized with zeros, will be incremented at end of method // initialized with zeros, will be incremented at end of method
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key ); datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
} }
// Need the bases to determine whether or not we have a mismatch // Need the bases to determine whether or not we have a mismatch
@ -464,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
final long curMismatches = datum.getNumMismatches(); final long curMismatches = datum.getNumMismatches();
// Add one to the number of observations and potentially one to the number of mismatches // Add one to the number of observations and potentially one to the number of mismatches
datum.incrementBaseCounts( base, refBase ); datum.incrementBaseCounts(base, refBase);
counter.countedBases++; counter.countedBases++;
counter.novelCountsBases++; counter.novelCountsBases++;
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// reduce // reduce
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker. * Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
*
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker * @return returns A PrintStream created from the -recalFile filename argument specified to the walker
*/ */
public CountedData reduceInit() { public CountedData reduceInit() {
@ -487,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/** /**
* The Reduce method doesn't do anything for this walker. * The Reduce method doesn't do anything for this walker.
*
* @param mapped Result of the map. This value is immediately ignored. * @param mapped Result of the map. This value is immediately ignored.
* @param sum The summing CountedData used to output the CSV data * @param sum The summing CountedData used to output the CSV data
* @return returns The sum used to output the CSV data * @return returns The sum used to output the CSV data
*/ */
public CountedData reduce( CountedData mapped, CountedData sum ) { public CountedData reduce(CountedData mapped, CountedData sum) {
// Do a dbSNP sanity check every so often // Do a dbSNP sanity check every so often
return validatingDbsnpMismatchRate(sum.add(mapped)); return validatingDbsnpMismatchRate(sum.add(mapped));
} }
@ -500,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* Validate the dbSNP reference mismatch rates. * Validate the dbSNP reference mismatch rates.
*/ */
private CountedData validatingDbsnpMismatchRate(CountedData counter) { private CountedData validatingDbsnpMismatchRate(CountedData counter) {
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) { if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
counter.lociSinceLastDbsnpCheck = 0; counter.lociSinceLastDbsnpCheck = 0;
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) { if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases; final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases; final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
} }
} }
@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
return counter; return counter;
} }
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) { public CountedData treeReduce(CountedData sum1, CountedData sum2) {
return validatingDbsnpMismatchRate(sum1.add(sum2)); return validatingDbsnpMismatchRate(sum1.add(sum2));
} }
/** /**
* Write out the full data hashmap to disk in CSV format * Write out the full data hashmap to disk in CSV format
*
* @param sum The CountedData to write out to RECAL_FILE * @param sum The CountedData to write out to RECAL_FILE
*/ */
public void onTraversalDone( CountedData sum ) { public void onTraversalDone(CountedData sum) {
logger.info( "Writing raw recalibration data..." ); logger.info("Writing raw recalibration data...");
if( sum.countedBases == 0L ) { if (sum.countedBases == 0L) {
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s)."); throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
} }
outputToCSV( sum, RECAL_FILE ); outputToCSV(sum, RECAL_FILE);
logger.info( "...done!" ); logger.info("...done!");
} }
/** /**
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
*
* @param recalTableStream The PrintStream to write out to * @param recalTableStream The PrintStream to write out to
*/ */
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) { private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites); recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases); recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites); recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites); recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
if( sum.solidInsertedReferenceBases != 0 ) { if (sum.solidInsertedReferenceBases != 0) {
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases); recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency); recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
} }
// Output header saying which covariates were used and in what order // Output header saying which covariates were used and in what order
for( Covariate cov : requestedCovariates ) { for (Covariate cov : requestedCovariates) {
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
} }
recalTableStream.println("nObservations,nMismatches,Qempirical"); recalTableStream.println("nObservations,nMismatches,Qempirical");
if( DONT_SORT_OUTPUT ) { if (DONT_SORT_OUTPUT) {
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} else { }
else {
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data); printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} }
@ -566,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER); recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
} }
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
final ArrayList<Comparable> keyList = new ArrayList<Comparable>(); final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
for( Object comp : data.keySet() ) { for (Object comp : data.keySet()) {
keyList.add((Comparable) comp); keyList.add((Comparable) comp);
} }
Collections.sort(keyList); Collections.sort(keyList);
for( Comparable comp : keyList ) { for (Comparable comp : keyList) {
key[curPos] = comp; key[curPos] = comp;
final Object val = data.get(comp); final Object val = data.get(comp);
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key // For each Covariate in the key
for( Object compToPrint : key ) { for (Object compToPrint : key) {
// Output the Covariate's value // Output the Covariate's value
recalTableStream.print( compToPrint + "," ); recalTableStream.print(compToPrint + ",");
} }
// Output the RecalDatum entry // Output the RecalDatum entry
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map }
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val ); else { // Another layer in the nested hash map
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
} }
} }
} }
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) { private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
for( Object comp : data.keySet() ) { for (Object comp : data.keySet()) {
key[curPos] = comp; key[curPos] = comp;
final Object val = data.get(comp); final Object val = data.get(comp);
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key // For each Covariate in the key
for( Object compToPrint : key ) { for (Object compToPrint : key) {
// Output the Covariate's value // Output the Covariate's value
recalTableStream.print( compToPrint + "," ); recalTableStream.print(compToPrint + ",");
} }
// Output the RecalDatum entry // Output the RecalDatum entry
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() ); recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map }
printMappings( recalTableStream, curPos + 1, key, (Map) val ); else { // Another layer in the nested hash map
printMappings(recalTableStream, curPos + 1, key, (Map) val);
} }
} }
} }

View File

@ -256,32 +256,6 @@ public class RecalDataManager {
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup(); GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if (readGroup == null) {
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName());
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
else {
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
}
}
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
final String oldPlatform = readGroup.getPlatform();
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
readGroup.setPlatform(oldPlatform);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM); readGroup.setPlatform(RAC.FORCE_PLATFORM);
} }

View File

@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
// Shared Command Line Arguments // Shared Command Line Arguments
////////////////////////////////// //////////////////////////////////
@Hidden @Hidden
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
public String DEFAULT_READ_GROUP = null;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null; public String DEFAULT_PLATFORM = null;
@Hidden @Hidden
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
public String FORCE_READ_GROUP = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null; public String FORCE_PLATFORM = null;
@Hidden @Hidden
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false) @Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
public int WINDOW_SIZE = 5; public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Hidden
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
@Hidden
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
public boolean EXCEPTION_IF_NO_TILE = false;
/** /**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the * CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies. * reads which have had the reference inserted because of color space inconsistencies.
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false) @Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
public int CONTEXT_SIZE = 8; public int CONTEXT_SIZE = 8;
/**
 * The number of previous bases that HomopolymerCovariate looks at.
*/
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
} }

View File

@ -86,12 +86,12 @@ import java.util.regex.Pattern;
* -o my_reads.recal.bam \ * -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv * -recalFile my_reads.recal_data.csv
* </pre> * </pre>
*
*/ */
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@WalkerName("TableRecalibration") @WalkerName("TableRecalibration")
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta @Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires -I input.bam, it also requires -R reference.fasta
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> { public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration"; public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
///////////////////////////// /////////////////////////////
// Shared Arguments // Shared Arguments
///////////////////////////// /////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); @ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
@ -110,12 +111,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate. * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/ */
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file") @Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = null; public File RECAL_FILE = null;
/** /**
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched. * A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
*/ */
@Output(doc="The output recalibrated BAM file", required=true) @Output(doc = "The output recalibrated BAM file", required = true)
private StingSAMFileWriter OUTPUT_BAM = null; private StingSAMFileWriter OUTPUT_BAM = null;
/** /**
@ -126,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
* are unmodified during recalibration, so they don't get inappropriately evaluated. * are unmodified during recalibration, so they don't get inappropriately evaluated.
*/ */
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) @Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
private int PRESERVE_QSCORES_LESS_THAN = 5; private int PRESERVE_QSCORES_LESS_THAN = 5;
/** /**
@ -135,37 +136,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example, * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
* --smoothing 15 for a large amount of smoothing. * --smoothing 15 for a large amount of smoothing.
*/ */
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") @Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1; private int SMOOTHING = 1;
/** /**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later. * by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
*/ */
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores") @Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
private int MAX_QUALITY_SCORE = 50; private int MAX_QUALITY_SCORE = 50;
/** /**
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag. * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
*/ */
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read") @Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
private boolean DO_NOT_WRITE_OQ = false; private boolean DO_NOT_WRITE_OQ = false;
///////////////////////////// /////////////////////////////
// Debugging-only Arguments // Debugging-only Arguments
///////////////////////////// /////////////////////////////
@Hidden @Hidden
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") @Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean NO_PG_TAG = false; private boolean NO_PG_TAG = false;
@Hidden @Hidden
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.") @Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
private boolean REQUIRE_EOF = false; private boolean REQUIRE_EOF = false;
@Hidden @Hidden
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations") @Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false; private boolean skipUQUpdate = false;
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
///////////////////////////// /////////////////////////////
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/ */
public void initialize() { public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } if (RAC.FORCE_PLATFORM != null) {
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates // Get a list of all available covariates
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins(); final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
boolean foundAllCovariates = false; boolean foundAllCovariates = false;
// Read in the data from the csv file and populate the data map and covariates list // Read in the data from the csv file and populate the data map and covariates list
logger.info( "Reading in the data from input csv file..." ); logger.info("Reading in the data from input csv file...");
boolean sawEOF = false; boolean sawEOF = false;
try { try {
for ( String line : new XReadLines(RECAL_FILE) ) { for (String line : new XReadLines(RECAL_FILE)) {
lineNumber++; lineNumber++;
if ( EOF_MARKER.equals(line) ) { if (EOF_MARKER.equals(line)) {
sawEOF = true; sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { }
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#') ; // Skip over the comment lines, (which start with '#')
} }
// Read in the covariates that were used from the input file // Read in the covariates that were used from the input file
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) { if (foundAllCovariates) {
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
} else { // Found the covariate list in input file, loop through all of them and instantiate them }
else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(","); String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false; boolean foundClass = false;
for( Class<?> covClass : classes ) { for (Class<?> covClass : classes) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
foundClass = true; foundClass = true;
try { try {
Covariate covariate = (Covariate)covClass.newInstance(); Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add( covariate ); requestedCovariates.add(covariate);
} catch (Exception e) { } catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e); throw new DynamicClassResolutionException(covClass, e);
} }
@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
} }
if( !foundClass ) { if (!foundClass) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
} }
} }
} }
} else { // Found a line of data }
if( !foundAllCovariates ) { else { // Found a line of data
if (!foundAllCovariates) {
foundAllCovariates = true; foundAllCovariates = true;
// At this point all the covariates should have been found and initialized // At this point all the covariates should have been found and initialized
if( requestedCovariates.size() < 2 ) { if (requestedCovariates.size() < 2) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
} }
final boolean createCollapsedTables = true; final boolean createCollapsedTables = true;
// Initialize any covariate member variables using the shared argument collection // Initialize any covariate member variables using the shared argument collection
for( Covariate cov : requestedCovariates ) { for (Covariate cov : requestedCovariates) {
cov.initialize( RAC ); cov.initialize(RAC);
} }
// Initialize the data hashMaps // Initialize the data hashMaps
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
} }
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
} }
} }
} catch ( FileNotFoundException e ) { } catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
} catch ( NumberFormatException e ) { } catch (NumberFormatException e) {
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker."); throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
} }
logger.info( "...done!" ); logger.info("...done!");
if ( !sawEOF ) { if (!sawEOF) {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
if ( REQUIRE_EOF ) if (REQUIRE_EOF)
throw new UserException.MalformedFile(RECAL_FILE, errorMessage); throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
logger.warn(errorMessage); logger.warn(errorMessage);
} }
logger.info( "The covariates being used here: " ); logger.info("The covariates being used here: ");
for( Covariate cov : requestedCovariates ) { for (Covariate cov : requestedCovariates) {
logger.info( "\t" + cov.getClass().getSimpleName() ); logger.info("\t" + cov.getClass().getSimpleName());
} }
if( dataManager == null ) { if (dataManager == null) {
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
} }
// Create the tables of empirical quality scores that will be used in the sequential calculation // Create the tables of empirical quality scores that will be used in the sequential calculation
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." ); logger.info("Generating tables of empirical qualities for use in sequential calculation...");
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
logger.info( "...done!" ); logger.info("...done!");
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used // Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone(); final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
if( !NO_PG_TAG ) { if (!NO_PG_TAG) {
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
try { try {
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
programRecord.setProgramVersion(version); programRecord.setProgramVersion(version);
} catch (MissingResourceException e) {} } catch (MissingResourceException e) {
}
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
sb.append(" Covariates=["); sb.append(" Covariates=[");
for( Covariate cov : requestedCovariates ) { for (Covariate cov : requestedCovariates) {
sb.append(cov.getClass().getSimpleName()); sb.append(cov.getClass().getSimpleName());
sb.append(", "); sb.append(", ");
} }
sb.setCharAt(sb.length()-2, ']'); sb.setCharAt(sb.length() - 2, ']');
sb.setCharAt(sb.length()-1, ' '); sb.setCharAt(sb.length() - 1, ' ');
programRecord.setCommandLine(sb.toString()); programRecord.setCommandLine(sb.toString());
List<SAMProgramRecord> oldRecords = header.getProgramRecords(); List<SAMProgramRecord> oldRecords = header.getProgramRecords();
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1); List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size() + 1);
for ( SAMProgramRecord record : oldRecords ) { for (SAMProgramRecord record : oldRecords) {
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
newRecords.add(record); newRecords.add(record);
} }
newRecords.add(programRecord); newRecords.add(programRecord);
header.setProgramRecords(newRecords); header.setProgramRecords(newRecords);
// Write out the new header // Write out the new header
OUTPUT_BAM.writeHeader( header ); OUTPUT_BAM.writeHeader(header);
} }
} }
/** /**
* For each covariate, read in a value and parse it. Associate those values with the data itself (number of observations and number of mismatches) * For each covariate, read in a value and parse it. Associate those values with the data itself (number of observations and number of mismatches)
*
* @param line A line of CSV data read from the recalibration table data file * @param line A line of CSV data read from the recalibration table data file
*/ */
private void addCSVData(final File file, final String line) { private void addCSVData(final File file, final String line) {
final String[] vals = line.split(","); final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line + throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly."); " --Perhaps the read group string contains a comma and isn't being parsed correctly.");
} }
@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final Object[] key = new Object[requestedCovariates.size()]; final Object[] key = new Object[requestedCovariates.size()];
Covariate cov; Covariate cov;
int iii; int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) { for (iii = 0; iii < requestedCovariates.size(); iii++) {
cov = requestedCovariates.get( iii ); cov = requestedCovariates.get(iii);
key[iii] = cov.getValue( vals[iii] ); key[iii] = cov.getValue(vals[iii]);
} }
// Create a new datum using the number of observations, number of mismatches, and reported quality score // Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
// Add that datum to all the collapsed tables which will be used in the sequential calculation // Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -366,64 +372,63 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
* *
* @param refBases Reference bases over the length of the read * @param refBases Reference bases over the length of the read
* @param read The read to be recalibrated * @param read The read to be recalibrated
* @return The read with quality scores replaced * @return The read with quality scores replaced
*/ */
public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads. if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
return read; return read;
} }
RecalDataManager.parseSAMRecord( read, RAC ); RecalDataManager.parseSAMRecord(read, RAC);
byte[] originalQuals = read.getBaseQualities(); byte[] originalQuals = read.getBaseQualities();
final byte[] recalQuals = originalQuals.clone(); final byte[] recalQuals = originalQuals.clone();
final String platform = read.getReadGroup().getPlatform(); final String platform = read.getReadGroup().getPlatform();
if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) { if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) { if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read ); final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
if( badColor ) { if (badColor) {
numReadsWithMalformedColorSpace++; numReadsWithMalformedColorSpace++;
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { }
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
read.setReadFailsVendorQualityCheckFlag(true); read.setReadFailsVendorQualityCheckFlag(true);
return read; return read;
} }
} }
} }
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() ); originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
} }
//compute all covariate values for this read //compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar = final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
// For each base in the read // For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) { for (int offset = 0; offset < read.getReadLength(); offset++) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null) if (qualityScore == null) {
{ qualityScore = performSequentialQualityCalculation(fullCovariateKey);
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
} }
recalQuals[offset] = qualityScore; recalQuals[offset] = qualityScore;
} }
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals)); read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
} }
if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) { if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false)); read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
} }
@ -440,27 +445,28 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* *
* Given the full recalibration table, we perform the following preprocessing steps: * Given the full recalibration table, we perform the following preprocessing steps:
* *
* - calculate the global quality score shift across all data [DeltaQ] * - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is: * - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* *
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* @param key The list of Comparables that were calculated from the covariates * @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte * @return A recalibrated quality score as a byte
*/ */
private byte performSequentialQualityCalculation( final Object... key ) { private byte performSequentialQualityCalculation(final Object... key) {
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1]; final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2]; final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3]; final Object[] covariateCollapsedKey = new Object[3];
// The global quality shift (over the read group only) // The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0]; readGroupCollapsedKey[0] = key[0];
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
double globalDeltaQ = 0.0; double globalDeltaQ = 0.0;
if( globalRecalDatum != null ) { if (globalRecalDatum != null) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
@ -469,9 +475,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// The shift in quality between reported and empirical // The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0]; qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1]; qualityScoreCollapsedKey[1] = key[1];
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
double deltaQReported = 0.0; double deltaQReported = 0.0;
if( qReportedRecalDatum != null ) { if (qReportedRecalDatum != null) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
} }
@ -481,17 +487,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
double deltaQCovariateEmpirical; double deltaQCovariateEmpirical;
covariateCollapsedKey[0] = key[0]; covariateCollapsedKey[0] = key[0];
covariateCollapsedKey[1] = key[1]; covariateCollapsedKey[1] = key[1];
for( int iii = 2; iii < key.length; iii++ ) { for (int iii = 2; iii < key.length; iii++) {
covariateCollapsedKey[2] = key[iii]; // The given covariate covariateCollapsedKey[2] = key[iii]; // The given covariate
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
if( covariateRecalDatum != null ) { if (covariateRecalDatum != null) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
} }
} }
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
// Verbose printouts used to validate with old recalibrator // Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) { //if(key.contains(null)) {
@ -508,12 +514,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Loop over the list of qualities and reset each newly recalibrated score to the original score if the original score was less than the PRESERVE_QSCORES_LESS_THAN threshold * Loop over the list of qualities and reset each newly recalibrated score to the original score if the original score was less than the PRESERVE_QSCORES_LESS_THAN threshold
*
* @param originalQuals The list of original base quality scores * @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores * @param recalQuals A list of the new recalibrated quality scores
*/ */
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) { private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
for( int iii = 0; iii < recalQuals.length; iii++ ) { for (int iii = 0; iii < recalQuals.length; iii++) {
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) { if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
recalQuals[iii] = originalQuals[iii]; recalQuals[iii] = originalQuals[iii];
} }
} }
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Start the reduce with a handle to the output bam file * Start the reduce with a handle to the output bam file
*
* @return A FileWriter pointing to a new bam file * @return A FileWriter pointing to a new bam file
*/ */
public SAMFileWriter reduceInit() { public SAMFileWriter reduceInit() {
@ -535,12 +543,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
* Output each read to disk * Output each read to disk
* @param read The read to output *
* @param read The read to output
* @param output The FileWriter to write the read to * @param output The FileWriter to write the read to
* @return The FileWriter * @return The FileWriter
*/ */
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) { public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
if( output != null ) { if (output != null) {
output.addAlignment(read); output.addAlignment(read);
} }
return output; return output;
@ -548,20 +557,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/** /**
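* Warn the user if any SOLiD reads with no calls in the color space were encountered * Warn the user if any SOLiD reads with no calls in the color space were encountered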
*
* @param output The SAMFileWriter that outputs the bam file * @param output The SAMFileWriter that outputs the bam file
*/ */
public void onTraversalDone(SAMFileWriter output) { public void onTraversalDone(SAMFileWriter output) {
if( numReadsWithMalformedColorSpace != 0 ) { if (numReadsWithMalformedColorSpace != 0) {
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) { if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!"); "These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) { }
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " + Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " + "because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " + "for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads were completely removed from the output bam file."); "These reads were completely removed from the output bam file.");
} }
} }