BQSR Parameter cleanup

* get rid of 320C argument that nobody uses.
* get rid of DEFAULT_READ_GROUP parameter and functionality (later to become an engine argument).
Mauricio Carneiro 2012-02-07 13:22:46 -05:00
parent 717cd4b912
commit 0d3ea0401c
4 changed files with 280 additions and 293 deletions


@ -77,20 +77,20 @@ import java.util.Map;
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* It is a comma-separated text file relating the desired covariates to the number of such bases, their rate of mismatch in the genome, and the implied empirical quality score.
*
* The first 20 lines of such a file are shown below.
* * The file begins with a series of comment lines describing:
* ** The number of counted loci
* ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
*
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
* depending on which covariates were specified at runtime. The last three items are the data: the number of observations for this combination of covariates, the number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*
* <pre>
* # Counted Sites 19451059
* # Counted Bases 56582018
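The raw empirical quality in the last column is simply the phred-scaled mismatch rate of that covariate bin. A minimal sketch of the conversion, using a hypothetical helper rather than the walker's own code (the pseudocounts exist only to avoid log10(0) and are not GATK's smoothing scheme):

// Hypothetical sketch: phred-scale a bin's mismatch rate into an empirical quality.
static byte empiricalQuality(final long nObservations, final long nMismatches) {
    final double errorRate = (nMismatches + 1.0) / (nObservations + 2.0); // pseudocounts guard against log10(0)
    final double phred = -10.0 * Math.log10(errorRate);
    return (byte) Math.round(Math.min(phred, 93.0)); // keep the result within the printable SAM quality range
}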
@ -129,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
* </pre>
*
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
// Filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
// Shared Arguments
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/////////////////////////////
// Command Line Arguments
@ -160,7 +162,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
* Please note, however, that the statistics reported by the tool will not accurately reflect the sites skipped by the -XL argument.
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
/**
@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* three items are the data: the number of observations for this combination of covariates, the number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
private boolean LIST_ONLY = false;
/**
* See the -list argument to view available covariates.
*/
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
private boolean USE_STANDARD_COVARIATES = false;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
@Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;
/////////////////////////////
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Adds the values of other to this, returning this
*
* @param other The other CountedData object whose counts will be added to this one
* @return this object
*/
@ -247,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/
public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
if (RAC.FORCE_PLATFORM != null) {
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
logger.info( "Available covariates:" );
for( Class<?> covClass : covariateClasses ) {
logger.info( covClass.getSimpleName() );
if (LIST_ONLY) {
logger.info("Available covariates:");
for (Class<?> covClass : covariateClasses) {
logger.info(covClass.getSimpleName());
}
logger.info("");
System.exit( 0 ); // Early exit here because user requested it
System.exit(0); // Early exit here because user requested it
}
// Warn the user if no dbSNP file or other variant mask was specified
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
}
// Initialize the requested covariates by parsing the -cov argument
// First add the required covariates
if( requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
requestedCovariates.add( new QualityScoreCovariate() );
} else {
if (requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
requestedCovariates.add(new QualityScoreCovariate());
}
else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
}
// Next add the standard covariates if -standard was specified by the user
if( USE_STANDARD_COVARIATES ) {
if (USE_STANDARD_COVARIATES) {
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
// A list of Classes can't be sorted, but a list of Class names can be
final List<String> standardClassNames = new ArrayList<String>();
for( Class<?> covClass : standardClasses ) {
standardClassNames.add( covClass.getName() );
for (Class<?> covClass : standardClasses) {
standardClassNames.add(covClass.getName());
}
Collections.sort(standardClassNames); // Sort the list of class names
for( String className : standardClassNames ) {
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
if( covClass.getName().equals( className ) ) {
for (String className : standardClassNames) {
for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
if (covClass.getName().equals(className)) {
try {
final Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
final Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
}
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
if( COVARIATES != null ) {
for( String requestedCovariateString : COVARIATES ) {
if (COVARIATES != null) {
for (String requestedCovariateString : COVARIATES) {
boolean foundClass = false;
for( Class<?> covClass : covariateClasses ) {
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
for (Class<?> covClass : covariateClasses) {
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
foundClass = true;
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
try {
// Now that we've found a matching class, try to instantiate it
final Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
final Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
}
if( !foundClass ) {
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
if (!foundClass) {
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
}
}
}
logger.info( "The covariates being used here: " );
for( Covariate cov : requestedCovariates ) {
logger.info( "\t" + cov.getClass().getSimpleName() );
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
logger.info("The covariates being used here: ");
for (Covariate cov : requestedCovariates) {
logger.info("\t" + cov.getClass().getSimpleName());
cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
}
}
//---------------------------------------------------------------------------------------------------------------
//
// map
@ -342,62 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location.
*
* @param tracker The reference metadata tracker
* @param ref The reference context
* @param context The alignment context
* @return A CountedData object with the counts gathered at this locus, to be added to the running sum in the reduce step
*/
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
CountedData counter = new CountedData();
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
// For each read at this locus
for( final PileupElement p : context.getBasePileup() ) {
for (final PileupElement p : context.getBasePileup()) {
final GATKSAMRecord gatkRead = p.getRead();
int offset = p.getOffset();
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
continue;
}
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
{
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
RecalDataManager.parseSAMRecord( gatkRead, RAC );
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
RecalDataManager.parseSAMRecord(gatkRead, RAC);
// Skip over reads with no calls in the color space if the user requested it
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
continue;
}
RecalDataManager.parseColorSpace( gatkRead );
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
}
// Skip this position if base quality is zero
if( gatkRead.getBaseQualities()[offset] > 0 ) {
if (gatkRead.getBaseQualities()[offset] > 0) {
byte[] bases = gatkRead.getReadBases();
byte refBase = ref.getBase();
// Skip if this base is an 'N' or other non-regular base
if( BaseUtils.isRegularBase( bases[offset] ) ) {
if (BaseUtils.isRegularBase(bases[offset])) {
// SOLID bams have inserted the reference base into the read if the color space is inconsistent with the read base, so skip it
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
!RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( counter, gatkRead, offset, refBase );
updateDataFromRead(counter, gatkRead, offset, refBase);
} else { // calculate SOLID reference insertion rate
if( refBase == bases[offset] ) {
}
else { // calculate SOLID reference insertion rate
if (refBase == bases[offset]) {
counter.solidInsertedReferenceBases++;
} else {
}
else {
counter.otherColorSpaceInconsistency++;
}
}
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
}
counter.countedSites++;
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
}
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
counter.skippedSites++;
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dbSNP mismatch rate is reasonable
}
@ -413,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
return counter;
}
/**
* Update the mismatch / total_base counts for a given class of loci.
*
* @param counter The CountedData to be updated
@ -421,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* @param refBase The reference base
*/
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
for( PileupElement p : context.getBasePileup() ) {
for (PileupElement p : context.getBasePileup()) {
final byte readBase = p.getBase();
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
if( readBaseIndex != refBaseIndex ) {
if (readBaseIndex != -1 && refBaseIndex != -1) {
if (readBaseIndex != refBaseIndex) {
counter.novelCountsMM++;
}
counter.novelCountsBases++;
@ -439,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* Major workhorse routine for this walker.
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches.
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation.
*
* @param counter Data structure which holds the counted bases
* @param gatkRead The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus
* @param refBase The reference base at this locus
*/
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
@ -453,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
final NestedHashMap data = dataManager.data; //optimization - create local reference
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
// initialized with zeros, will be incremented at end of method
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
}
// Need the bases to determine whether or not we have a mismatch
@ -464,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
final long curMismatches = datum.getNumMismatches();
// Add one to the number of observations and potentially one to the number of mismatches
datum.incrementBaseCounts( base, refBase );
datum.incrementBaseCounts(base, refBase);
counter.countedBases++;
counter.novelCountsBases++;
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dbSNP mismatch rate is reasonable
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
*
* @return A new CountedData object used to accumulate the summed counts
*/
public CountedData reduceInit() {
@ -487,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* The Reduce method adds the result of the map step to the running sum, with a periodic dbSNP sanity check.
*
* @param mapped Result of the map step at this locus
* @param sum The summing CountedData used to output the CSV data
* @return The sum used to output the CSV data
*/
public CountedData reduce( CountedData mapped, CountedData sum ) {
public CountedData reduce(CountedData mapped, CountedData sum) {
// Do a dbSNP sanity check every so often
return validatingDbsnpMismatchRate(sum.add(mapped));
}
@ -500,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* Validate the dbSNP reference mismatch rates.
*/
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
counter.lociSinceLastDbsnpCheck = 0;
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
}
}
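As a worked illustration of the check above (all numbers invented): 1,000 mismatches over 100,000 novel bases gives fractionMM_novel = 0.01, while 30,000 mismatches over 150,000 dbSNP-site bases gives fractionMM_dbsnp = 0.20, twenty times the novel rate, so the warning would not fire for any reasonable value of DBSNP_VS_NOVEL_MISMATCH_RATE. If the dbSNP rate instead came out close to the novel rate, the supplied known-sites track probably does not match the data; the warning is printed and the check interval is doubled so it is not repeated every megabase.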
@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
return counter;
}
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
public CountedData treeReduce(CountedData sum1, CountedData sum2) {
return validatingDbsnpMismatchRate(sum1.add(sum2));
}
/**
* Write out the full data hashmap to disk in CSV format
*
* @param sum The CountedData to write out to RECAL_FILE
*/
public void onTraversalDone( CountedData sum ) {
logger.info( "Writing raw recalibration data..." );
if( sum.countedBases == 0L ) {
public void onTraversalDone(CountedData sum) {
logger.info("Writing raw recalibration data...");
if (sum.countedBases == 0L) {
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
}
outputToCSV( sum, RECAL_FILE );
logger.info( "...done!" );
outputToCSV(sum, RECAL_FILE);
logger.info("...done!");
}
/**
* For each entry (key-value pair) in the data hashmap, output the Covariate's values as well as the RecalDatum's data in CSV format
*
* @param recalTableStream The PrintStream to write out to
*/
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
if( sum.solidInsertedReferenceBases != 0 ) {
if (sum.solidInsertedReferenceBases != 0) {
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
}
// Output header saying which covariates were used and in what order
for( Covariate cov : requestedCovariates ) {
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
for (Covariate cov : requestedCovariates) {
recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
}
recalTableStream.println("nObservations,nMismatches,Qempirical");
if( DONT_SORT_OUTPUT ) {
if (DONT_SORT_OUTPUT) {
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} else {
}
else {
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
}
@ -566,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
}
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
for( Object comp : data.keySet() ) {
for (Object comp : data.keySet()) {
keyList.add((Comparable) comp);
}
Collections.sort(keyList);
for( Comparable comp : keyList ) {
for (Comparable comp : keyList) {
key[curPos] = comp;
final Object val = data.get(comp);
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key
for( Object compToPrint : key ) {
for (Object compToPrint : key) {
// Output the Covariate's value
recalTableStream.print( compToPrint + "," );
recalTableStream.print(compToPrint + ",");
}
// Output the RecalDatum entry
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
} else { // Another layer in the nested hash map
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
}
else { // Another layer in the nested hash map
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
}
}
}
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
for( Object comp : data.keySet() ) {
private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
for (Object comp : data.keySet()) {
key[curPos] = comp;
final Object val = data.get(comp);
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key
for( Object compToPrint : key ) {
for (Object compToPrint : key) {
// Output the Covariate's value
recalTableStream.print( compToPrint + "," );
recalTableStream.print(compToPrint + ",");
}
// Output the RecalDatum entry
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
} else { // Another layer in the nested hash map
printMappings( recalTableStream, curPos + 1, key, (Map) val );
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
}
else { // Another layer in the nested hash map
printMappings(recalTableStream, curPos + 1, key, (Map) val);
}
}
}


@ -256,32 +256,6 @@ public class RecalDataManager {
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if (readGroup == null) {
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName());
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
else {
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
}
}
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
final String oldPlatform = readGroup.getPlatform();
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
readGroup.setPlatform(oldPlatform);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);
}
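The hunk above shows only the deleted lines, so the surviving behavior is an assumption: with the DEFAULT_READ_GROUP fallback gone, a read with no read group presumably has nowhere to go but the error path that already existed in the else branch. A sketch of that remaining check (not the actual new method body, which is not shown in this hunk):

// Assumed remainder of the null-read-group handling once the default fallback is removed.
if (readGroup == null) {
    throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
}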


@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
// Shared Command Line Arguments
//////////////////////////////////
@Hidden
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
public String DEFAULT_READ_GROUP = null;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
public String FORCE_READ_GROUP = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Hidden
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
@Hidden
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
public boolean EXCEPTION_IF_NO_TILE = false;
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
public int CONTEXT_SIZE = 8;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
}


@ -86,12 +86,12 @@ import java.util.regex.Pattern;
* -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv
* </pre>
*
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@WalkerName("TableRecalibration")
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires -I input.bam, it also requires -R reference.fasta
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/////////////////////////////
// Shared Arguments
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/////////////////////////////
// Command Line Arguments
@ -110,12 +111,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* three items are the data: the number of observations for this combination of covariates, the number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
@Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = null;
/**
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
*/
@Output(doc="The output recalibrated BAM file", required=true)
@Output(doc = "The output recalibrated BAM file", required = true)
private StingSAMFileWriter OUTPUT_BAM = null;
/**
@ -126,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
* are unmodified during recalibration, so they don't get inappropriately evaluated.
*/
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
@Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
/**
@ -135,37 +136,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
* --smoothing 15 for a large amount of smoothing.
*/
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
@Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
/**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later.
*/
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
@Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
private int MAX_QUALITY_SCORE = 50;
/**
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
*/
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
@Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
private boolean DO_NOT_WRITE_OQ = false;
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Hidden
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
@Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean NO_PG_TAG = false;
@Hidden
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
@Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
private boolean REQUIRE_EOF = false;
@Hidden
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
if (RAC.FORCE_PLATFORM != null) {
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
boolean foundAllCovariates = false;
// Read in the data from the csv file and populate the data map and covariates list
logger.info( "Reading in the data from input csv file..." );
logger.info("Reading in the data from input csv file...");
boolean sawEOF = false;
try {
for ( String line : new XReadLines(RECAL_FILE) ) {
for (String line : new XReadLines(RECAL_FILE)) {
lineNumber++;
if ( EOF_MARKER.equals(line) ) {
if (EOF_MARKER.equals(line)) {
sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
}
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#')
}
// Read in the covariates that were used from the input file
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
} else { // Found the covariate list in input file, loop through all of them and instantiate them
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
if (foundAllCovariates) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
}
else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
for (Class<?> covClass : classes) {
if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate );
Covariate covariate = (Covariate) covClass.newInstance();
requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
}
if( !foundClass ) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
if (!foundClass) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
}
}
}
} else { // Found a line of data
if( !foundAllCovariates ) {
}
else { // Found a line of data
if (!foundAllCovariates) {
foundAllCovariates = true;
// At this point all the covariates should have been found and initialized
if( requestedCovariates.size() < 2 ) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
if (requestedCovariates.size() < 2) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
}
final boolean createCollapsedTables = true;
// Initialize any covariate member variables using the shared argument collection
for( Covariate cov : requestedCovariates ) {
cov.initialize( RAC );
for (Covariate cov : requestedCovariates) {
cov.initialize(RAC);
}
// Initialize the data hashMaps
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
}
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
}
}
} catch ( FileNotFoundException e ) {
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
} catch ( NumberFormatException e ) {
} catch (NumberFormatException e) {
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
}
logger.info( "...done!" );
logger.info("...done!");
if ( !sawEOF ) {
if (!sawEOF) {
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
if ( REQUIRE_EOF )
if (REQUIRE_EOF)
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
logger.warn(errorMessage);
}
logger.info( "The covariates being used here: " );
for( Covariate cov : requestedCovariates ) {
logger.info( "\t" + cov.getClass().getSimpleName() );
logger.info("The covariates being used here: ");
for (Covariate cov : requestedCovariates) {
logger.info("\t" + cov.getClass().getSimpleName());
}
if( dataManager == null ) {
if (dataManager == null) {
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
}
// Create the tables of empirical quality scores that will be used in the sequential calculation
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
logger.info( "...done!" );
logger.info("Generating tables of empirical qualities for use in sequential calculation...");
dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
logger.info("...done!");
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
if( !NO_PG_TAG ) {
if (!NO_PG_TAG) {
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
try {
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
programRecord.setProgramVersion(version);
} catch (MissingResourceException e) {}
} catch (MissingResourceException e) {
}
StringBuffer sb = new StringBuffer();
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
sb.append(" Covariates=[");
for( Covariate cov : requestedCovariates ) {
for (Covariate cov : requestedCovariates) {
sb.append(cov.getClass().getSimpleName());
sb.append(", ");
}
sb.setCharAt(sb.length()-2, ']');
sb.setCharAt(sb.length()-1, ' ');
sb.setCharAt(sb.length() - 2, ']');
sb.setCharAt(sb.length() - 1, ' ');
programRecord.setCommandLine(sb.toString());
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
for ( SAMProgramRecord record : oldRecords ) {
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size() + 1);
for (SAMProgramRecord record : oldRecords) {
if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
newRecords.add(record);
}
newRecords.add(programRecord);
header.setProgramRecords(newRecords);
// Write out the new header
OUTPUT_BAM.writeHeader( header );
OUTPUT_BAM.writeHeader(header);
}
}
/**
* For each covariate, read in a value and parse it. Associate those values with the data itself (num observations and num mismatches)
*
* @param line A line of CSV data read from the recalibration table data file
*/
private void addCSVData(final File file, final String line) {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final Object[] key = new Object[requestedCovariates.size()];
Covariate cov;
int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
cov = requestedCovariates.get( iii );
key[iii] = cov.getValue( vals[iii] );
for (iii = 0; iii < requestedCovariates.size(); iii++) {
cov = requestedCovariates.get(iii);
key[iii] = cov.getValue(vals[iii]);
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
}
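To make the parsing above concrete, a data line for a ReadGroup, QualityScore, Cycle, Dinuc covariate ordering could look like the invented example below: the first four fields are handed to each covariate's getValue() to build the key, vals[1] (the quality score column) is also re-parsed into the RecalDatum, and the last three fields are nObservations, nMismatches and Qempirical.

SRR012345.1,25,3,AC,110452,327,25.31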
//---------------------------------------------------------------------------------------------------------------
@ -366,64 +372,63 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
*
* @param refBases Reference bases over the length of the read
* @param read The read to be recalibrated
* @return The read with quality scores replaced
*/
public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
return read;
}
RecalDataManager.parseSAMRecord( read, RAC );
RecalDataManager.parseSAMRecord(read, RAC);
byte[] originalQuals = read.getBaseQualities();
final byte[] recalQuals = originalQuals.clone();
final String platform = read.getReadGroup().getPlatform();
if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) {
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) {
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
if( badColor ) {
if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
if (badColor) {
numReadsWithMalformedColorSpace++;
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
}
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
read.setReadFailsVendorQualityCheckFlag(true);
return read;
}
}
}
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() );
originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
}
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
// For each base in the read
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
for (int offset = 0; offset < read.getReadLength(); offset++) {
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
if (qualityScore == null) {
qualityScore = performSequentialQualityCalculation(fullCovariateKey);
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
recalQuals[offset] = qualityScore;
}
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
}
if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
}
@@ -440,27 +445,28 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*
* Given the full recalibration table, we perform the following preprocessing steps:
*
* - calculate the global quality score shift across all data [DeltaQ]
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(final Object... key) {
final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
final Object[] readGroupCollapsedKey = new Object[1];
final Object[] qualityScoreCollapsedKey = new Object[2];
final Object[] covariateCollapsedKey = new Object[3];
// The global quality shift (over the read group only)
readGroupCollapsedKey[0] = key[0];
final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
double globalDeltaQ = 0.0;
if (globalRecalDatum != null) {
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
@@ -469,9 +475,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// The shift in quality between reported and empirical
qualityScoreCollapsedKey[0] = key[0];
qualityScoreCollapsedKey[1] = key[1];
final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
double deltaQReported = 0.0;
if (qReportedRecalDatum != null) {
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
}
@@ -481,17 +487,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
double deltaQCovariateEmpirical;
covariateCollapsedKey[0] = key[0];
covariateCollapsedKey[1] = key[1];
for (int iii = 2; iii < key.length; iii++) {
covariateCollapsedKey[2] = key[iii]; // The given covariate
final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
if (covariateRecalDatum != null) {
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
}
}
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
// Verbose printouts used to validate with old recalibrator
//if(key.contains(null)) {
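// Illustrative sketch only (hypothetical table values, not part of this walker): a worked
// instance of the sequential shift described above. Starting from a reported Q25, assume the
// recalibration table yields a global shift of -1.2, a per-quality-score shift of +0.4, and a
// single remaining covariate (dinuc) shift of -0.3:
//
//   qualFromRead      = 25.0
//   globalDeltaQ      = -1.2   // empirical - estimated reported, over the read group
//   deltaQReported    = +0.4   // empirical - reported - globalDeltaQ
//   deltaQCovariates  = -0.3   // summed over the remaining covariates
//
//   newQuality = 25.0 - 1.2 + 0.4 - 0.3 = 23.9
//   QualityUtils.boundQual((int) Math.round(23.9), (byte) MAX_QUALITY_SCORE)  ->  Q24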
@@ -508,12 +514,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
*
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
for (int iii = 0; iii < recalQuals.length; iii++) {
if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
recalQuals[iii] = originalQuals[iii];
}
}
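// Hedged example with hypothetical values: if PRESERVE_QSCORES_LESS_THAN is 5, original
// qualities {2, 20, 30} and recalibrated qualities {10, 25, 28} end up as {2, 25, 28};
// the Q2 base keeps its original score because bases below the threshold are never rewritten.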
@@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Start the reduce with a handle to the output bam file
*
* @return A FileWriter pointing to a new bam file
*/
public SAMFileWriter reduceInit() {
@@ -535,12 +543,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Output each read to disk
*
* @param read The read to output
* @param output The FileWriter to write the read to
* @return The FileWriter
*/
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
if (output != null) {
output.addAlignment(read);
}
return output;
@@ -548,20 +557,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Do nothing
*
* @param output The SAMFileWriter that outputs the bam file
*/
public void onTraversalDone(SAMFileWriter output) {
if (numReadsWithMalformedColorSpace != 0) {
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
}
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads were completely removed from the output bam file.");
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads were completely removed from the output bam file.");
}
}