BQSR Parameter cleanup
* get rid of 320C argument that nobody uses. * get rid of DEFAULT_READ_GROUP parameter and functionality (later to become an engine argument).
This commit is contained in:
parent
717cd4b912
commit
0d3ea0401c
|
|
@ -77,20 +77,20 @@ import java.util.Map;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
*
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* * The file begins with a series of comment lines describing:
|
||||
* ** The number of counted loci
|
||||
* ** The number of counted bases
|
||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*
|
||||
*
|
||||
* <pre>
|
||||
* # Counted Sites 19451059
|
||||
* # Counted Bases 56582018
|
||||
|
|
@ -129,13 +129,14 @@ import java.util.Map;
|
|||
* -cov DinucCovariate \
|
||||
* -recalFile my_reads.recal_data.csv
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
|
||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
|
||||
// Filter out all reads with zero or unavailable mapping quality
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
// This walker requires both -I input.bam and -R reference.fasta
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
||||
|
||||
|
|
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
@ArgumentCollection
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
|
|
@ -160,7 +162,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
|
||||
* Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument.
|
||||
*/
|
||||
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
|
||||
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
|
||||
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||
|
||||
/**
|
||||
|
|
@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
|
||||
@Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
|
||||
@Gather(CountCovariatesGatherer.class)
|
||||
public PrintStream RECAL_FILE;
|
||||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
|
||||
private boolean LIST_ONLY = false;
|
||||
|
||||
/**
|
||||
* See the -list argument to view available covariates.
|
||||
*/
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
|
||||
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
|
||||
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
|
||||
private boolean USE_STANDARD_COVARIATES = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
@Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
private boolean DONT_SORT_OUTPUT = false;
|
||||
|
||||
/**
|
||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||
*/
|
||||
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
private boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* Adds the values of other to this, returning this
|
||||
*
|
||||
* @param other
|
||||
* @return this object
|
||||
*/
|
||||
|
|
@ -247,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
if (RAC.FORCE_PLATFORM != null) {
|
||||
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
logger.info( "Available covariates:" );
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
logger.info( covClass.getSimpleName() );
|
||||
if (LIST_ONLY) {
|
||||
logger.info("Available covariates:");
|
||||
for (Class<?> covClass : covariateClasses) {
|
||||
logger.info(covClass.getSimpleName());
|
||||
}
|
||||
logger.info("");
|
||||
|
||||
System.exit( 0 ); // Early exit here because user requested it
|
||||
System.exit(0); // Early exit here because user requested it
|
||||
}
|
||||
|
||||
// Warn the user if no dbSNP file or other variant mask was specified
|
||||
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
|
||||
if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
|
||||
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
|
||||
}
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
// First add the required covariates
|
||||
if( requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
} else {
|
||||
if (requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
|
||||
requestedCovariates.add(new QualityScoreCovariate());
|
||||
}
|
||||
else {
|
||||
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
|
||||
}
|
||||
// Next add the standard covariates if -standard was specified by the user
|
||||
if( USE_STANDARD_COVARIATES ) {
|
||||
if (USE_STANDARD_COVARIATES) {
|
||||
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
|
||||
// A list of Classes can't be sorted, but a list of Class names can be
|
||||
final List<String> standardClassNames = new ArrayList<String>();
|
||||
for( Class<?> covClass : standardClasses ) {
|
||||
standardClassNames.add( covClass.getName() );
|
||||
for (Class<?> covClass : standardClasses) {
|
||||
standardClassNames.add(covClass.getName());
|
||||
}
|
||||
Collections.sort(standardClassNames); // Sort the list of class names
|
||||
for( String className : standardClassNames ) {
|
||||
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
|
||||
if( covClass.getName().equals( className ) ) {
|
||||
for (String className : standardClassNames) {
|
||||
for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
|
||||
if (covClass.getName().equals(className)) {
|
||||
try {
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
||||
if( COVARIATES != null ) {
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
if (COVARIATES != null) {
|
||||
for (String requestedCovariateString : COVARIATES) {
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
for (Class<?> covClass : covariateClasses) {
|
||||
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
|
||||
if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
if (!foundClass) {
|
||||
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
|
|
@ -342,62 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
* whether or not the base matches the reference at this particular location
|
||||
*
|
||||
* @param tracker The reference metadata tracker
|
||||
* @param ref The reference context
|
||||
* @param ref The reference context
|
||||
* @param context The alignment context
|
||||
* @return Returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||
CountedData counter = new CountedData();
|
||||
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
// For each read at this locus
|
||||
for( final PileupElement p : context.getBasePileup() ) {
|
||||
for (final PileupElement p : context.getBasePileup()) {
|
||||
final GATKSAMRecord gatkRead = p.getRead();
|
||||
int offset = p.getOffset();
|
||||
|
||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
||||
if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
|
||||
{
|
||||
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
|
||||
RecalDataManager.parseSAMRecord( gatkRead, RAC );
|
||||
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
|
||||
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
|
||||
RecalDataManager.parseSAMRecord(gatkRead, RAC);
|
||||
|
||||
// Skip over reads with no calls in the color space if the user requested it
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
|
||||
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
|
||||
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
|
||||
gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
RecalDataManager.parseColorSpace( gatkRead );
|
||||
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
|
||||
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
|
||||
RecalDataManager.parseColorSpace(gatkRead);
|
||||
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
|
||||
}
|
||||
|
||||
// Skip this position if base quality is zero
|
||||
if( gatkRead.getBaseQualities()[offset] > 0 ) {
|
||||
if (gatkRead.getBaseQualities()[offset] > 0) {
|
||||
|
||||
byte[] bases = gatkRead.getReadBases();
|
||||
byte refBase = ref.getBase();
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( bases[offset] ) ) {
|
||||
if (BaseUtils.isRegularBase(bases[offset])) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
|
||||
if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( counter, gatkRead, offset, refBase );
|
||||
updateDataFromRead(counter, gatkRead, offset, refBase);
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( refBase == bases[offset] ) {
|
||||
}
|
||||
else { // calculate SOLID reference insertion rate
|
||||
if (refBase == bases[offset]) {
|
||||
counter.solidInsertedReferenceBases++;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
counter.otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
|
|
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
counter.countedSites++;
|
||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
}
|
||||
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
counter.skippedSites++;
|
||||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
|
@ -413,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counter The CountedData to be updated
|
||||
|
|
@ -421,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* @param refBase The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
||||
for( PileupElement p : context.getBasePileup() ) {
|
||||
for (PileupElement p : context.getBasePileup()) {
|
||||
final byte readBase = p.getBase();
|
||||
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
||||
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
|
||||
if( readBaseIndex != refBaseIndex ) {
|
||||
if (readBaseIndex != -1 && refBaseIndex != -1) {
|
||||
if (readBaseIndex != refBaseIndex) {
|
||||
counter.novelCountsMM++;
|
||||
}
|
||||
counter.novelCountsBases++;
|
||||
|
|
@ -439,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
*
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* @param gatkRead The SAMRecord holding all the data for this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
|
||||
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
|
|
@ -453,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
|
||||
if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
// initialized with zeros, will be incremented at end of method
|
||||
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
|
||||
datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
|
|
@ -464,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
final long curMismatches = datum.getNumMismatches();
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.incrementBaseCounts( base, refBase );
|
||||
datum.incrementBaseCounts(base, refBase);
|
||||
counter.countedBases++;
|
||||
counter.novelCountsBases++;
|
||||
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
*
|
||||
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
||||
*/
|
||||
public CountedData reduceInit() {
|
||||
|
|
@ -487,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
*
|
||||
* @param mapped Result of the map. This value is immediately ignored.
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @return returns The sum used to output the CSV data
|
||||
*/
|
||||
public CountedData reduce( CountedData mapped, CountedData sum ) {
|
||||
public CountedData reduce(CountedData mapped, CountedData sum) {
|
||||
// Do a dbSNP sanity check every so often
|
||||
return validatingDbsnpMismatchRate(sum.add(mapped));
|
||||
}
|
||||
|
|
@ -500,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Validate the dbSNP reference mismatch rates.
|
||||
*/
|
||||
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
||||
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
|
||||
counter.lociSinceLastDbsnpCheck = 0;
|
||||
|
||||
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
|
||||
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
|
||||
if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
|
||||
final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
|
||||
|
||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
||||
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||
}
|
||||
}
|
||||
|
|
@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
|
||||
public CountedData treeReduce(CountedData sum1, CountedData sum2) {
|
||||
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
*
|
||||
* @param sum The CountedData to write out to RECAL_FILE
|
||||
*/
|
||||
public void onTraversalDone( CountedData sum ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
if( sum.countedBases == 0L ) {
|
||||
public void onTraversalDone(CountedData sum) {
|
||||
logger.info("Writing raw recalibration data...");
|
||||
if (sum.countedBases == 0L) {
|
||||
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
|
||||
}
|
||||
outputToCSV( sum, RECAL_FILE );
|
||||
logger.info( "...done!" );
|
||||
outputToCSV(sum, RECAL_FILE);
|
||||
logger.info("...done!");
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
*
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
|
||||
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
|
||||
|
||||
if( sum.solidInsertedReferenceBases != 0 ) {
|
||||
if (sum.solidInsertedReferenceBases != 0) {
|
||||
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
||||
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
||||
}
|
||||
|
||||
// Output header saying which covariates were used and in what order
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
|
||||
}
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
|
||||
if( DONT_SORT_OUTPUT ) {
|
||||
if (DONT_SORT_OUTPUT) {
|
||||
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
}
|
||||
|
||||
|
|
@ -566,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
||||
}
|
||||
|
||||
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
||||
for( Object comp : data.keySet() ) {
|
||||
for (Object comp : data.keySet()) {
|
||||
keyList.add((Comparable) comp);
|
||||
}
|
||||
|
||||
Collections.sort(keyList);
|
||||
|
||||
for( Comparable comp : keyList ) {
|
||||
for (Comparable comp : keyList) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for( Object comp : data.keySet() ) {
|
||||
private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for (Object comp : data.keySet()) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappings( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappings(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -256,32 +256,6 @@ public class RecalDataManager {
|
|||
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
|
||||
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
|
||||
|
||||
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
|
||||
if (readGroup == null) {
|
||||
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
|
||||
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName());
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
|
||||
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
|
||||
((GATKSAMRecord) read).setReadGroup(readGroup);
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
|
||||
final String oldPlatform = readGroup.getPlatform();
|
||||
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
|
||||
readGroup.setPlatform(oldPlatform);
|
||||
((GATKSAMRecord) read).setReadGroup(readGroup);
|
||||
}
|
||||
|
||||
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||
readGroup.setPlatform(RAC.FORCE_PLATFORM);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
|
|||
// Shared Command Line Arguments
|
||||
//////////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
|
||||
public String DEFAULT_READ_GROUP = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
|
||||
public String FORCE_READ_GROUP = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
|
||||
public int WINDOW_SIZE = 5;
|
||||
|
||||
/**
|
||||
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
|
||||
public int HOMOPOLYMER_NBACK = 7;
|
||||
@Hidden
|
||||
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
|
||||
public boolean EXCEPTION_IF_NO_TILE = false;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
* reads which have had the reference inserted because of color space inconsistencies.
|
||||
|
|
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
|
||||
public int CONTEXT_SIZE = 8;
|
||||
|
||||
/**
|
||||
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
|
||||
*/
|
||||
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
|
||||
public int HOMOPOLYMER_NBACK = 7;
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -86,12 +86,12 @@ import java.util.regex.Pattern;
|
|||
* -o my_reads.recal.bam \
|
||||
* -recalFile my_reads.recal_data.csv
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
// This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
|
||||
|
|
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
@ArgumentCollection
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
|
|
@ -110,12 +111,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
|
||||
@Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
|
||||
public File RECAL_FILE = null;
|
||||
/**
|
||||
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
|
||||
*/
|
||||
@Output(doc="The output recalibrated BAM file", required=true)
|
||||
@Output(doc = "The output recalibrated BAM file", required = true)
|
||||
private StingSAMFileWriter OUTPUT_BAM = null;
|
||||
|
||||
/**
|
||||
|
|
@ -126,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
|
||||
* are unmodified during recalibration, so they don't get inappropriately evaluated.
|
||||
*/
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
@Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
|
||||
/**
|
||||
|
|
@ -135,37 +136,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
|
||||
* --smoothing 15 for a large amount of smoothing.
|
||||
*/
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
@Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
|
||||
/**
|
||||
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
||||
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
|
||||
*/
|
||||
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
|
||||
@Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
|
||||
/**
|
||||
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
|
||||
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
|
||||
*/
|
||||
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
|
||||
@Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
|
||||
private boolean DO_NOT_WRITE_OQ = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
@Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean NO_PG_TAG = false;
|
||||
@Hidden
|
||||
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||
@Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||
private boolean REQUIRE_EOF = false;
|
||||
@Hidden
|
||||
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||
private boolean skipUQUpdate = false;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
if (RAC.FORCE_PLATFORM != null) {
|
||||
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
|
@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
boolean foundAllCovariates = false;
|
||||
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
logger.info( "Reading in the data from input csv file..." );
|
||||
logger.info("Reading in the data from input csv file...");
|
||||
|
||||
boolean sawEOF = false;
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
for (String line : new XReadLines(RECAL_FILE)) {
|
||||
lineNumber++;
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
if (EOF_MARKER.equals(line)) {
|
||||
sawEOF = true;
|
||||
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
|
||||
}
|
||||
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
|
||||
if (foundAllCovariates) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
|
||||
}
|
||||
else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
for (Class<?> covClass : classes) {
|
||||
if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
if (!foundClass) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
}
|
||||
else { // Found a line of data
|
||||
if (!foundAllCovariates) {
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
if (requestedCovariates.size() < 2) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||
}
|
||||
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( RAC );
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
cov.initialize(RAC);
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
|
||||
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch ( NumberFormatException e ) {
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
logger.info( "...done!" );
|
||||
logger.info("...done!");
|
||||
|
||||
if ( !sawEOF ) {
|
||||
if (!sawEOF) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
if ( REQUIRE_EOF )
|
||||
if (REQUIRE_EOF)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
logger.warn(errorMessage);
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
if( dataManager == null ) {
|
||||
if (dataManager == null) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||
}
|
||||
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
|
||||
logger.info( "...done!" );
|
||||
logger.info("Generating tables of empirical qualities for use in sequential calculation...");
|
||||
dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
|
||||
logger.info("...done!");
|
||||
|
||||
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
|
||||
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
|
||||
if( !NO_PG_TAG ) {
|
||||
if (!NO_PG_TAG) {
|
||||
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||
try {
|
||||
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
||||
programRecord.setProgramVersion(version);
|
||||
} catch (MissingResourceException e) {}
|
||||
} catch (MissingResourceException e) {
|
||||
}
|
||||
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
|
||||
sb.append(" Covariates=[");
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
sb.append(cov.getClass().getSimpleName());
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.setCharAt(sb.length()-2, ']');
|
||||
sb.setCharAt(sb.length()-1, ' ');
|
||||
sb.setCharAt(sb.length() - 2, ']');
|
||||
sb.setCharAt(sb.length() - 1, ' ');
|
||||
programRecord.setCommandLine(sb.toString());
|
||||
|
||||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||
for ( SAMProgramRecord record : oldRecords ) {
|
||||
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size() + 1);
|
||||
for (SAMProgramRecord record : oldRecords) {
|
||||
if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
|
||||
newRecords.add(record);
|
||||
}
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
|
||||
// Write out the new header
|
||||
OUTPUT_BAM.writeHeader( header );
|
||||
OUTPUT_BAM.writeHeader(header);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
*
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
|
@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
for (iii = 0; iii < requestedCovariates.size(); iii++) {
|
||||
cov = requestedCovariates.get(iii);
|
||||
key[iii] = cov.getValue(vals[iii]);
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -366,64 +372,63 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
||||
*
|
||||
* @param refBases References bases over the length of the read
|
||||
* @param read The read to be recalibrated
|
||||
* @param read The read to be recalibrated
|
||||
* @return The read with quality scores replaced
|
||||
*/
|
||||
public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
|
||||
if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
||||
if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
||||
return read;
|
||||
}
|
||||
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
RecalDataManager.parseSAMRecord(read, RAC);
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
final String platform = read.getReadGroup().getPlatform();
|
||||
if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) {
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) {
|
||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
|
||||
if( badColor ) {
|
||||
if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
|
||||
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
|
||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
|
||||
if (badColor) {
|
||||
numReadsWithMalformedColorSpace++;
|
||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
||||
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
|
||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
||||
}
|
||||
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||
read.setReadFailsVendorQualityCheckFlag(true);
|
||||
return read;
|
||||
}
|
||||
}
|
||||
}
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() );
|
||||
originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
|
||||
}
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
|
||||
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
|
||||
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
if (qualityScore == null) {
|
||||
qualityScore = performSequentialQualityCalculation(fullCovariateKey);
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||
|
||||
read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities
|
||||
if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
|
||||
if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
||||
if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
||||
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
|
||||
}
|
||||
|
||||
|
|
@ -440,27 +445,28 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
private byte performSequentialQualityCalculation(final Object... key) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
if (globalRecalDatum != null) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
|
|
@ -469,9 +475,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
if (qReportedRecalDatum != null) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
|
@ -481,17 +487,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
for (int iii = 2; iii < key.length; iii++) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
|
||||
if (covariateRecalDatum != null) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
|
|
@ -508,12 +514,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
*
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||
if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
|
|
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Start the reduce with a handle to the output bam file
|
||||
*
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
|
|
@ -535,12 +543,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Output each read to disk
|
||||
* @param read The read to output
|
||||
*
|
||||
* @param read The read to output
|
||||
* @param output The FileWriter to write the read to
|
||||
* @return The FileWriter
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
if( output != null ) {
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if (output != null) {
|
||||
output.addAlignment(read);
|
||||
}
|
||||
return output;
|
||||
|
|
@ -548,20 +557,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Do nothing
|
||||
*
|
||||
* @param output The SAMFileWriter that outputs the bam file
|
||||
*/
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if( numReadsWithMalformedColorSpace != 0 ) {
|
||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
||||
if (numReadsWithMalformedColorSpace != 0) {
|
||||
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||
}
|
||||
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads were completely removed from the output bam file.");
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads were completely removed from the output bam file.");
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue