BQSR Parameter cleanup

* get rid of 320C argument that nobody uses.
   * get rid of DEFAULT_READ_GROUP parameter and functionality (later to become an engine argument).
This commit is contained in:
Mauricio Carneiro 2012-02-07 13:22:46 -05:00
parent 717cd4b912
commit 0d3ea0401c
4 changed files with 280 additions and 293 deletions

View File

@ -129,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
* </pre>
*
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By(DataSource.READS) // Only look at covered loci, not every locus of the reference file
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
// Filter out all reads with zero or unavailable mapping quality
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
// Shared Arguments
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/////////////////////////////
// Command Line Arguments
@ -217,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Adds the values of other to this, returning this
*
* @param other
* @return this object
*/
@ -247,8 +250,9 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/
public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
if (RAC.FORCE_PLATFORM != null) {
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -276,7 +280,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
if (requiredClasses.size() == 2) { // readGroup and reported quality score
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
requestedCovariates.add(new QualityScoreCovariate());
} else {
}
else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
}
// Next add the standard covariates if -standard was specified by the user
@ -333,7 +338,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
}
//---------------------------------------------------------------------------------------------------------------
//
// map
@ -343,6 +347,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* For each read at this locus get the various covariate values and increment that location in the map based on
* whether or not the base matches the reference at this particular location
*
* @param tracker The reference metadata tracker
* @param ref The reference context
* @param context The alignment context
@ -362,8 +367,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
continue;
}
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
{
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
RecalDataManager.parseSAMRecord(gatkRead, RAC);
@ -374,8 +378,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
RecalDataManager.parseColorSpace(gatkRead);
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
RecalDataManager.computeCovariates( gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION ));
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
}
// Skip this position if base quality is zero
@ -394,10 +397,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead(counter, gatkRead, offset, refBase);
} else { // calculate SOLID reference insertion rate
}
else { // calculate SOLID reference insertion rate
if (refBase == bases[offset]) {
counter.solidInsertedReferenceBases++;
} else {
}
else {
counter.otherColorSpaceInconsistency++;
}
}
@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
}
counter.countedSites++;
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
}
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
counter.skippedSites++;
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dbSNP mismatch rate is reasonable
}
@ -442,6 +448,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
* adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation.
*
* @param counter Data structure which holds the counted bases
* @param gatkRead The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus
@ -470,7 +477,6 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dbSNP mismatch rate is reasonable
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
@ -479,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
*
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
*/
public CountedData reduceInit() {
@ -487,6 +494,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* The Reduce method doesn't do anything for this walker.
*
* @param mapped Result of the map. This value is immediately ignored.
* @param sum The summing CountedData used to output the CSV data
* @return returns The sum used to output the CSV data
@ -508,8 +516,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
}
}
@ -524,6 +531,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* Write out the full data hashmap to disk in CSV format
*
* @param sum The CountedData to write out to RECAL_FILE
*/
public void onTraversalDone(CountedData sum) {
@ -537,6 +545,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/**
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
*
* @param recalTableStream The PrintStream to write out to
*/
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
@ -558,7 +567,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
if (DONT_SORT_OUTPUT) {
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
} else {
}
else {
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
}
@ -585,7 +595,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
// Output the RecalDatum entry
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map
}
else { // Another layer in the nested hash map
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
}
}
@ -603,7 +614,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
}
// Output the RecalDatum entry
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
} else { // Another layer in the nested hash map
}
else { // Another layer in the nested hash map
printMappings(recalTableStream, curPos + 1, key, (Map) val);
}
}

View File

@ -256,32 +256,6 @@ public class RecalDataManager {
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
if (readGroup == null) {
if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
Utils.warnUser("The input .bam file contains reads with no read group. " +
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
"First observed at read with name = " + read.getReadName());
warnUserNullReadGroup = true;
}
// There is no readGroup so defaulting to these values
readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
else {
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
}
}
if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
final String oldPlatform = readGroup.getPlatform();
readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
readGroup.setPlatform(oldPlatform);
((GATKSAMRecord) read).setReadGroup(readGroup);
}
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);
}

View File

@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
// Shared Command Line Arguments
//////////////////////////////////
@Hidden
@Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
public String DEFAULT_READ_GROUP = null;
@Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
@Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
public String FORCE_READ_GROUP = null;
@Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Hidden
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
@Hidden
@Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
public boolean EXCEPTION_IF_NO_TILE = false;
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
public int CONTEXT_SIZE = 8;
/**
* The number of previous bases to look at in HomopolymerCovariate.
*/
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
public int HOMOPOLYMER_NBACK = 7;
}

View File

@ -86,12 +86,12 @@ import java.util.regex.Pattern;
* -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv
* </pre>
*
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@WalkerName("TableRecalibration")
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
// This walker requires -I input.bam, it also requires -R reference.fasta
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/////////////////////////////
// Shared Arguments
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@ArgumentCollection
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
/////////////////////////////
// Command Line Arguments
@ -165,7 +166,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
private boolean skipUQUpdate = false;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
public void initialize() {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
if (RAC.FORCE_PLATFORM != null) {
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
@ -213,14 +214,16 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
lineNumber++;
if (EOF_MARKER.equals(line)) {
sawEOF = true;
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
}
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // Skip over the comment lines, (which start with '#')
}
// Read in the covariates that were used from the input file
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
if (foundAllCovariates) {
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
} else { // Found the covariate list in input file, loop through all of them and instantiate them
}
else { // Found the covariate list in input file, loop through all of them and instantiate them
String[] vals = line.split(",");
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
boolean foundClass = false;
@ -243,7 +246,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
}
} else { // Found a line of data
}
else { // Found a line of data
if (!foundAllCovariates) {
foundAllCovariates = true;
@ -302,7 +306,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
try {
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
programRecord.setProgramVersion(version);
} catch (MissingResourceException e) {}
} catch (MissingResourceException e) {
}
StringBuffer sb = new StringBuffer();
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
@ -331,6 +336,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
*
* @param line A line of CSV data read from the recalibration table data file
*/
private void addCSVData(final File file, final String line) {
@ -388,7 +394,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
numReadsWithMalformedColorSpace++;
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
}
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
read.setReadFailsVendorQualityCheckFlag(true);
return read;
}
@ -398,8 +405,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
//compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
// For each base in the read
for (int offset = 0; offset < read.getReadLength(); offset++) {
@ -407,8 +413,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
if(qualityScore == null)
{
if (qualityScore == null) {
qualityScore = performSequentialQualityCalculation(fullCovariateKey);
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
}
@ -446,6 +451,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* - The final shift equation is:
*
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
*
* @param key The list of Comparables that were calculated from the covariates
* @return A recalibrated quality score as a byte
*/
@ -508,6 +514,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
*
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Start the reduce with a handle to the output bam file
*
* @return A FileWriter pointing to a new bam file
*/
public SAMFileWriter reduceInit() {
@ -535,6 +543,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Output each read to disk
*
* @param read The read to output
* @param output The FileWriter to write the read to
* @return The FileWriter
@ -548,6 +557,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/**
* Do nothing
*
* @param output The SAMFileWriter that outputs the bam file
*/
public void onTraversalDone(SAMFileWriter output) {
@ -557,7 +567,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
}
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +