diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
index 4e3d4048b..626460be6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
@@ -77,20 +77,20 @@ import java.util.Map;
*
Output
*
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
- * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
+ * It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
*
- * The first 20 lines of such a file is shown below.
+ * The first 20 lines of such a file are shown below.
* * The file begins with a series of comment lines describing:
* ** The number of counted loci
* ** The number of counted bases
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
- *
- * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
+ *
+ * * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
*
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
- * depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
+ * depending on which covariates were specified at runtime. The last three items are the data — that is, the number of observations for this combination of covariates, number of
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
- *
+ *
*
* # Counted Sites 19451059
* # Counted Bases 56582018
@@ -129,13 +129,14 @@ import java.util.Map;
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
*
- *
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
-@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
-@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
-@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
+@By(DataSource.READS) // Only look at covered loci, not every locus of the reference file
+@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
+// Filter out all reads with zero or unavailable mapping quality
+@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
+// This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker implements TreeReducible {
@@ -149,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList();
/**
@@ -169,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker> covariateClasses = new PluginManager( Covariate.class ).getPlugins();
- final List> requiredClasses = new PluginManager( RequiredCovariate.class ).getPlugins();
- final List> standardClasses = new PluginManager( StandardCovariate.class ).getPlugins();
+ final List> covariateClasses = new PluginManager(Covariate.class).getPlugins();
+ final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins();
+ final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins();
// Print and exit if that's what was requested
- if ( LIST_ONLY ) {
- logger.info( "Available covariates:" );
- for( Class> covClass : covariateClasses ) {
- logger.info( covClass.getSimpleName() );
+ if (LIST_ONLY) {
+ logger.info("Available covariates:");
+ for (Class> covClass : covariateClasses) {
+ logger.info(covClass.getSimpleName());
}
logger.info("");
- System.exit( 0 ); // Early exit here because user requested it
+ System.exit(0); // Early exit here because user requested it
}
// Warn the user if no dbSNP file or other variant mask was specified
- if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
+ if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
}
// Initialize the requested covariates by parsing the -cov argument
// First add the required covariates
- if( requiredClasses.size() == 2) { // readGroup and reported quality score
- requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
- requestedCovariates.add( new QualityScoreCovariate() );
- } else {
+ if (requiredClasses.size() == 2) { // readGroup and reported quality score
+ requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
+ requestedCovariates.add(new QualityScoreCovariate());
+ }
+ else {
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
}
// Next add the standard covariates if -standard was specified by the user
- if( USE_STANDARD_COVARIATES ) {
+ if (USE_STANDARD_COVARIATES) {
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
// A list of Classes can't be sorted, but a list of Class names can be
final List standardClassNames = new ArrayList();
- for( Class> covClass : standardClasses ) {
- standardClassNames.add( covClass.getName() );
+ for (Class> covClass : standardClasses) {
+ standardClassNames.add(covClass.getName());
}
Collections.sort(standardClassNames); // Sort the list of class names
- for( String className : standardClassNames ) {
- for( Class> covClass : standardClasses ) { // Find the class that matches this class name
- if( covClass.getName().equals( className ) ) {
+ for (String className : standardClassNames) {
+ for (Class> covClass : standardClasses) { // Find the class that matches this class name
+ if (covClass.getName().equals(className)) {
try {
- final Covariate covariate = (Covariate)covClass.newInstance();
- requestedCovariates.add( covariate );
+ final Covariate covariate = (Covariate) covClass.newInstance();
+ requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@@ -302,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) {
- if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
+ for (Class> covClass : covariateClasses) {
+ if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
foundClass = true;
- if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
+ if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
try {
// Now that we've found a matching class, try to instantiate it
- final Covariate covariate = (Covariate)covClass.newInstance();
- requestedCovariates.add( covariate );
+ final Covariate covariate = (Covariate) covClass.newInstance();
+ requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@@ -320,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker 0 ) {
+ if (gatkRead.getBaseQualities()[offset] > 0) {
byte[] bases = gatkRead.getReadBases();
byte refBase = ref.getBase();
// Skip if this base is an 'N' or etc.
- if( BaseUtils.isRegularBase( bases[offset] ) ) {
+ if (BaseUtils.isRegularBase(bases[offset])) {
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
- if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
- !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
+ if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
+ !RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap
- updateDataFromRead( counter, gatkRead, offset, refBase );
+ updateDataFromRead(counter, gatkRead, offset, refBase);
- } else { // calculate SOLID reference insertion rate
- if( refBase == bases[offset] ) {
+ }
+ else { // calculate SOLID reference insertion rate
+ if (refBase == bases[offset]) {
counter.solidInsertedReferenceBases++;
- } else {
+ }
+ else {
counter.otherColorSpaceInconsistency++;
}
}
@@ -405,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
+ if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
counter.lociSinceLastDbsnpCheck = 0;
- if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
- final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
- final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
+ if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
+ final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
+ final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
- if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
- Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
- String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
+ if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
+ Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
}
}
@@ -518,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker keyList = new ArrayList();
- for( Object comp : data.keySet() ) {
+ for (Object comp : data.keySet()) {
keyList.add((Comparable) comp);
}
Collections.sort(keyList);
- for( Comparable comp : keyList ) {
+ for (Comparable comp : keyList) {
key[curPos] = comp;
final Object val = data.get(comp);
- if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
+ if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key
- for( Object compToPrint : key ) {
+ for (Object compToPrint : key) {
// Output the Covariate's value
- recalTableStream.print( compToPrint + "," );
+ recalTableStream.print(compToPrint + ",");
}
// Output the RecalDatum entry
- recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
- } else { // Another layer in the nested hash map
- printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
+ recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
+ }
+ else { // Another layer in the nested hash map
+ printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
}
}
}
- private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
- for( Object comp : data.keySet() ) {
+ private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
+ for (Object comp : data.keySet()) {
key[curPos] = comp;
final Object val = data.get(comp);
- if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
+ if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
// For each Covariate in the key
- for( Object compToPrint : key ) {
+ for (Object compToPrint : key) {
// Output the Covariate's value
- recalTableStream.print( compToPrint + "," );
+ recalTableStream.print(compToPrint + ",");
}
// Output the RecalDatum entry
- recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
- } else { // Another layer in the nested hash map
- printMappings( recalTableStream, curPos + 1, key, (Map) val );
+ recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
+ }
+ else { // Another layer in the nested hash map
+ printMappings(recalTableStream, curPos + 1, key, (Map) val);
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
index 18b33c0e8..72c2b2829 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@@ -256,32 +256,6 @@ public class RecalDataManager {
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
- // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
- if (readGroup == null) {
- if (RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
- if (!warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null) {
- Utils.warnUser("The input .bam file contains reads with no read group. " +
- "Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
- "First observed at read with name = " + read.getReadName());
- warnUserNullReadGroup = true;
- }
- // There is no readGroup so defaulting to these values
- readGroup = new GATKSAMReadGroupRecord(RAC.DEFAULT_READ_GROUP);
- readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
- ((GATKSAMRecord) read).setReadGroup(readGroup);
- }
- else {
- throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName());
- }
- }
-
- if (RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP)) { // Collapse all the read groups into a single common String provided by the user
- final String oldPlatform = readGroup.getPlatform();
- readGroup = new GATKSAMReadGroupRecord(RAC.FORCE_READ_GROUP);
- readGroup.setPlatform(oldPlatform);
- ((GATKSAMRecord) read).setReadGroup(readGroup);
- }
-
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
readGroup.setPlatform(RAC.FORCE_PLATFORM);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
index 7f3035f1e..9752b1dee 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
@@ -43,31 +43,15 @@ public class RecalibrationArgumentCollection {
// Shared Command Line Arguments
//////////////////////////////////
@Hidden
- @Argument(fullName = "default_read_group", shortName = "dRG", required = false, doc = "If a read has no read group then default to the provided String.")
- public String DEFAULT_READ_GROUP = null;
- @Hidden
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
public String DEFAULT_PLATFORM = null;
@Hidden
- @Argument(fullName = "force_read_group", shortName = "fRG", required = false, doc = "If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
- public String FORCE_READ_GROUP = null;
- @Hidden
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
public int WINDOW_SIZE = 5;
- /**
- * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
- */
- @Hidden
- @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
- public int HOMOPOLYMER_NBACK = 7;
- @Hidden
- @Argument(fullName = "exception_if_no_tile", shortName = "throwTileException", doc = "If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required = false)
- public boolean EXCEPTION_IF_NO_TILE = false;
-
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
@@ -89,4 +73,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
public int CONTEXT_SIZE = 8;
+ /**
+ * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
+ */
+ @Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
+ public int HOMOPOLYMER_NBACK = 7;
+
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
index a8006d506..cd848cd9e 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@@ -86,12 +86,12 @@ import java.util.regex.Pattern;
* -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv
*
- *
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@WalkerName("TableRecalibration")
-@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
+@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
+// This walker requires -I input.bam, it also requires -R reference.fasta
public class TableRecalibrationWalker extends ReadWalker {
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
@@ -99,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker> classes = new PluginManager(Covariate.class).getPlugins();
@@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker covClass : classes ) {
- if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
+ for (Class> covClass : classes) {
+ if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
foundClass = true;
try {
- Covariate covariate = (Covariate)covClass.newInstance();
- requestedCovariates.add( covariate );
+ Covariate covariate = (Covariate) covClass.newInstance();
+ requestedCovariates.add(covariate);
} catch (Exception e) {
throw new DynamicClassResolutionException(covClass, e);
}
@@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker oldRecords = header.getProgramRecords();
- List newRecords = new ArrayList(oldRecords.size()+1);
- for ( SAMProgramRecord record : oldRecords ) {
- if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
+ List newRecords = new ArrayList(oldRecords.size() + 1);
+ for (SAMProgramRecord record : oldRecords) {
+ if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
newRecords.add(record);
}
newRecords.add(programRecord);
header.setProgramRecords(newRecords);
// Write out the new header
- OUTPUT_BAM.writeHeader( header );
+ OUTPUT_BAM.writeHeader(header);
}
}
/**
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
+ *
* @param line A line of CSV data read from the recalibration table data file
*/
private void addCSVData(final File file, final String line) {
final String[] vals = line.split(",");
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
- if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
+ if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
}
@@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker