Added dbSNP sanity check to CountCovariates. If the mismatch rate is too low at dbSNP sites it warns the user that the dbSNP file is suspicious. Added option in CountCovariates and TableRecalibration to ignore read group IDs and collapse them together. Also, if the read group is null the walkers no longer crash with NullPointerException but instead warn the user that the read group and platform are defaulting to some values. Default window size in MinimumNQSCovariate is 5 (two bases in either direction) based on rereading of Chris's analysis.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2140 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-24 16:16:44 +00:00
parent bc35a34f60
commit a59e5b5e1a
5 changed files with 195 additions and 92 deletions

View File

@ -681,6 +681,10 @@ public class GenomeAnalysisEngine {
outputTracker.prepareWalker(walker);
}
/**
 * Returns the SAM file header from the input reads' data source file.
 *
 * Thin delegation to the engine's reads data source; no caching or copying is
 * performed here, so callers receive whatever header object the data source holds.
 *
 * @return the SAMFileHeader backing the input reads' data source
 */
public SAMFileHeader getSAMFileHeader() {
return readsDataSource.getHeader();
}

View File

@ -68,38 +68,40 @@ import net.sf.samtools.SAMReadGroupRecord;
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
/////////////////////
/////////////////////////////
// Command Line Arguments
/////////////////////
/////////////////////////////
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private Boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
private int WINDOW_SIZE = 5;
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.")
private int PROCESS_EVERY_NTH_LOCUS = 1;
/////////////////////
/////////////////////////////
// Debugging-only Arguments
/////////////////////
/////////////////////////////
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
private boolean NO_PRINT_HEADER = false;
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Depricated. Match the output of the old recalibrator exactly. For debugging purposes only.")
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="Depricated. Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
private boolean VALIDATE_OLD_RECALIBRATOR = false;
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
@Argument(fullName="use_slx_platform", shortName="useSLX", required=false, doc="Force the platform to be Illumina regardless of what it actually says. FOR DEBUGGING PURPOSES ONLY.")
private boolean USE_SLX_PLATFORM = false;
@Argument(fullName="sorted_output", shortName="sortedOutput", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. For debugging purposes only. This option is required in order to pass the integration tests.")
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean SORTED_OUTPUT = false;
/////////////////////
@Argument(fullName="ignore_readgroup", shortName="noRG", required=false, doc="All read groups are combined together. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean IGNORE_READGROUP = false;
/////////////////////////////
// Private Member Variables
/////////////////////
/////////////////////////////
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
@ -107,7 +109,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private final String versionNumber = "2.0.2"; // major version, minor version, and build number
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private final String versionNumber = "2.0.3"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
//---------------------------------------------------------------------------------------------------------------
//
@ -124,7 +133,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
logger.info( "CovariateCounterWalker version: " + versionNumber );
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface( Covariate.class );
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
@ -134,7 +143,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
out.println();
System.exit( 0 ); // early exit here because user requested it
System.exit( 0 ); // Early exit here because user requested it
}
// Warn the user if no dbSNP file was specified
@ -151,15 +160,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Initialize the requested covariates by parsing the -cov argument
requestedCovariates = new ArrayList<Covariate>();
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
if( VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() );
} else if( COVARIATES != null ) {
if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found
requestedCovariates.add( new QualityScoreCovariate() );
for( Class<?> covClass : classes ) {
try {
@ -167,7 +176,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
estimatedCapacity *= covariate.estimatedNumberOfBins();
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again
requestedCovariates.add( covariate );
}
} catch ( InstantiationException e ) {
@ -241,13 +250,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
// pull out anything passed by -B name,type,file that has the name "dbsnp"
// Pull out anything passed by -B name,type,file that has the name "dbsnp"
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
boolean isSNP = false;
if (dbsnpRODs != null) {
for( ReferenceOrderedDatum rod : dbsnpRODs ) {
if( ((Variation)rod).isSNP() ) {
isSNP = true; // at least one of the rods says this is a snp site
isSNP = true; // At least one of the rods says this is a snp site
break;
}
}
@ -255,7 +264,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
if( !isSNP && ( (countedSites+skippedSites) % PROCESS_EVERY_NTH_LOCUS == 0) ) {
if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) {
numUnprocessed = 0; // Reset the counter because we are processing this very locus
// Long list of variables that are preallocated for use in for loop below
final List<SAMRecord> reads = context.getReads();
final List<Integer> offsets = context.getOffsets();
SAMRecord read;
@ -278,6 +290,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
read = reads.get(iii);
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
// Try to pull the read out of the read IdentityHashMap
readDatum = readDatumHashMap.get( read );
if( readDatum == null ) {
@ -288,6 +301,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables
quals = read.getBaseQualities();
// Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
@ -301,35 +315,47 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
isNegStrand = read.getReadNegativeStrandFlag();
final SAMReadGroupRecord readGroup = read.getReadGroup();
readGroupId = readGroup.getReadGroupId();
platform = readGroup.getPlatform();
if( readGroup == null ) {
if( !warnUserNullReadGroup ) {
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
warnUserNullReadGroup = true;
}
readGroupId = ReadGroupCovariate.collapsedReadGroup;
platform = "ILLUMINA";
} else {
readGroupId = readGroup.getReadGroupId();
platform = readGroup.getPlatform();
}
mappingQuality = read.getMappingQuality();
length = bases.length;
if( USE_SLX_PLATFORM ) {
platform = "ILLUMINA";
}
if( IGNORE_READGROUP ) {
readGroupId = ReadGroupCovariate.collapsedReadGroup;
}
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
readDatumHashMap.put( read, readDatum );
sizeOfReadDatumHashMap++;
}
// skip first and last base because there is no dinuc
// Skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < readDatum.length - 1 ) {
// skip if base quality is zero
// Skip if base quality is zero
if( readDatum.quals[offset] > 0 ) {
refBase = (byte)ref.getBase();
prevBase = readDatum.bases[offset-1];
// Get the complement base strand if we are a negative strand read
// Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed
if( readDatum.isNegStrand ) {
prevBase = readDatum.bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
// Skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
@ -344,25 +370,75 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
} else {
if( VALIDATE_OLD_RECALIBRATOR ) {
countedBases++; // replicating a small bug in the old recalibrator
countedBases++; // Replicating a small bug in the old recalibrator
}
}
}
} else { // at the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
} else { // At the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
readDatumHashMap.remove( read );
sizeOfReadDatumHashMap--;
}
}
}
countedSites++;
updateMismatchCounts(novel_counts, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
} else { // We skipped over the dbSNP site
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
skippedSites++;
if( isSNP) {
updateMismatchCounts(dbSNP_counts, context, ref.getBase());// For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
}
}
// Do a dbSNP sanity check every so often
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
lociSinceLastDbsnpCheck = 0;
validateDbsnpMismatchRate();
}
return 1; // This value isn't actually used anywhere
}
/**
 * Update the mismatch / total_base counts for a given class of loci (dbSNP vs. novel).
 *
 * counts.first accumulates bases that mismatch the reference; counts.second accumulates
 * all regular (A/C/G/T) bases seen at this locus. Bases that are not regular bases
 * (e.g. 'N') are ignored entirely, as is the whole locus when the reference base itself
 * is not a regular base — this matches the original behavior, where such bases produced
 * a -1 index and were skipped.
 *
 * @param counts  The (mismatches, total bases) pair to be updated in place
 * @param context The AlignmentContext which holds the reads covered by this locus
 * @param ref     The reference base at this locus
 */
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
    // The reference base index is loop-invariant, so compute it once instead of per read
    final int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
    if ( refCharBaseIndex == -1 ) {
        return; // Reference base isn't a regular base; nothing at this locus can be counted
    }
    final List<SAMRecord> reads = context.getReads();
    final List<Integer> offsets = context.getOffsets();
    for ( int iii = 0; iii < reads.size(); iii++ ) {
        // Offsets are zero based, so getReadBases()[offset] is the base aligned to this locus
        final char readChar = (char)(reads.get(iii).getReadBases()[offsets.get(iii)]);
        final int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
        if ( readCharBaseIndex != -1 ) {
            if ( readCharBaseIndex != refCharBaseIndex ) {
                counts.first++; // Read base disagrees with the reference base
            }
            counts.second++; // Every regular read base counts toward the total
        }
    }
}
/**
 * Sanity check on the supplied dbSNP ROD: true variant sites should mismatch the
 * reference at a substantially higher rate than novel sites. If the observed dbSNP
 * mismatch rate falls below the expected multiple of the novel rate, warn the user
 * that the known-sites file looks suspicious, and double the check interval so the
 * warning isn't repeated constantly on large files.
 */
private void validateDbsnpMismatchRate() {
    final long novelBases = novel_counts.second;
    final long dbsnpBases = dbSNP_counts.second;
    // A rate can't be formed until both classes of loci have at least one counted base
    if ( novelBases == 0 || dbsnpBases == 0 ) {
        return;
    }
    final double novelRate = novel_counts.first / (double)novelBases;
    final double dbsnpRate = dbSNP_counts.first / (double)dbsnpBases;
    final boolean suspiciouslyLow = dbsnpRate < DBSNP_VS_NOVEL_MISMATCH_RATE * novelRate;
    if ( suspiciouslyLow ) {
        Utils.warnUser("The variation rate at supplied known variant sites seems suspiciously low. Please double-check that the correct ROD is being used.");
        DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't output the message every megabase of a large file
    }
}
/**
* Major workhorse routine for this walker.
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
@ -398,7 +474,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
byte base = readDatum.bases[offset];
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
countedBases++;
}
@ -428,7 +504,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @return returns The PrintStream used to output the CSV data
*/
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
return recalTableStream; // nothing to do here
return recalTableStream; // Nothing to do here
}
/**
@ -451,7 +527,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
//BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options.
if( VALIDATE_OLD_RECALIBRATOR ) {
// output the old header as well as output the data in sorted order
// Output the old header as well as output the data in sorted order
recalTableStream.printf("# collapsed_pos false%n");
recalTableStream.printf("# collapsed_dinuc false%n");
recalTableStream.printf("# counted_sites %d%n", countedSites);

View File

@ -39,7 +39,7 @@ public class MinimumNQSCovariate implements Covariate {
private int windowReach; // how far in each direction from the current base to look
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
windowReach = 1; // window size = 3 was the best covariate according to Chris's analysis
windowReach = 2; // window size = 5 was the best covariate according to Chris's analysis
}
public MinimumNQSCovariate(final int windowSize) {
@ -69,6 +69,6 @@ public class MinimumNQSCovariate implements Covariate {
}
public String toString() {
return "Minimum Neighborhood Quality Score";
return "Minimum Neighborhood Quality Score (window size = " + (windowReach * 2 + 1) + ")";
}
}

View File

@ -35,6 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
public class ReadGroupCovariate implements Covariate{
public static final String collapsedReadGroup = "Read Group Collapsed";
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMProgramRecord;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
@ -62,45 +63,49 @@ import java.io.FileNotFoundException;
// GATK to not actually spend time giving me the refBase array since I don't use it
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
private String RECAL_FILE = null;
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
private SAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
private boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
private int WINDOW_SIZE = 3;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
private int WINDOW_SIZE = 5;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
private boolean VALIDATE_OLD_RECALIBRATOR = false;
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. FOR DEBUGGING PURPOSES ONLY.")
private boolean USE_SLX_PLATFORM = false;
@Argument(fullName="ignore_readgroup", shortName="noRG", required=false, doc="All read groups are combined together. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean IGNORE_READGROUP = false;
//@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
//private boolean NO_PG_TAG = false;
//public enum RecalibrationMode {
// COMBINATORIAL,
// SEQUENTIAL,
// ERROR
//}
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
private String MODE_STRING = "SEQUENTIAL";
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
private RecalDataManager dataManager;
private ArrayList<Covariate> requestedCovariates;
private ArrayList<Comparable> fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values
private ArrayList<Comparable> collapsedTableKey; // the key that will be used over and over again to query the collapsed tables
/////////////////////////////
// Private Member Variables
/////////////////////////////
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
private ArrayList<Comparable> fullCovariateKey; // The list that will be used over and over again to hold the full set of covariate values
private ArrayList<Comparable> collapsedTableKey; // The key that will be used over and over again to query the collapsed tables
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
private final String versionNumber = "2.0.1"; // Major version, minor version, and build number
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
//---------------------------------------------------------------------------------------------------------------
//
// initialize
@ -121,7 +126,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
int lineNumber = 0;
boolean foundAllCovariates = false;
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
// Warn the user if a dbSNP file was specified since it isn't being used here
boolean foundDBSNP = false;
@ -134,8 +139,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
}
fullCovariateKey = new ArrayList<Comparable>();
collapsedTableKey = new ArrayList<Comparable>();
fullCovariateKey = new ArrayList<Comparable>(); // Initialize the key only once
collapsedTableKey = new ArrayList<Comparable>(); // Initialize the key only once
// Read in the covariates that were used from the input file
requestedCovariates = new ArrayList<Covariate>();
@ -147,20 +152,20 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
lineNumber++;
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
; // skip over the comment lines, (which start with '#')
; // Skip over the comment lines, (which start with '#')
}
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
if( foundAllCovariates ) {
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
} else { // found another covariate in input file
} else { // Found another covariate in input file
boolean foundClass = false;
for( Class<?> covClass : classes ) {
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // The "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
foundClass = true;
try {
Covariate covariate = (Covariate)covClass.newInstance();
// some covariates need parameters (user supplied command line arguments) passed to them
// Some covariates need parameters (user supplied command line arguments) passed to them
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins();
@ -179,7 +184,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
} else { // found some data
} else { // Found some data
if( !foundAllCovariates ) {
if( VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
@ -194,7 +199,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
}
addCSVData(line); // parse the line and add the data to the HashMap
addCSVData(line); // Parse the line and add the data to the HashMap
}
}
@ -208,12 +213,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
logger.info( "The covariates being used here: " );
logger.info( requestedCovariates );
// Create the collapsed tables that are used in the sequential calculation
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
logger.info( "...done!" );
}
// Create the tables of empirical quality scores that will be used in the sequential calculation
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
logger.info( "...done!" );
}
/**
@ -228,7 +231,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
cov = requestedCovariates.get( iii );
if( VALIDATE_OLD_RECALIBRATOR ) {
if( iii == 1 ) { // order is different in the old recalibrator
if( iii == 1 ) { // Order is different in the old recalibrator unfortunately
key.add( cov.getValue( vals[2] ) );
} else if ( iii == 2 ) {
key.add( cov.getValue( vals[1] ) );
@ -239,7 +242,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
key.add( cov.getValue( vals[iii] ) );
}
}
// Create a new datum using the number of observations and number of mismatches
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
// Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum );
}
@ -274,13 +279,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
byte[] recalQuals = originalQuals.clone();
// These calls are all expensive so only do them once for each read
// Pulling out all of the useful things from the read
final SAMReadGroupRecord readGroup = read.getReadGroup();
final String readGroupId = readGroup.getReadGroupId();
String platform = readGroup.getPlatform();
final boolean isNegStrand = read.getReadNegativeStrandFlag();
if( USE_SLX_PLATFORM ) {
String readGroupId;
String platform;
if( readGroup == null ) {
if( !warnUserNullReadGroup ) {
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
warnUserNullReadGroup = true;
}
readGroupId = ReadGroupCovariate.collapsedReadGroup;
platform = "ILLUMINA";
} else {
readGroupId = readGroup.getReadGroupId();
platform = readGroup.getPlatform();
}
final boolean isNegStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
int startPos = 1;
int stopPos = bases.length;
@ -289,11 +303,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
startPos = 0;
stopPos = bases.length - 1;
}
if( USE_SLX_PLATFORM ) {
platform = "ILLUMINA";
}
if( IGNORE_READGROUP ) {
readGroupId = ReadGroupCovariate.collapsedReadGroup;
}
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
// For each base in the read
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
// Get the covariate values which make up the key
for( Covariate covariate : requestedCovariates ) {
@ -304,7 +324,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
fullCovariateKey.clear();
}
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
if( platform.equalsIgnoreCase("SOLID") ) {
@ -312,8 +332,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
}
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}
@ -366,16 +386,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
double deltaQPos = 0.0;
double deltaQDinuc = 0.0;
for( int iii = 2; iii < key.size(); iii++ ) {
collapsedTableKey.add( key.get(iii) ); // the given covariate
collapsedTableKey.add( key.get(iii) ); // The given covariate
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
if( deltaQCovariateEmpirical != null ) {
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
if( VALIDATE_OLD_RECALIBRATOR ) {
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: Only here to validate against the old recalibrator
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
}
}
collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
collapsedTableKey.remove( 2 ); // This new covariate is always added in at position 2 in the collapsedTableKey list
// The collapsedTableKey should be: < ReadGroup, Reported Quality Score, This Covariate >
}
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
@ -397,7 +418,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
/**
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
*/
@ -410,7 +431,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
}
/**
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
* @param originalQuals The list of original base quality scores
* @param recalQuals A list of the new recalibrated quality scores
* @param colorSpaceQuals The list of color space quality scores for this read
@ -430,7 +451,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//---------------------------------------------------------------------------------------------------------------
/**
* Nothing start the reduce with a new handle to the output bam file
* Start the reduce with a new handle to the output bam file
* @return A FileWriter pointing to a new bam file
*/
public SAMFileWriter reduceInit() {