Added dbSNP sanity check to CountCovariates. If the mismatch rate is too low at dbSNP sites it warns the user that the dbSNP file is suspicious. Added option in CountCovariates and TableRecalibration to ignore read group IDs and collapse them together. Also, if the read group is null the walkers no longer crash with NullPointerException but instead warn the user that the read group and platform are defaulting to some values. Default window size in MinimumNQSCovariate is 5 (two bases in either direction) based on rereading of Chris's analysis.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2140 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bc35a34f60
commit
a59e5b5e1a
|
|
@ -681,6 +681,10 @@ public class GenomeAnalysisEngine {
|
|||
outputTracker.prepareWalker(walker);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the SAM File Header from the input reads' data source file
|
||||
* @return the SAM File Header from the input reads' data source file
|
||||
*/
|
||||
public SAMFileHeader getSAMFileHeader() {
|
||||
return readsDataSource.getHeader();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,38 +68,40 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
|
||||
/////////////////////
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////
|
||||
/////////////////////////////
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
private Boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 5;
|
||||
@Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="process_nth_locus", shortName="pN", required=false, doc="Only process every Nth covered locus we see.")
|
||||
private int PROCESS_EVERY_NTH_LOCUS = 1;
|
||||
|
||||
/////////////////////
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////
|
||||
/////////////////////////////
|
||||
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.")
|
||||
private boolean NO_PRINT_HEADER = false;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Depricated. Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="Depricated. Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLX", required=false, doc="Force the platform to be Illumina regardless of what it actually says. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
@Argument(fullName="sorted_output", shortName="sortedOutput", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. For debugging purposes only. This option is required in order to pass the integration tests.")
|
||||
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean SORTED_OUTPUT = false;
|
||||
|
||||
/////////////////////
|
||||
@Argument(fullName="ignore_readgroup", shortName="noRG", required=false, doc="All read groups are combined together. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean IGNORE_READGROUP = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////
|
||||
/////////////////////////////
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
|
||||
|
|
@ -107,7 +109,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private final String versionNumber = "2.0.2"; // major version, minor version, and build number
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private final String versionNumber = "2.0.3"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
private int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -124,7 +133,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
logger.info( "CovariateCounterWalker version: " + versionNumber );
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface( Covariate.class );
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
|
|
@ -134,7 +143,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
out.println();
|
||||
|
||||
System.exit( 0 ); // early exit here because user requested it
|
||||
System.exit( 0 ); // Early exit here because user requested it
|
||||
}
|
||||
|
||||
// Warn the user if no dbSNP file was specified
|
||||
|
|
@ -151,15 +160,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
|
||||
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly
|
||||
requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
} else if( COVARIATES != null ) {
|
||||
if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found
|
||||
if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
for( Class<?> covClass : classes ) {
|
||||
try {
|
||||
|
|
@ -167,7 +176,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again
|
||||
if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // These were already added so don't add them again
|
||||
requestedCovariates.add( covariate );
|
||||
}
|
||||
} catch ( InstantiationException e ) {
|
||||
|
|
@ -241,13 +250,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
// pull out anything passed by -B name,type,file that has the name "dbsnp"
|
||||
// Pull out anything passed by -B name,type,file that has the name "dbsnp"
|
||||
final RODRecordList<ReferenceOrderedDatum> dbsnpRODs = tracker.getTrackData( "dbsnp", null );
|
||||
boolean isSNP = false;
|
||||
if (dbsnpRODs != null) {
|
||||
for( ReferenceOrderedDatum rod : dbsnpRODs ) {
|
||||
if( ((Variation)rod).isSNP() ) {
|
||||
isSNP = true; // at least one of the rods says this is a snp site
|
||||
isSNP = true; // At least one of the rods says this is a snp site
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -255,7 +264,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
|
||||
if( !isSNP && ( (countedSites+skippedSites) % PROCESS_EVERY_NTH_LOCUS == 0) ) {
|
||||
if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) {
|
||||
numUnprocessed = 0; // Reset the counter because we are processing this very locus
|
||||
|
||||
// Long list of variables that are preallocated for use in for loop below
|
||||
final List<SAMRecord> reads = context.getReads();
|
||||
final List<Integer> offsets = context.getOffsets();
|
||||
SAMRecord read;
|
||||
|
|
@ -278,6 +290,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
read = reads.get(iii);
|
||||
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
|
||||
|
||||
// Try to pull the read out of the read IdentityHashMap
|
||||
readDatum = readDatumHashMap.get( read );
|
||||
if( readDatum == null ) {
|
||||
|
||||
|
|
@ -288,6 +301,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
|
||||
// Lots of lines just pulling things out of the SAMRecord and stuffing into local variables
|
||||
quals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
|
|
@ -301,35 +315,47 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
|
||||
isNegStrand = read.getReadNegativeStrandFlag();
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
if( readGroup == null ) {
|
||||
if( !warnUserNullReadGroup ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
platform = "ILLUMINA";
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
}
|
||||
mappingQuality = read.getMappingQuality();
|
||||
length = bases.length;
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
platform = "ILLUMINA";
|
||||
}
|
||||
if( IGNORE_READGROUP ) {
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
}
|
||||
|
||||
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
|
||||
readDatumHashMap.put( read, readDatum );
|
||||
sizeOfReadDatumHashMap++;
|
||||
}
|
||||
|
||||
// skip first and last base because there is no dinuc
|
||||
// Skip first and last base because there is no dinuc
|
||||
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
|
||||
if( offset > 0 ) {
|
||||
if( offset < readDatum.length - 1 ) {
|
||||
// skip if base quality is zero
|
||||
// Skip if base quality is zero
|
||||
if( readDatum.quals[offset] > 0 ) {
|
||||
|
||||
refBase = (byte)ref.getBase();
|
||||
prevBase = readDatum.bases[offset-1];
|
||||
|
||||
// Get the complement base strand if we are a negative strand read
|
||||
// Get the complement base strand if we are a negative strand read, DinucCovariate is responsible for getting the complement bases if needed
|
||||
if( readDatum.isNegStrand ) {
|
||||
prevBase = readDatum.bases[offset+1];
|
||||
}
|
||||
|
||||
// skip if this base or the previous one was an 'N' or etc.
|
||||
// Skip if this base or the previous one was an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
|
||||
|
|
@ -344,25 +370,75 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
} else {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
countedBases++; // replicating a small bug in the old recalibrator
|
||||
countedBases++; // Replicating a small bug in the old recalibrator
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { // at the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
|
||||
} else { // At the last base in the read so we can remove the read from our IdentityHashMap since we will never see it again
|
||||
readDatumHashMap.remove( read );
|
||||
sizeOfReadDatumHashMap--;
|
||||
}
|
||||
}
|
||||
}
|
||||
countedSites++;
|
||||
updateMismatchCounts(novel_counts, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
|
||||
} else { // We skipped over the dbSNP site
|
||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
skippedSites++;
|
||||
if( isSNP) {
|
||||
updateMismatchCounts(dbSNP_counts, context, ref.getBase());// For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Do a dbSNP sanity check every so often
|
||||
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
lociSinceLastDbsnpCheck = 0;
|
||||
validateDbsnpMismatchRate();
|
||||
}
|
||||
|
||||
return 1; // This value isn't actually used anywhere
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counts The counts to be updated
|
||||
* @param context The AlignmentContext which holds the reads covered by this locus
|
||||
* @param ref The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int iii = 0; iii < reads.size(); iii++ ) {
|
||||
char readChar = (char)(reads.get(iii).getReadBases()[offsets.get(iii)]);
|
||||
int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
|
||||
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||
|
||||
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
|
||||
if ( readCharBaseIndex != refCharBaseIndex )
|
||||
counts.first++;
|
||||
counts.second++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the dbSNP mismatch rates.
|
||||
*/
|
||||
private void validateDbsnpMismatchRate() {
|
||||
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 )
|
||||
return;
|
||||
|
||||
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
|
||||
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
|
||||
//System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel));
|
||||
|
||||
if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at supplied known variant sites seems suspiciously low. Please double-check that the correct ROD is being used.");
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't output the message every megabase of a large file
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
|
|
@ -398,7 +474,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
byte base = readDatum.bases[offset];
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||
datum.increment( (char)base, (char)refBase ); // Dangerous: If you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||
countedBases++;
|
||||
}
|
||||
|
||||
|
|
@ -428,7 +504,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* @return returns The PrintStream used to output the CSV data
|
||||
*/
|
||||
public PrintStream reduce( Integer value, PrintStream recalTableStream ) {
|
||||
return recalTableStream; // nothing to do here
|
||||
return recalTableStream; // Nothing to do here
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -451,7 +527,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
//BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options.
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
// output the old header as well as output the data in sorted order
|
||||
// Output the old header as well as output the data in sorted order
|
||||
recalTableStream.printf("# collapsed_pos false%n");
|
||||
recalTableStream.printf("# collapsed_dinuc false%n");
|
||||
recalTableStream.printf("# counted_sites %d%n", countedSites);
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class MinimumNQSCovariate implements Covariate {
|
|||
private int windowReach; // how far in each direction from the current base to look
|
||||
|
||||
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
windowReach = 1; // window size = 3 was the best covariate according to Chris's analysis
|
||||
windowReach = 2; // window size = 5 was the best covariate according to Chris's analysis
|
||||
}
|
||||
|
||||
public MinimumNQSCovariate(final int windowSize) {
|
||||
|
|
@ -69,6 +69,6 @@ public class MinimumNQSCovariate implements Covariate {
|
|||
}
|
||||
|
||||
public String toString() {
|
||||
return "Minimum Neighborhood Quality Score";
|
||||
return "Minimum Neighborhood Quality Score (window size = " + (windowReach * 2 + 1) + ")";
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,6 +35,8 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
|
||||
public class ReadGroupCovariate implements Covariate{
|
||||
|
||||
public static final String collapsedReadGroup = "Read Group Collapsed";
|
||||
|
||||
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
|
|
@ -62,45 +63,49 @@ import java.io.FileNotFoundException;
|
|||
// GATK to not actually spend time giving me the refBase array since I don't use it
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true)
|
||||
private String RECAL_FILE = null;
|
||||
@Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false)
|
||||
private SAMFileWriter OUTPUT_BAM = null;
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ",
|
||||
doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
private boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 3;
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
private int WINDOW_SIZE = 5;
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.")
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean VALIDATE_OLD_RECALIBRATOR = false;
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.")
|
||||
@Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. FOR DEBUGGING PURPOSES ONLY.")
|
||||
private boolean USE_SLX_PLATFORM = false;
|
||||
@Argument(fullName="ignore_readgroup", shortName="noRG", required=false, doc="All read groups are combined together. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean IGNORE_READGROUP = false;
|
||||
//@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
//private boolean NO_PG_TAG = false;
|
||||
|
||||
//public enum RecalibrationMode {
|
||||
// COMBINATORIAL,
|
||||
// SEQUENTIAL,
|
||||
// ERROR
|
||||
//}
|
||||
|
||||
//@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||
private String MODE_STRING = "SEQUENTIAL";
|
||||
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||
|
||||
private RecalDataManager dataManager;
|
||||
private ArrayList<Covariate> requestedCovariates;
|
||||
private ArrayList<Comparable> fullCovariateKey; // the list that will be used over and over again to hold the full set of covariate values
|
||||
private ArrayList<Comparable> collapsedTableKey; // the key that will be used over and over again to query the collapsed tables
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private ArrayList<Comparable> fullCovariateKey; // The list that will be used over and over again to hold the full set of covariate values
|
||||
private ArrayList<Comparable> collapsedTableKey; // The key that will be used over and over again to query the collapsed tables
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
private final String versionNumber = "2.0.1"; // Major version, minor version, and build number
|
||||
private boolean warnUserNullReadGroup = false; // Has the walker warned the user about null read groups yet?
|
||||
|
||||
private final String versionNumber = "2.0.0"; // major version, minor version, and build number
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
|
|
@ -121,7 +126,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one
|
||||
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
|
||||
|
||||
// Warn the user if a dbSNP file was specified since it isn't being used here
|
||||
boolean foundDBSNP = false;
|
||||
|
|
@ -134,8 +139,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
|
||||
}
|
||||
|
||||
fullCovariateKey = new ArrayList<Comparable>();
|
||||
collapsedTableKey = new ArrayList<Comparable>();
|
||||
fullCovariateKey = new ArrayList<Comparable>(); // Initialize the key only once
|
||||
collapsedTableKey = new ArrayList<Comparable>(); // Initialize the key only once
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
|
@ -147,20 +152,20 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // skip over the comment lines, (which start with '#')
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
|
||||
} else { // found another covariate in input file
|
||||
} else { // Found another covariate in input file
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
|
||||
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // the "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
|
||||
if( line.equalsIgnoreCase( "@!" + covClass.getSimpleName() ) ) { // The "@!" was added by CovariateCounterWalker as a code to recognize covariate class names
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
// some covariates need parameters (user supplied command line arguments) passed to them
|
||||
// Some covariates need parameters (user supplied command line arguments) passed to them
|
||||
if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); }
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins();
|
||||
|
|
@ -179,7 +184,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
}
|
||||
|
||||
} else { // found some data
|
||||
} else { // Found some data
|
||||
if( !foundAllCovariates ) {
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
|
|
@ -194,7 +199,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
dataManager = new RecalDataManager( estimatedCapacity, createCollapsedTables, requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // parse the line and add the data to the HashMap
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -208,12 +213,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
|
||||
// Create the collapsed tables that are used in the sequential calculation
|
||||
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||
logger.info( "...done!" );
|
||||
}
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( requestedCovariates.size(), SMOOTHING );
|
||||
logger.info( "...done!" );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -228,7 +231,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if( iii == 1 ) { // order is different in the old recalibrator
|
||||
if( iii == 1 ) { // Order is different in the old recalibrator unfortunately
|
||||
key.add( cov.getValue( vals[2] ) );
|
||||
} else if ( iii == 2 ) {
|
||||
key.add( cov.getValue( vals[1] ) );
|
||||
|
|
@ -239,7 +242,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
key.add( cov.getValue( vals[iii] ) );
|
||||
}
|
||||
}
|
||||
// Create a new datum using the number of observations and number of mismatches
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ) );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum );
|
||||
|
||||
}
|
||||
|
|
@ -274,13 +279,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// These calls are all expensive so only do them once for each read
|
||||
// Pulling out all of the useful things from the read
|
||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
final String readGroupId = readGroup.getReadGroupId();
|
||||
String platform = readGroup.getPlatform();
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
String readGroupId;
|
||||
String platform;
|
||||
if( readGroup == null ) {
|
||||
if( !warnUserNullReadGroup ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. Defaulting to Illumina platform and empty read group name.");
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
platform = "ILLUMINA";
|
||||
} else {
|
||||
readGroupId = readGroup.getReadGroupId();
|
||||
platform = readGroup.getPlatform();
|
||||
}
|
||||
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||
byte[] bases = read.getReadBases();
|
||||
int startPos = 1;
|
||||
int stopPos = bases.length;
|
||||
|
|
@ -289,11 +303,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
startPos = 0;
|
||||
stopPos = bases.length - 1;
|
||||
}
|
||||
if( USE_SLX_PLATFORM ) {
|
||||
platform = "ILLUMINA";
|
||||
}
|
||||
if( IGNORE_READGROUP ) {
|
||||
readGroupId = ReadGroupCovariate.collapsedReadGroup;
|
||||
}
|
||||
|
||||
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
|
||||
|
||||
// For each base in the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) { // Skip first or last base because there is no dinuc depending on the direction of the read
|
||||
|
||||
// Get the covariate values which make up the key
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
|
|
@ -304,7 +324,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
fullCovariateKey.clear();
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
|
||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so don't change their base quality scores
|
||||
if( platform.equalsIgnoreCase("SOLID") ) {
|
||||
|
|
@ -312,8 +332,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
if(colorSpaceQuals != null) { preserveBadColorSpaceQualities_SOLID( originalQuals, recalQuals, colorSpaceQuals ); }
|
||||
}
|
||||
|
||||
read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read
|
||||
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
|
||||
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
|
|
@ -366,16 +386,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
double deltaQPos = 0.0;
|
||||
double deltaQDinuc = 0.0;
|
||||
for( int iii = 2; iii < key.size(); iii++ ) {
|
||||
collapsedTableKey.add( key.get(iii) ); // the given covariate
|
||||
collapsedTableKey.add( key.get(iii) ); // The given covariate
|
||||
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get(collapsedTableKey);
|
||||
if( deltaQCovariateEmpirical != null ) {
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: only here to validate against the old recalibrator
|
||||
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: Only here to validate against the old recalibrator
|
||||
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
|
||||
}
|
||||
}
|
||||
collapsedTableKey.remove( 2 ); // this new covariate is always added in at position 2 in the collapsedTableKey list
|
||||
collapsedTableKey.remove( 2 ); // This new covariate is always added in at position 2 in the collapsedTableKey list
|
||||
// The collapsedTableKey should be: < ReadGroup, Reported Quality Score, This Covariate >
|
||||
}
|
||||
|
||||
double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
|
|
@ -397,7 +418,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
/**
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
|
|
@ -410,7 +431,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
|
||||
/**
|
||||
* Loop of the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if the color space quality is zero
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param colorSpaceQuals The list of color space quality scores for this read
|
||||
|
|
@ -430,7 +451,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Nothing start the reduce with a new handle to the output bam file
|
||||
* Start the reduce with a new handle to the output bam file
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue