structure refactored throughout for performance improvements

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2036 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-11-13 15:41:09 +00:00
parent cff31f2d06
commit b1376e4216
12 changed files with 262 additions and 214 deletions

View File

@ -38,7 +38,8 @@ import net.sf.samtools.SAMRecord;
public interface Covariate { public interface Covariate {
public static final String COVARIATE_ERROR = "COVARIATE_ERROR"; public static final String COVARIATE_ERROR = "COVARIATE_ERROR";
public static final String COVARIATE_NULL = "COVARITATE_NULL"; public static final String COVARIATE_NULL = "COVARITATE_NULL";
public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); //used to pick out the value from attributes of the read
public Comparable getValue(String str); // used to get value from input file public Comparable getValue(String str); // used to get value from input file
public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap
} }

View File

@ -64,24 +64,29 @@ import net.sf.samtools.SAMRecord;
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> { public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
protected Boolean LIST_ONLY = false; private Boolean LIST_ONLY = false;
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
protected String[] COVARIATES = null; private String[] COVARIATES = null;
@Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score") @Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score")
public int MIN_MAPPING_QUALITY = 1; private int MIN_MAPPING_QUALITY = 1;
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean USE_ORIGINAL_QUALS = false; private boolean USE_ORIGINAL_QUALS = false; // BUGBUG need to pass this value to the proper covariates
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
private String PLATFORM = "SLX";
@Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file") @Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file")
public String RECAL_FILE = "output.recal_data.csv"; private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker") @Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file")
public boolean validateOldRecalibrator = false; private boolean NO_PRINT_HEADER = false;
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="Depricated, no longer replicates previous behavior exactly due to clean up of structure of code. If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker")
private boolean validateOldRecalibrator = false;
protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
protected static ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
//private HashMap<SAMRecord, String> readGroupHashMap; // A hash map that hashes the read object itself into the read group name
private long countedSites = 0; // used for reporting at the end // This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive
private long countedBases = 0; // used for reporting at the end private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long skippedSites = 0; // used for reporting at the end private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
@ -96,8 +101,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
public void initialize() { public void initialize() {
// Get a list of all available covariates // Get a list of all available covariates
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate //int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate
// Print and exit if that's what was requested // Print and exit if that's what was requested
if ( LIST_ONLY ) { if ( LIST_ONLY ) {
@ -117,7 +122,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator
requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() ); requestedCovariates.add( new DinucCovariate() );
estimatedCapacity = 300 * 200 * 40 * 16; //estimatedCapacity = 300 * 200 * 40 * 16;
} else if( COVARIATES != null ) { } else if( COVARIATES != null ) {
int covNumber = 1; int covNumber = 1;
for( String requestedCovariateString : COVARIATES ) { for( String requestedCovariateString : COVARIATES ) {
@ -135,7 +140,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// Now that we've found a matching class, try to instantiate it // Now that we've found a matching class, try to instantiate it
Covariate covariate = (Covariate)covClass.newInstance(); Covariate covariate = (Covariate)covClass.newInstance();
requestedCovariates.add( covariate ); requestedCovariates.add( covariate );
estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity //estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
} catch ( InstantiationException e ) { } catch ( InstantiationException e ) {
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
} catch ( IllegalAccessException e ) { } catch ( IllegalAccessException e ) {
@ -154,13 +159,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new CycleCovariate() ); requestedCovariates.add( new CycleCovariate() );
requestedCovariates.add( new DinucCovariate() ); requestedCovariates.add( new DinucCovariate() );
estimatedCapacity = 300 * 40 * 200 * 16; //estimatedCapacity = 300 * 40 * 200 * 16;
} }
logger.info( "The covariates being used here: " ); logger.info( "The covariates being used here: " );
logger.info( requestedCovariates ); logger.info( requestedCovariates );
dataManager = new RecalDataManager( estimatedCapacity ); //dataManager = new RecalDataManager( estimatedCapacity );
dataManager = new RecalDataManager();
//readGroupHashMap = new HashMap<SAMRecord, String>( 50000000, 0.97f );
} }
@ -180,29 +187,60 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/ */
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
// Only use data from non-dbsnp sites // Only use data from non-dbsnp sites
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality // Assume every mismatch at a non-dbsnp site is indicitive of poor quality
if( dbsnp == null ) { if( dbsnp == null ) {
List<SAMRecord> reads = context.getReads(); final List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets(); final List<Integer> offsets = context.getOffsets();
SAMRecord read; SAMRecord read;
int offset; int offset;
String readGroup;
byte[] quals;
char[] bases;
char refBase;
char prevBase;
final int numReads = reads.size();
// For each read at this locus // For each read at this locus
for( int iii = 0; iii < reads.size(); iii++ ) { for( int iii = 0; iii < numReads; iii++ ) {
read = reads.get(iii); read = reads.get(iii);
offset = offsets.get(iii);
// Only use data from reads with mapping quality above specified quality value
// Only use data from reads with mapping quality above specified quality value and base quality greater than zero if( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
byte[] quals = read.getBaseQualities();
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0 ) //readGroup = readGroupHashMap.get( read );
{ //if( readGroup == null ) { // read is not in the hashmap so add it
// Skip first and last bases because they don't have a dinuc count // readGroup = read.getReadGroup().getReadGroupId();
if( offset > 0 && offset < (read.getReadLength() - 1) ) { // readGroupHashMap.put( read, readGroup );
updateDataFromRead(read, offset, ref); //}
}
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
// skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases
if( offset > 0 && offset < read.getReadLength() - 1) {
quals = read.getBaseQualities();
// skip if base quality is zero
if( quals[offset] > 0 ) {
bases = read.getReadString().toCharArray();
refBase = ref.getBase();
prevBase = bases[offset-1];
// Get the complement base strand if we are a negative strand read
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray();
refBase = BaseUtils.simpleComplement( refBase );
prevBase = bases[offset+1];
}
// skip if this base or the previous one was an 'N' or etc.
if( BaseUtils.isRegularBase(prevBase) && BaseUtils.isRegularBase(bases[offset]) ) {
readGroup = read.getReadGroup().getReadGroupId();
updateDataFromRead( read, offset, readGroup, quals, bases, refBase );
}
}
}
} }
} }
@ -220,61 +258,41 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Loop through the list of requested covariates and pick out the value from the read, offset, and reference
* Using the list of covariate values as a key, pick out the RecalDatum and increment, * Using the list of covariate values as a key, pick out the RecalDatum and increment,
* adding one to the number of observations and potentially one to the number of mismatches * adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation.
* @param read The read * @param read The read
* @param offset The offset in the read for this locus * @param offset The offset in the read for this locus
* @param ref The reference context * @param readGroup The read group the read is in
* @param quals List of base quality scores
* @param bases The bases which make up the read
* @param refBase The reference base at this locus
*/ */
private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) { private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) {
List<Comparable> key = new ArrayList<Comparable>(); List<Comparable> key = new ArrayList<Comparable>();
Comparable keyElement;
boolean badKey = false;
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference // Loop through the list of requested covariates and pick out the value from the read, offset, and reference
for( Covariate covariate : requestedCovariates ) { for( Covariate covariate : requestedCovariates ) {
keyElement = covariate.getValue( read, offset, ref.getBases() ); key.add( covariate.getValue( read, offset, readGroup, quals, bases, refBase ) );
if( keyElement != Covariate.COVARIATE_ERROR && keyElement != Covariate.COVARIATE_NULL) {
key.add( keyElement );
} else {
badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N'
}
} }
// Using the list of covariate values as a key, pick out the RecalDatum // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
RecalDatum datum = null; RecalDatum datum = dataManager.data.get( key );
if( !badKey ) { if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = dataManager.data.get( key ); datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( validateOldRecalibrator ) {
dataManager.data.myPut( key, datum );
} else {
dataManager.data.put( key, datum );
}
}
}
// Need the bases to determine whether or not we have a mismatch
byte[] bases = read.getReadBases();
char base = (char)bases[offset];
char refBase = ref.getBase();
// Get the complement base strand if we are a negative direction read
if ( read.getReadNegativeStrandFlag() ) {
refBase = BaseUtils.simpleComplement( refBase );
base = BaseUtils.simpleComplement( base );
}
if( datum != null ) {
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( base, refBase );
countedBases++;
} else {
if( validateOldRecalibrator ) { if( validateOldRecalibrator ) {
countedBases++; // This line here to replicate behavior in the old recalibrator dataManager.data.myPut( key, datum );
// (reads with bad covariates [prev_base = 'N', for example] were properly tossed out but this count was still incremented) } else {
dataManager.data.put( key, datum );
} }
} }
// Need the bases to determine whether or not we have a mismatch
char base = bases[offset];
// Add one to the number of observations and potentially one to the number of mismatches
datum.increment( base, refBase );
countedBases++;
} }
@ -322,11 +340,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
* @param recalTableStream The PrintStream to write out to * @param recalTableStream The PrintStream to write out to
*/ */
private void outputToCSV( PrintStream recalTableStream ) { private void outputToCSV( final PrintStream recalTableStream ) {
if( validateOldRecalibrator ) { if( validateOldRecalibrator ) {
boolean collapsePos = false; final boolean collapsePos = false;
boolean collapseDinuc = false; final boolean collapseDinuc = false;
recalTableStream.printf("# collapsed_pos %b%n", collapsePos); recalTableStream.printf("# collapsed_pos %b%n", collapsePos);
recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc); recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc);
recalTableStream.printf("# counted_sites %d%n", countedSites); recalTableStream.printf("# counted_sites %d%n", countedSites);
@ -344,13 +362,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
recalTableStream.println( entry.getSecond().outputToCSV() ); recalTableStream.println( entry.getSecond().outputToCSV() );
} }
} else { } else {
recalTableStream.printf("# Counted Sites %d%n", countedSites); if( !NO_PRINT_HEADER ) {
recalTableStream.printf("# Counted Bases %d%n", countedBases); recalTableStream.printf("# Counted Sites %d%n", countedSites);
recalTableStream.printf("# Skipped Sites %d%n", skippedSites); recalTableStream.printf("# Counted Bases %d%n", countedBases);
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
for( Covariate cov : requestedCovariates ) { recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name for( Covariate cov : requestedCovariates ) {
recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
}
} }
// For each entry in the data hashmap // For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) { for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import org.broadinstitute.sting.utils.StingException;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
/* /*
@ -43,30 +45,39 @@ import net.sf.samtools.SAMRecord;
public class CycleCovariate implements Covariate { public class CycleCovariate implements Covariate {
public String platform; private String platform;
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
platform = null; platform = "SLX";
} }
public CycleCovariate(String _platform) { public CycleCovariate(final String _platform) {
platform = _platform; platform = _platform;
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
//BUGBUG: assumes Solexa platform final byte[] quals, final char[] bases, final char refBase) {
Integer cycle = offset; if( platform.equalsIgnoreCase( "SLX" ) ) {
if( read.getReadNegativeStrandFlag() ) { int cycle = offset;
cycle = read.getReadLength() - (offset + 1); if( read.getReadNegativeStrandFlag() ) {
cycle = bases.length - (offset + 1);
}
return cycle;
} else if( platform.equalsIgnoreCase( "454" ) ) {
return 0; //BUGBUG: not yet implemented
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
return 0; //BUGBUG: not yet implemented
} else {
throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." );
} }
return cycle;
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return Integer.parseInt( str ); return (int)Integer.parseInt( str );
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 100; return 100;
} }

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
@ -40,35 +41,31 @@ import org.broadinstitute.sting.utils.BaseUtils;
public class DinucCovariate implements Covariate { public class DinucCovariate implements Covariate {
private String returnString;
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
byte[] bases = read.getReadBases(); final byte[] quals, final char[] bases, final char refBase) {
char base = (char)bases[offset];
char prevBase; // value set below char base = bases[offset];
// If it is a negative strand read then we need to get the complement base and reverse the direction for our previous base char prevBase = bases[offset - 1];
// If it is a negative strand read then we need to reverse the direction for our previous base
if( read.getReadNegativeStrandFlag() ) { if( read.getReadNegativeStrandFlag() ) {
base = BaseUtils.simpleComplement(base); prevBase = bases[offset + 1];
if ( offset + 1 > bases.length - 1 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read }
prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] ); final char[] charArray = {prevBase, base};
} else { returnString = String.valueOf(charArray);
if ( offset - 1 < 0 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read return returnString;
prevBase = (char)bases[offset - 1]; //return String.format("%c%c", prevBase, base); // This return statement is too slow
}
// Check if bad base, probably an 'N'
if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) {
return Covariate.COVARIATE_NULL; // CovariateCounterWalker will recognize that null means skip this particular location in the read
} else {
return String.format("%c%c", prevBase, base);
}
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return str; return str;
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 16; return 16;
} }

View File

@ -40,15 +40,17 @@ public class MappingQualityCovariate implements Covariate {
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code final byte[] quals, final char[] bases, final char refBase) {
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 100; return 100;
} }

View File

@ -41,26 +41,28 @@ import org.broadinstitute.sting.utils.QualityUtils;
public class MinimumNQSCovariate implements Covariate { public class MinimumNQSCovariate implements Covariate {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
protected boolean USE_ORIGINAL_QUALS; private boolean USE_ORIGINAL_QUALS;
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
USE_ORIGINAL_QUALS = false; USE_ORIGINAL_QUALS = false;
} }
public MinimumNQSCovariate(boolean originalQuals) { public MinimumNQSCovariate(final boolean originalQuals) {
USE_ORIGINAL_QUALS = originalQuals; USE_ORIGINAL_QUALS = originalQuals;
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
byte[] quals = read.getBaseQualities(); final byte[] quals, final char[] bases, final char refBase) {
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); //BUGBUG: Original qualities
if ( obj instanceof String ) //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
quals = QualityUtils.fastqToPhred((String)obj); // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
else { // if ( obj instanceof String )
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); // quals = QualityUtils.fastqToPhred((String)obj);
} // else {
} // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
// }
//}
// Loop over the list of qualities and find the minimum // Loop over the list of qualities and find the minimum
Integer minQual = (int)(quals[0]); Integer minQual = (int)(quals[0]);
@ -72,11 +74,11 @@ public class MinimumNQSCovariate implements Covariate {
return minQual; return minQual;
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 40; return 40;
} }

View File

@ -38,7 +38,7 @@ import java.util.*;
public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> { public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here? private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse
private ArrayList<ArrayList<Comparable>> keyLists; private ArrayList<ArrayList<Comparable>> keyLists;
public NHashMap() { public NHashMap() {

View File

@ -45,29 +45,32 @@ public class QualityScoreCovariate implements Covariate {
USE_ORIGINAL_QUALS = false; USE_ORIGINAL_QUALS = false;
} }
public QualityScoreCovariate(boolean originalQuals) { public QualityScoreCovariate(final boolean originalQuals) {
USE_ORIGINAL_QUALS = originalQuals; USE_ORIGINAL_QUALS = originalQuals;
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
byte[] quals = read.getBaseQualities(); final byte[] quals, final char[] bases, final char refBase) {
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); //BUGBUG: original qualities
if ( obj instanceof String ) //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
quals = QualityUtils.fastqToPhred((String)obj); // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
else { // if ( obj instanceof String )
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); // quals = QualityUtils.fastqToPhred((String)obj);
} // else {
} // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
// }
return ((Integer)((int)quals[offset])); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code //}
//Integer returnQual = (int)quals[offset];
return (int)quals[offset]; // returning an Integer Object (as opposed to primitive int) is required so that return values from the two getValue methods hash to the same code
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return Integer.parseInt( str ); return (int)Integer.parseInt( str );
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 40; return 40;
} }

View File

@ -40,15 +40,16 @@ public class ReadGroupCovariate implements Covariate{
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
} }
public Comparable getValue(SAMRecord read, int offset, char[] refBases) { public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
return read.getReadGroup().getReadGroupId(); final byte[] quals, final char[] bases, final char refBase) {
return readGroup;
} }
public Comparable getValue(String str) { public final Comparable getValue(final String str) {
return str; return str;
} }
public int estimatedNumberOfBins() { public final int estimatedNumberOfBins() {
return 300; return 300;
} }

View File

@ -38,12 +38,17 @@ import java.util.*;
public class RecalDataManager { public class RecalDataManager {
public NHashMap<RecalDatum> data; // the full dataset public NHashMap<RecalDatum> data; // the full dataset
public NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
public NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
public ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
public boolean collapsedTablesCreated; private boolean collapsedTablesCreated;
public NHashMap<Double> dataSumExpectedErrors; public NHashMap<Double> dataSumExpectedErrors;
RecalDataManager() {
data = new NHashMap<RecalDatum>();
collapsedTablesCreated = false;
}
RecalDataManager( int estimatedCapacity ) { RecalDataManager( int estimatedCapacity ) {
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.95f ); // second arg is the 'loading factor', data = new NHashMap<RecalDatum>( estimatedCapacity, 0.95f ); // second arg is the 'loading factor',
// a number to monkey around with when optimizing performace of the HashMap // a number to monkey around with when optimizing performace of the HashMap
@ -51,7 +56,7 @@ public class RecalDataManager {
} }
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker // BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
public void createCollapsedTables( int numCovariates ) { public final void createCollapsedTables( final int numCovariates ) {
dataCollapsedReadGroup = new NHashMap<RecalDatum>(); dataCollapsedReadGroup = new NHashMap<RecalDatum>();
dataCollapsedQualityScore = new NHashMap<RecalDatum>(); dataCollapsedQualityScore = new NHashMap<RecalDatum>();
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>(); dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
@ -122,7 +127,7 @@ public class RecalDataManager {
collapsedTablesCreated = true; collapsedTablesCreated = true;
} }
public NHashMap<RecalDatum> getCollapsedTable( int covariate ) { public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
if( !collapsedTablesCreated ) { if( !collapsedTablesCreated ) {
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound."); throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
} }

View File

@ -38,8 +38,8 @@ import java.util.*;
public class RecalDatum { public class RecalDatum {
long numObservations; // number of bases seen in total private long numObservations; // number of bases seen in total
long numMismatches; // number of bases seen that didn't match the reference private long numMismatches; // number of bases seen that didn't match the reference
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -52,12 +52,12 @@ public class RecalDatum {
numMismatches = 0L; numMismatches = 0L;
} }
public RecalDatum( long _numObservations, long _numMismatches ) { public RecalDatum( final long _numObservations, final long _numMismatches ) {
numObservations = _numObservations; numObservations = _numObservations;
numMismatches = _numMismatches; numMismatches = _numMismatches;
} }
public RecalDatum( RecalDatum copy ) { public RecalDatum( final RecalDatum copy ) {
this.numObservations = copy.numObservations; this.numObservations = copy.numObservations;
this.numMismatches = copy.numMismatches; this.numMismatches = copy.numMismatches;
} }
@ -69,22 +69,22 @@ public class RecalDatum {
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public void increment( long incObservations, long incMismatches ) { public final void increment( final long incObservations, final long incMismatches ) {
numObservations += incObservations; numObservations += incObservations;
numMismatches += incMismatches; numMismatches += incMismatches;
} }
public void increment( RecalDatum other ) { public final void increment( final RecalDatum other ) {
increment( other.numObservations, other.numMismatches ); increment( other.numObservations, other.numMismatches );
} }
public void increment( List<RecalDatum> data ) { public final void increment( final List<RecalDatum> data ) {
for ( RecalDatum other : data ) { for ( RecalDatum other : data ) {
this.increment( other ); this.increment( other );
} }
} }
public void increment( char curBase, char ref ) { public final void increment( final char curBase, final char ref ) {
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches
} }
@ -94,22 +94,22 @@ public class RecalDatum {
// //
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public double empiricalQualDouble( int smoothing ) { public final double empiricalQualDouble( final int smoothing ) {
double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleMismatches = (double) ( numMismatches + smoothing );
double doubleObservations = (double) ( numObservations + smoothing ); double doubleObservations = (double) ( numObservations + smoothing );
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
return empiricalQual; return empiricalQual;
} }
public double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
public byte empiricalQualByte( int smoothing ) { public final byte empiricalQualByte( final int smoothing ) {
double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleMismatches = (double) ( numMismatches + smoothing );
double doubleObservations = (double) ( numObservations + smoothing ); double doubleObservations = (double) ( numObservations + smoothing );
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations );
} }
public byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
@ -117,14 +117,14 @@ public class RecalDatum {
// //
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
public String outputToCSV( ) { public final String outputToCSV( ) {
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
} }
public String outputToCSV( int smoothing ) { public final String outputToCSV( final int smoothing ) {
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) ); return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
} }
public Long getNumObservations() { public final Long getNumObservations() {
return numObservations; return numObservations;
} }

View File

@ -67,15 +67,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
public int SMOOTHING = 1; public int SMOOTHING = 1;
public enum RecalibrationMode { //public enum RecalibrationMode {
COMBINATORIAL, // COMBINATORIAL,
SEQUENTIAL, // SEQUENTIAL,
ERROR // ERROR
} //}
//@Argument(fullName="RecalibrationMode", shortName="mode", doc="which type of calculation to use when recalibrating, default is SEQUENTIAL", required=false) @Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
//public String MODE_STRING = RecalibrationMode.SEQUENTIAL.toString(); public String MODE_STRING = "SEQUENTIAL";
public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes? //public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
protected RecalDataManager dataManager; protected RecalDataManager dataManager;
protected ArrayList<Covariate> requestedCovariates; protected ArrayList<Covariate> requestedCovariates;
@ -165,7 +165,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
logger.info( "...done!" ); logger.info( "...done!" );
// Create the collapsed tables that are used in the sequential calculation // Create the collapsed tables that are used in the sequential calculation
if( MODE == RecalibrationMode.SEQUENTIAL ) { if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
logger.info( "Creating collapsed tables for use in sequential calculation..." ); logger.info( "Creating collapsed tables for use in sequential calculation..." );
dataManager.createCollapsedTables( requestedCovariates.size() ); dataManager.createCollapsedTables( requestedCovariates.size() );
logger.info( "...done!" ); logger.info( "...done!" );
@ -203,6 +203,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/ */
public SAMRecord map( char[] refBases, SAMRecord read ) { public SAMRecord map( char[] refBases, SAMRecord read ) {
if( refBases == null ) {
return read; // early return here, unmapped reads should be left alone
}
byte[] originalQuals = read.getBaseQualities(); byte[] originalQuals = read.getBaseQualities();
// Check if we need to use the original quality scores instead // Check if we need to use the original quality scores instead
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
@ -215,36 +219,37 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} }
byte[] recalQuals = originalQuals.clone(); byte[] recalQuals = originalQuals.clone();
// These calls are expensive so only do them once for each read
String readGroup = read.getReadGroup().getReadGroupId();
char[] bases = read.getReadString().toCharArray();
String myRefBases = new String(refBases);
if( read.getReadNegativeStrandFlag() ) {
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray();
myRefBases = BaseUtils.simpleComplement( myRefBases );
}
// For each base in the read // For each base in the read
for( int iii = 0; iii < read.getReadLength(); iii++ ) { for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last bases because there is no dinuc
List<Comparable> key = new ArrayList<Comparable>(); List<Comparable> key = new ArrayList<Comparable>();
boolean badKey = false;
Comparable keyElement;
for( Covariate covariate : requestedCovariates ) { for( Covariate covariate : requestedCovariates ) {
keyElement = covariate.getValue( read, iii, refBases ); key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases, myRefBases.charAt(iii) ) ); // offset is zero based so passing iii is correct here // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator
if( keyElement != Covariate.COVARIATE_ERROR ) { // COVARIATE_NULL is okay because the sequential calculation will do the best it can with the valid data it has
key.add( keyElement ); // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator
} else {
badKey = true;
}
} }
if( !badKey ) { if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) {
if( MODE == RecalibrationMode.COMBINATORIAL ) { RecalDatum datum = dataManager.data.get( key );
RecalDatum datum = dataManager.data.get( key ); if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
}
} else if( MODE == RecalibrationMode.SEQUENTIAL ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE );
} }
} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
recalQuals[iii] = performSequentialQualityCalculation( key );
} else {
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
}
// Do some error checking on the new quality score // Do some error checking on the new quality score
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
}
} }
} }
@ -254,6 +259,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals)); read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
} }
return read; return read;
} }