structure refactored throughout for performance improvements
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2036 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cff31f2d06
commit
b1376e4216
|
|
@ -38,7 +38,8 @@ import net.sf.samtools.SAMRecord;
|
|||
public interface Covariate {
|
||||
public static final String COVARIATE_ERROR = "COVARIATE_ERROR";
|
||||
public static final String COVARIATE_NULL = "COVARITATE_NULL";
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); //used to pick out the value from attributes of the read
|
||||
public Comparable getValue(String str); // used to get value from input file
|
||||
public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,24 +64,29 @@ import net.sf.samtools.SAMRecord;
|
|||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
protected Boolean LIST_ONLY = false;
|
||||
private Boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false)
|
||||
protected String[] COVARIATES = null;
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score")
|
||||
public int MIN_MAPPING_QUALITY = 1;
|
||||
private int MIN_MAPPING_QUALITY = 1;
|
||||
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
|
||||
public boolean USE_ORIGINAL_QUALS = false;
|
||||
private boolean USE_ORIGINAL_QUALS = false; // BUGBUG need to pass this value to the proper covariates
|
||||
@Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false)
|
||||
private String PLATFORM = "SLX";
|
||||
@Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
public String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker")
|
||||
public boolean validateOldRecalibrator = false;
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file")
|
||||
private boolean NO_PRINT_HEADER = false;
|
||||
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="Depricated, no longer replicates previous behavior exactly due to clean up of structure of code. If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker")
|
||||
private boolean validateOldRecalibrator = false;
|
||||
|
||||
protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
protected static ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
|
||||
private long countedSites = 0; // used for reporting at the end
|
||||
private long countedBases = 0; // used for reporting at the end
|
||||
private long skippedSites = 0; // used for reporting at the end
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
//private HashMap<SAMRecord, String> readGroupHashMap; // A hash map that hashes the read object itself into the read group name
|
||||
// This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive
|
||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -96,8 +101,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
public void initialize() {
|
||||
|
||||
// Get a list of all available covariates
|
||||
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate
|
||||
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
|
||||
//int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
|
|
@ -117,7 +122,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
estimatedCapacity = 300 * 200 * 40 * 16;
|
||||
//estimatedCapacity = 300 * 200 * 40 * 16;
|
||||
} else if( COVARIATES != null ) {
|
||||
int covNumber = 1;
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
|
|
@ -135,7 +140,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
// Now that we've found a matching class, try to instantiate it
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
|
||||
//estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
|
|
@ -154,13 +159,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
estimatedCapacity = 300 * 40 * 200 * 16;
|
||||
//estimatedCapacity = 300 * 40 * 200 * 16;
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
logger.info( requestedCovariates );
|
||||
|
||||
dataManager = new RecalDataManager( estimatedCapacity );
|
||||
//dataManager = new RecalDataManager( estimatedCapacity );
|
||||
dataManager = new RecalDataManager();
|
||||
//readGroupHashMap = new HashMap<SAMRecord, String>( 50000000, 0.97f );
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -180,29 +187,60 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null));
|
||||
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicitive of poor quality
|
||||
if( dbsnp == null ) {
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
final List<SAMRecord> reads = context.getReads();
|
||||
final List<Integer> offsets = context.getOffsets();
|
||||
SAMRecord read;
|
||||
int offset;
|
||||
|
||||
String readGroup;
|
||||
byte[] quals;
|
||||
char[] bases;
|
||||
char refBase;
|
||||
char prevBase;
|
||||
|
||||
final int numReads = reads.size();
|
||||
// For each read at this locus
|
||||
for( int iii = 0; iii < reads.size(); iii++ ) {
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
read = reads.get(iii);
|
||||
offset = offsets.get(iii);
|
||||
|
||||
// Only use data from reads with mapping quality above specified quality value and base quality greater than zero
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0 )
|
||||
{
|
||||
// Skip first and last bases because they don't have a dinuc count
|
||||
if( offset > 0 && offset < (read.getReadLength() - 1) ) {
|
||||
updateDataFromRead(read, offset, ref);
|
||||
}
|
||||
|
||||
// Only use data from reads with mapping quality above specified quality value
|
||||
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
|
||||
|
||||
//readGroup = readGroupHashMap.get( read );
|
||||
//if( readGroup == null ) { // read is not in the hashmap so add it
|
||||
// readGroup = read.getReadGroup().getReadGroupId();
|
||||
// readGroupHashMap.put( read, readGroup );
|
||||
//}
|
||||
|
||||
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
|
||||
|
||||
// skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases
|
||||
if( offset > 0 && offset < read.getReadLength() - 1) {
|
||||
|
||||
quals = read.getBaseQualities();
|
||||
// skip if base quality is zero
|
||||
if( quals[offset] > 0 ) {
|
||||
bases = read.getReadString().toCharArray();
|
||||
refBase = ref.getBase();
|
||||
prevBase = bases[offset-1];
|
||||
// Get the complement base strand if we are a negative strand read
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray();
|
||||
refBase = BaseUtils.simpleComplement( refBase );
|
||||
prevBase = bases[offset+1];
|
||||
}
|
||||
|
||||
// skip if this base or the previous one was an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase(prevBase) && BaseUtils.isRegularBase(bases[offset]) ) {
|
||||
readGroup = read.getReadGroup().getReadGroupId();
|
||||
updateDataFromRead( read, offset, readGroup, quals, bases, refBase );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -220,61 +258,41 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param read The read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param ref The reference context
|
||||
* @param readGroup The read group the read is in
|
||||
* @param quals List of base quality scores
|
||||
* @param bases The bases which make up the read
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) {
|
||||
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) {
|
||||
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
Comparable keyElement;
|
||||
boolean badKey = false;
|
||||
|
||||
|
||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
keyElement = covariate.getValue( read, offset, ref.getBases() );
|
||||
if( keyElement != Covariate.COVARIATE_ERROR && keyElement != Covariate.COVARIATE_NULL) {
|
||||
key.add( keyElement );
|
||||
} else {
|
||||
badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N'
|
||||
}
|
||||
key.add( covariate.getValue( read, offset, readGroup, quals, bases, refBase ) );
|
||||
}
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum
|
||||
RecalDatum datum = null;
|
||||
if( !badKey ) {
|
||||
datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( validateOldRecalibrator ) {
|
||||
dataManager.data.myPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
byte[] bases = read.getReadBases();
|
||||
|
||||
char base = (char)bases[offset];
|
||||
char refBase = ref.getBase();
|
||||
// Get the complement base strand if we are a negative direction read
|
||||
if ( read.getReadNegativeStrandFlag() ) {
|
||||
refBase = BaseUtils.simpleComplement( refBase );
|
||||
base = BaseUtils.simpleComplement( base );
|
||||
}
|
||||
|
||||
if( datum != null ) {
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( base, refBase );
|
||||
countedBases++;
|
||||
} else {
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
if( validateOldRecalibrator ) {
|
||||
countedBases++; // This line here to replicate behavior in the old recalibrator
|
||||
// (reads with bad covariates [prev_base = 'N', for example] were properly tossed out but this count was still incremented)
|
||||
dataManager.data.myPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
char base = bases[offset];
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( base, refBase );
|
||||
countedBases++;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -322,11 +340,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( PrintStream recalTableStream ) {
|
||||
private void outputToCSV( final PrintStream recalTableStream ) {
|
||||
|
||||
if( validateOldRecalibrator ) {
|
||||
boolean collapsePos = false;
|
||||
boolean collapseDinuc = false;
|
||||
final boolean collapsePos = false;
|
||||
final boolean collapseDinuc = false;
|
||||
recalTableStream.printf("# collapsed_pos %b%n", collapsePos);
|
||||
recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
recalTableStream.printf("# counted_sites %d%n", countedSites);
|
||||
|
|
@ -344,13 +362,15 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
recalTableStream.println( entry.getSecond().outputToCSV() );
|
||||
}
|
||||
} else {
|
||||
recalTableStream.printf("# Counted Sites %d%n", countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
|
||||
if( !NO_PRINT_HEADER ) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
|
||||
}
|
||||
}
|
||||
// For each entry in the data hashmap
|
||||
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/*
|
||||
|
|
@ -43,30 +45,39 @@ import net.sf.samtools.SAMRecord;
|
|||
|
||||
public class CycleCovariate implements Covariate {
|
||||
|
||||
public String platform;
|
||||
private String platform;
|
||||
|
||||
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
platform = null;
|
||||
platform = "SLX";
|
||||
}
|
||||
|
||||
public CycleCovariate(String _platform) {
|
||||
platform = _platform;
|
||||
public CycleCovariate(final String _platform) {
|
||||
platform = _platform;
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
//BUGBUG: assumes Solexa platform
|
||||
Integer cycle = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
if( platform.equalsIgnoreCase( "SLX" ) ) {
|
||||
int cycle = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
cycle = bases.length - (offset + 1);
|
||||
}
|
||||
return cycle;
|
||||
} else if( platform.equalsIgnoreCase( "454" ) ) {
|
||||
return 0; //BUGBUG: not yet implemented
|
||||
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
|
||||
return 0; //BUGBUG: not yet implemented
|
||||
} else {
|
||||
throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." );
|
||||
}
|
||||
return cycle;
|
||||
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
return Integer.parseInt( str );
|
||||
public final Comparable getValue(final String str) {
|
||||
return (int)Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
|||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -40,35 +41,31 @@ import org.broadinstitute.sting.utils.BaseUtils;
|
|||
|
||||
public class DinucCovariate implements Covariate {
|
||||
|
||||
private String returnString;
|
||||
|
||||
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
byte[] bases = read.getReadBases();
|
||||
char base = (char)bases[offset];
|
||||
char prevBase; // value set below
|
||||
// If it is a negative strand read then we need to get the complement base and reverse the direction for our previous base
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
|
||||
char base = bases[offset];
|
||||
char prevBase = bases[offset - 1];
|
||||
// If it is a negative strand read then we need to reverse the direction for our previous base
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
base = BaseUtils.simpleComplement(base);
|
||||
if ( offset + 1 > bases.length - 1 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read
|
||||
prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] );
|
||||
} else {
|
||||
if ( offset - 1 < 0 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read
|
||||
prevBase = (char)bases[offset - 1];
|
||||
}
|
||||
// Check if bad base, probably an 'N'
|
||||
if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) {
|
||||
return Covariate.COVARIATE_NULL; // CovariateCounterWalker will recognize that null means skip this particular location in the read
|
||||
} else {
|
||||
return String.format("%c%c", prevBase, base);
|
||||
}
|
||||
prevBase = bases[offset + 1];
|
||||
}
|
||||
final char[] charArray = {prevBase, base};
|
||||
returnString = String.valueOf(charArray);
|
||||
return returnString;
|
||||
//return String.format("%c%c", prevBase, base); // This return statement is too slow
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
public final Comparable getValue(final String str) {
|
||||
return str;
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 16;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,15 +40,17 @@ public class MappingQualityCovariate implements Covariate {
|
|||
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
|
||||
return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -41,26 +41,28 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
public class MinimumNQSCovariate implements Covariate {
|
||||
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
|
||||
protected boolean USE_ORIGINAL_QUALS;
|
||||
private boolean USE_ORIGINAL_QUALS;
|
||||
|
||||
public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
USE_ORIGINAL_QUALS = false;
|
||||
}
|
||||
|
||||
public MinimumNQSCovariate(boolean originalQuals) {
|
||||
public MinimumNQSCovariate(final boolean originalQuals) {
|
||||
USE_ORIGINAL_QUALS = originalQuals;
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
|
||||
//BUGBUG: Original qualities
|
||||
//if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
// Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
// if ( obj instanceof String )
|
||||
// quals = QualityUtils.fastqToPhred((String)obj);
|
||||
// else {
|
||||
// throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
// }
|
||||
//}
|
||||
|
||||
// Loop over the list of qualities and find the minimum
|
||||
Integer minQual = (int)(quals[0]);
|
||||
|
|
@ -72,11 +74,11 @@ public class MinimumNQSCovariate implements Covariate {
|
|||
return minQual;
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 40;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ import java.util.*;
|
|||
|
||||
public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
||||
|
||||
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?
|
||||
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse
|
||||
private ArrayList<ArrayList<Comparable>> keyLists;
|
||||
|
||||
public NHashMap() {
|
||||
|
|
|
|||
|
|
@ -45,29 +45,32 @@ public class QualityScoreCovariate implements Covariate {
|
|||
USE_ORIGINAL_QUALS = false;
|
||||
}
|
||||
|
||||
public QualityScoreCovariate(boolean originalQuals) {
|
||||
public QualityScoreCovariate(final boolean originalQuals) {
|
||||
USE_ORIGINAL_QUALS = originalQuals;
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if ( obj instanceof String )
|
||||
quals = QualityUtils.fastqToPhred((String)obj);
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
|
||||
return ((Integer)((int)quals[offset])); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
|
||||
//BUGBUG: original qualities
|
||||
//if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
// Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
// if ( obj instanceof String )
|
||||
// quals = QualityUtils.fastqToPhred((String)obj);
|
||||
// else {
|
||||
// throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
// }
|
||||
//}
|
||||
|
||||
//Integer returnQual = (int)quals[offset];
|
||||
return (int)quals[offset]; // returning an Integer Object (as opposed to primitive int) is required so that return values from the two getValue methods hash to the same code
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
return Integer.parseInt( str );
|
||||
public final Comparable getValue(final String str) {
|
||||
return (int)Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 40;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,15 +40,16 @@ public class ReadGroupCovariate implements Covariate{
|
|||
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
return read.getReadGroup().getReadGroupId();
|
||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup,
|
||||
final byte[] quals, final char[] bases, final char refBase) {
|
||||
return readGroup;
|
||||
}
|
||||
|
||||
public Comparable getValue(String str) {
|
||||
public final Comparable getValue(final String str) {
|
||||
return str;
|
||||
}
|
||||
|
||||
public int estimatedNumberOfBins() {
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 300;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -38,12 +38,17 @@ import java.util.*;
|
|||
|
||||
public class RecalDataManager {
|
||||
public NHashMap<RecalDatum> data; // the full dataset
|
||||
public NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
public NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
public ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
public boolean collapsedTablesCreated;
|
||||
private NHashMap<RecalDatum> dataCollapsedReadGroup; // table where everything except read group has been collapsed
|
||||
private NHashMap<RecalDatum> dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed
|
||||
private ArrayList<NHashMap<RecalDatum>> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
private boolean collapsedTablesCreated;
|
||||
public NHashMap<Double> dataSumExpectedErrors;
|
||||
|
||||
RecalDataManager() {
|
||||
data = new NHashMap<RecalDatum>();
|
||||
collapsedTablesCreated = false;
|
||||
}
|
||||
|
||||
RecalDataManager( int estimatedCapacity ) {
|
||||
data = new NHashMap<RecalDatum>( estimatedCapacity, 0.95f ); // second arg is the 'loading factor',
|
||||
// a number to monkey around with when optimizing performace of the HashMap
|
||||
|
|
@ -51,7 +56,7 @@ public class RecalDataManager {
|
|||
}
|
||||
|
||||
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
|
||||
public void createCollapsedTables( int numCovariates ) {
|
||||
public final void createCollapsedTables( final int numCovariates ) {
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
|
|
@ -122,7 +127,7 @@ public class RecalDataManager {
|
|||
collapsedTablesCreated = true;
|
||||
}
|
||||
|
||||
public NHashMap<RecalDatum> getCollapsedTable( int covariate ) {
|
||||
public final NHashMap<RecalDatum> getCollapsedTable( final int covariate ) {
|
||||
if( !collapsedTablesCreated ) {
|
||||
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,8 +38,8 @@ import java.util.*;
|
|||
|
||||
public class RecalDatum {
|
||||
|
||||
long numObservations; // number of bases seen in total
|
||||
long numMismatches; // number of bases seen that didn't match the reference
|
||||
private long numObservations; // number of bases seen in total
|
||||
private long numMismatches; // number of bases seen that didn't match the reference
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -52,12 +52,12 @@ public class RecalDatum {
|
|||
numMismatches = 0L;
|
||||
}
|
||||
|
||||
public RecalDatum( long _numObservations, long _numMismatches ) {
|
||||
public RecalDatum( final long _numObservations, final long _numMismatches ) {
|
||||
numObservations = _numObservations;
|
||||
numMismatches = _numMismatches;
|
||||
}
|
||||
|
||||
public RecalDatum( RecalDatum copy ) {
|
||||
public RecalDatum( final RecalDatum copy ) {
|
||||
this.numObservations = copy.numObservations;
|
||||
this.numMismatches = copy.numMismatches;
|
||||
}
|
||||
|
|
@ -69,22 +69,22 @@ public class RecalDatum {
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
public void increment( long incObservations, long incMismatches ) {
|
||||
public final void increment( final long incObservations, final long incMismatches ) {
|
||||
numObservations += incObservations;
|
||||
numMismatches += incMismatches;
|
||||
}
|
||||
|
||||
public void increment( RecalDatum other ) {
|
||||
public final void increment( final RecalDatum other ) {
|
||||
increment( other.numObservations, other.numMismatches );
|
||||
}
|
||||
|
||||
public void increment( List<RecalDatum> data ) {
|
||||
public final void increment( final List<RecalDatum> data ) {
|
||||
for ( RecalDatum other : data ) {
|
||||
this.increment( other );
|
||||
}
|
||||
}
|
||||
|
||||
public void increment( char curBase, char ref ) {
|
||||
public final void increment( final char curBase, final char ref ) {
|
||||
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||
}
|
||||
|
||||
|
|
@ -94,22 +94,22 @@ public class RecalDatum {
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public double empiricalQualDouble( int smoothing ) {
|
||||
public final double empiricalQualDouble( final int smoothing ) {
|
||||
double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
double doubleObservations = (double) ( numObservations + smoothing );
|
||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE;
|
||||
return empiricalQual;
|
||||
}
|
||||
public double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
|
||||
public byte empiricalQualByte( int smoothing ) {
|
||||
public final byte empiricalQualByte( final int smoothing ) {
|
||||
double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
double doubleObservations = (double) ( numObservations + smoothing );
|
||||
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations );
|
||||
}
|
||||
public byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -117,14 +117,14 @@ public class RecalDatum {
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public String outputToCSV( ) {
|
||||
public final String outputToCSV( ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||
}
|
||||
public String outputToCSV( int smoothing ) {
|
||||
public final String outputToCSV( final int smoothing ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) );
|
||||
}
|
||||
|
||||
public Long getNumObservations() {
|
||||
public final Long getNumObservations() {
|
||||
return numObservations;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -67,15 +67,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
public int SMOOTHING = 1;
|
||||
|
||||
public enum RecalibrationMode {
|
||||
COMBINATORIAL,
|
||||
SEQUENTIAL,
|
||||
ERROR
|
||||
}
|
||||
//public enum RecalibrationMode {
|
||||
// COMBINATORIAL,
|
||||
// SEQUENTIAL,
|
||||
// ERROR
|
||||
//}
|
||||
|
||||
//@Argument(fullName="RecalibrationMode", shortName="mode", doc="which type of calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||
//public String MODE_STRING = RecalibrationMode.SEQUENTIAL.toString();
|
||||
public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||
@Argument(fullName="recalibrationMode", shortName="mode", doc="Which calculation to use when recalibrating, default is SEQUENTIAL", required=false)
|
||||
public String MODE_STRING = "SEQUENTIAL";
|
||||
//public RecalibrationMode MODE = RecalibrationMode.SEQUENTIAL; //BUGBUG: do we need to support the other modes?
|
||||
|
||||
protected RecalDataManager dataManager;
|
||||
protected ArrayList<Covariate> requestedCovariates;
|
||||
|
|
@ -165,7 +165,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "...done!" );
|
||||
|
||||
// Create the collapsed tables that are used in the sequential calculation
|
||||
if( MODE == RecalibrationMode.SEQUENTIAL ) {
|
||||
if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||
logger.info( "Creating collapsed tables for use in sequential calculation..." );
|
||||
dataManager.createCollapsedTables( requestedCovariates.size() );
|
||||
logger.info( "...done!" );
|
||||
|
|
@ -203,6 +203,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*/
|
||||
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
||||
|
||||
if( refBases == null ) {
|
||||
return read; // early return here, unmapped reads should be left alone
|
||||
}
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
// Check if we need to use the original quality scores instead
|
||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
|
|
@ -215,36 +219,37 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
// These calls are expensive so only do them once for each read
|
||||
String readGroup = read.getReadGroup().getReadGroupId();
|
||||
char[] bases = read.getReadString().toCharArray();
|
||||
String myRefBases = new String(refBases);
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray();
|
||||
myRefBases = BaseUtils.simpleComplement( myRefBases );
|
||||
}
|
||||
|
||||
|
||||
// For each base in the read
|
||||
for( int iii = 0; iii < read.getReadLength(); iii++ ) {
|
||||
for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last bases because there is no dinuc
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
boolean badKey = false;
|
||||
Comparable keyElement;
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
keyElement = covariate.getValue( read, iii, refBases );
|
||||
if( keyElement != Covariate.COVARIATE_ERROR ) { // COVARIATE_NULL is okay because the sequential calculation will do the best it can with the valid data it has
|
||||
key.add( keyElement ); // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator
|
||||
} else {
|
||||
badKey = true;
|
||||
}
|
||||
key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases, myRefBases.charAt(iii) ) ); // offset is zero based so passing iii is correct here // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator
|
||||
}
|
||||
|
||||
if( !badKey ) {
|
||||
if( MODE == RecalibrationMode.COMBINATORIAL ) {
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
|
||||
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
|
||||
}
|
||||
} else if( MODE == RecalibrationMode.SEQUENTIAL ) {
|
||||
recalQuals[iii] = performSequentialQualityCalculation( key );
|
||||
} else {
|
||||
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE );
|
||||
if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) {
|
||||
RecalDatum datum = dataManager.data.get( key );
|
||||
if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing
|
||||
recalQuals[iii] = datum.empiricalQualByte( SMOOTHING );
|
||||
}
|
||||
} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) {
|
||||
recalQuals[iii] = performSequentialQualityCalculation( key );
|
||||
} else {
|
||||
throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING );
|
||||
}
|
||||
|
||||
// Do some error checking on the new quality score
|
||||
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
|
||||
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
|
||||
}
|
||||
// Do some error checking on the new quality score
|
||||
if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) {
|
||||
throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -254,6 +259,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue