The refactored CountCovariates now hashes the read object into a HashMap which holds all the properties the covariates pull out of the read over and over again such as read group string, bases string and its complement string, quality scores, etc. This results in a big speed up. CountCovariatesRefactored is now just slightly slower than CountCovariates (perhaps 1.07x according to my latest time trial). Thanks to Alec for suggesting IdentityHashMap. CycleCovariate now warns the user that is is defaulting to the Solexa definition of cycle when the platform string pulled out of the read is unrecognized instead of halting with an Exception.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
27122f7f97
commit
98f921fe24
|
|
@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public interface Covariate {
|
public interface Covariate {
|
||||||
public Comparable getValue(SAMRecord read, int offset, String readGroup, String platform, byte[] quals, byte[] bases); // used to pick out the value from attributes of the read
|
public Comparable getValue(ReadHashDatum readDatum, int offset); // used to pick out the value from attributes of the read
|
||||||
public Comparable getValue(String str); // used to get value from input file
|
public Comparable getValue(String str); // used to get value from input file
|
||||||
public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap
|
public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -88,8 +88,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
|
|
||||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||||
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
private ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||||
//private HashMap<SAMRecord, String> readGroupHashMap; // A hash map that hashes the read object itself into the read group name
|
private IdentityHashMap<SAMRecord, ReadHashDatum> readDatumHashMap; // A hash map that hashes the read object itself into properties commonly pulled out of the read. Done for optimization purposes.
|
||||||
// This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive
|
private int sizeOfReadDatumHashMap = 0;
|
||||||
|
|
||||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||||
|
|
@ -204,7 +205,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
|
|
||||||
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception
|
||||||
dataManager = new RecalDataManager( estimatedCapacity );
|
dataManager = new RecalDataManager( estimatedCapacity );
|
||||||
//readGroupHashMap = new HashMap<SAMRecord, String>( 50000000, 0.97f );
|
readDatumHashMap = new IdentityHashMap<SAMRecord, ReadHashDatum>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -250,73 +251,93 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
byte prevBase;
|
byte prevBase;
|
||||||
String platform;
|
String platform;
|
||||||
byte[] colorSpaceQuals;
|
byte[] colorSpaceQuals;
|
||||||
|
ReadHashDatum readDatum;
|
||||||
|
boolean isNegStrand;
|
||||||
|
int mappingQuality;
|
||||||
|
int length;
|
||||||
|
|
||||||
final int numReads = reads.size();
|
final int numReads = reads.size();
|
||||||
// For each read at this locus
|
// For each read at this locus
|
||||||
for( int iii = 0; iii < numReads; iii++ ) {
|
for( int iii = 0; iii < numReads; iii++ ) {
|
||||||
read = reads.get(iii);
|
read = reads.get(iii);
|
||||||
|
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
|
||||||
|
|
||||||
//readGroupId = readGroupHashMap.get( read );
|
readDatum = readDatumHashMap.get( read );
|
||||||
//if( readGroupId == null ) { // read is not in the hashmap so add it
|
if( readDatum == null ) {
|
||||||
// readGroupId = read.getReadGroup().getReadGroupId();
|
|
||||||
// readGroupHashMap.put( read, readGroupId );
|
|
||||||
//}
|
|
||||||
|
|
||||||
if( read.getMappingQuality() > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
|
|
||||||
|
|
||||||
offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct
|
// If the HashMap of read objects has grown too large then throw out the (mostly stale) reads
|
||||||
|
if( sizeOfReadDatumHashMap > 100000 ) { //BUGBUG: Can I make this number larger?
|
||||||
|
readDatumHashMap.clear();
|
||||||
|
sizeOfReadDatumHashMap = 0;
|
||||||
|
}
|
||||||
|
|
||||||
// skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases
|
// This read isn't in the hashMap yet so fill out the datum and add it to the map so that we never have to do the work again
|
||||||
if( offset > 0 && offset < read.getReadLength() - 1 ) {
|
quals = read.getBaseQualities();
|
||||||
|
// Check if we need to use the original quality scores instead
|
||||||
quals = read.getBaseQualities();
|
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||||
// Check if we need to use the original quality scores instead
|
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||||
if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
if ( obj instanceof String )
|
||||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
quals = QualityUtils.fastqToPhred((String)obj);
|
||||||
if ( obj instanceof String )
|
else {
|
||||||
quals = QualityUtils.fastqToPhred((String)obj);
|
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||||
else {
|
|
||||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'. Is this true?
|
||||||
|
isNegStrand = read.getReadNegativeStrandFlag();
|
||||||
|
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||||
|
readGroupId = readGroup.getReadGroupId();
|
||||||
|
platform = readGroup.getPlatform();
|
||||||
|
mappingQuality = read.getMappingQuality();
|
||||||
|
length = bases.length;
|
||||||
|
if( USE_SLX_PLATFORM ) {
|
||||||
|
platform = "ILLUMINA";
|
||||||
|
}
|
||||||
|
|
||||||
// skip if base quality is zero
|
readDatum = new ReadHashDatum( readGroupId, platform, quals, bases, isNegStrand, mappingQuality, length );
|
||||||
if( quals[offset] > 0 ) {
|
readDatumHashMap.put( read, readDatum );
|
||||||
bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A'
|
sizeOfReadDatumHashMap++;
|
||||||
refBase = (byte)ref.getBase();
|
}
|
||||||
prevBase = bases[offset-1];
|
|
||||||
|
|
||||||
// Get the complement base strand if we are a negative strand read
|
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
|
||||||
bases = BaseUtils.simpleComplement( bases ); // this is an expensive call
|
|
||||||
refBase = (byte)BaseUtils.simpleComplement( ref.getBase() );
|
|
||||||
prevBase = bases[offset+1];
|
|
||||||
}
|
|
||||||
|
|
||||||
// skip if this base or the previous one was an 'N' or etc.
|
if( readDatum.mappingQuality > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests
|
||||||
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) {
|
|
||||||
|
|
||||||
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
// skip first and last base because there is no dinuc
|
||||||
readGroupId = readGroup.getReadGroupId();
|
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
|
||||||
platform = readGroup.getPlatform();
|
if( offset > 0 ) {
|
||||||
if( USE_SLX_PLATFORM ) {
|
if( offset < readDatum.length - 1 ) {
|
||||||
platform = "ILLUMINA";
|
// skip if base quality is zero
|
||||||
|
if( readDatum.quals[offset] > 0 ) {
|
||||||
|
|
||||||
|
refBase = (byte)ref.getBase();
|
||||||
|
prevBase = readDatum.bases[offset-1];
|
||||||
|
|
||||||
|
// Get the complement base strand if we are a negative strand read
|
||||||
|
if( readDatum.isNegStrand ) {
|
||||||
|
prevBase = readDatum.bases[offset+1];
|
||||||
}
|
}
|
||||||
|
|
||||||
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
|
// skip if this base or the previous one was an 'N' or etc.
|
||||||
colorSpaceQuals = null;
|
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(readDatum.bases[offset]) ) ) {
|
||||||
if( platform.equalsIgnoreCase("SOLID") ) {
|
|
||||||
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
// SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them
|
||||||
}
|
colorSpaceQuals = null;
|
||||||
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
|
if( readDatum.platform.equalsIgnoreCase("SOLID") ) {
|
||||||
{
|
colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||||
updateDataFromRead( read, offset, readGroupId, platform, quals, bases, refBase );
|
}
|
||||||
}
|
if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet
|
||||||
} else {
|
{
|
||||||
if( VALIDATE_OLD_RECALIBRATOR ) {
|
// This base finally passed all the checks, so add it to the big hashmap
|
||||||
countedBases++; // replicating a small bug in the old recalibrator
|
updateDataFromRead( readDatum, offset, refBase );
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if( VALIDATE_OLD_RECALIBRATOR ) {
|
||||||
|
countedBases++; // replicating a small bug in the old recalibrator
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else { // at the last base in the read so we can remove it from our IdentityHashMap
|
||||||
|
readDatumHashMap.remove( read );
|
||||||
|
sizeOfReadDatumHashMap--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -337,22 +358,17 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
* adding one to the number of observations and potentially one to the number of mismatches
|
* adding one to the number of observations and potentially one to the number of mismatches
|
||||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||||
* because pulling things out of the SAMRecord is an expensive operation.
|
* because pulling things out of the SAMRecord is an expensive operation.
|
||||||
* @param read The read
|
* @param readDatum The ReadHashDatum holding all the important properties of this read
|
||||||
* @param offset The offset in the read for this locus
|
* @param offset The offset in the read for this locus
|
||||||
* @param readGroup The read group the read is in
|
|
||||||
* @param platform The String that has the platform this read came from: Illumina, 454, or solid
|
|
||||||
* @param quals List of base quality scores
|
|
||||||
* @param bases The bases which make up the read
|
|
||||||
* @param refBase The reference base at this locus
|
* @param refBase The reference base at this locus
|
||||||
*/
|
*/
|
||||||
private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
private void updateDataFromRead(final ReadHashDatum readDatum, final int offset, final byte refBase) {
|
||||||
final byte[] quals, final byte[] bases, final byte refBase) {
|
|
||||||
|
|
||||||
List<Comparable> key = new ArrayList<Comparable>();
|
List<Comparable> key = new ArrayList<Comparable>();
|
||||||
|
|
||||||
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
// Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||||
for( Covariate covariate : requestedCovariates ) {
|
for( Covariate covariate : requestedCovariates ) {
|
||||||
key.add( covariate.getValue( read, offset, readGroup, platform, quals, bases ) );
|
key.add( covariate.getValue( readDatum, offset ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||||
|
|
@ -367,7 +383,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Need the bases to determine whether or not we have a mismatch
|
// Need the bases to determine whether or not we have a mismatch
|
||||||
byte base = bases[offset];
|
byte base = readDatum.bases[offset];
|
||||||
|
|
||||||
// Add one to the number of observations and potentially one to the number of mismatches
|
// Add one to the number of observations and potentially one to the number of mismatches
|
||||||
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad
|
||||||
|
|
@ -408,9 +424,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
* @param recalTableStream The PrintStream to write out to
|
* @param recalTableStream The PrintStream to write out to
|
||||||
*/
|
*/
|
||||||
public void onTraversalDone( PrintStream recalTableStream ) {
|
public void onTraversalDone( PrintStream recalTableStream ) {
|
||||||
out.print( "Writing raw recalibration data..." );
|
logger.info( "Writing raw recalibration data..." );
|
||||||
outputToCSV( recalTableStream );
|
outputToCSV( recalTableStream );
|
||||||
out.println( "...done!" );
|
logger.info( "...done!" );
|
||||||
|
|
||||||
recalTableStream.close();
|
recalTableStream.close();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
|
@ -44,37 +45,42 @@ import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
public class CycleCovariate implements Covariate {
|
public class CycleCovariate implements Covariate {
|
||||||
|
|
||||||
|
private static boolean warnedUserNoPlatform = false;
|
||||||
|
|
||||||
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
if( readDatum.platform.equalsIgnoreCase( "ILLUMINA" ) ) {
|
||||||
if( platform.equalsIgnoreCase( "ILLUMINA" ) ) {
|
|
||||||
int cycle = offset;
|
int cycle = offset;
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
if( readDatum.isNegStrand ) {
|
||||||
cycle = bases.length - (offset + 1);
|
cycle = readDatum.bases.length - (offset + 1);
|
||||||
}
|
}
|
||||||
return cycle;
|
return cycle;
|
||||||
} else if( platform.contains( "454" ) ) { // some bams have "LS454" and others have just "454"
|
} else if( readDatum.platform.contains( "454" ) ) { // some bams have "LS454" and others have just "454"
|
||||||
int cycle = 0;
|
int cycle = 0;
|
||||||
byte prevBase = bases[0];
|
byte prevBase = readDatum.bases[0];
|
||||||
for( int iii = 1; iii <= offset; iii++ ) {
|
for( int iii = 1; iii <= offset; iii++ ) {
|
||||||
if(bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle
|
if(readDatum.bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle
|
||||||
cycle++;
|
cycle++;
|
||||||
prevBase = bases[iii];
|
prevBase = readDatum.bases[iii];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return cycle;
|
return cycle;
|
||||||
} else if( platform.equalsIgnoreCase( "SOLID" ) ) {
|
} else if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
|
||||||
// the ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
// the ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||||
return offset / 5; // integer division
|
return offset / 5; // integer division
|
||||||
} else {
|
} else { // platform is unrecognized so revert to Illumina definition of cycle but warn the user
|
||||||
throw new StingException( "Platform in read (" + platform + ") is not supported in CycleCovariate. Read = " + read );
|
if( !warnedUserNoPlatform ) {
|
||||||
|
Utils.warnUser( "Platform (" + readDatum.platform + ") unrecognized. Reverting to Illumina definition of machine cycle." );
|
||||||
|
warnedUserNoPlatform = true;
|
||||||
|
}
|
||||||
|
return PositionCovariate.revertToPositionAsCycle( readDatum, offset );
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
*
|
*
|
||||||
|
|
@ -54,16 +56,17 @@ public class DinucCovariate implements Covariate {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
|
||||||
|
|
||||||
byte base = bases[offset];
|
byte base;
|
||||||
byte prevBase;
|
byte prevBase;
|
||||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
if( readDatum.isNegStrand ) {
|
||||||
prevBase = bases[offset + 1];
|
base = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset] );
|
||||||
|
prevBase = (byte)BaseUtils.simpleComplement( (char)readDatum.bases[offset + 1] );
|
||||||
} else {
|
} else {
|
||||||
prevBase = bases[offset - 1];
|
base = readDatum.bases[offset];
|
||||||
|
prevBase = readDatum.bases[offset - 1];
|
||||||
}
|
}
|
||||||
//char[] charArray = {(char)prevBase, (char)base};
|
//char[] charArray = {(char)prevBase, (char)base};
|
||||||
//return new String( charArray ); // This is an expensive call
|
//return new String( charArray ); // This is an expensive call
|
||||||
|
|
@ -71,7 +74,7 @@ public class DinucCovariate implements Covariate {
|
||||||
//return String.format("%c%c", prevBase, base); // This return statement is too slow
|
//return String.format("%c%c", prevBase, base); // This return statement is too slow
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
//return str;
|
//return str;
|
||||||
return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
|
return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -40,13 +40,12 @@ public class MappingQualityCovariate implements Covariate {
|
||||||
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
|
||||||
|
|
||||||
return read.getMappingQuality();
|
return readDatum.mappingQuality;
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -48,22 +48,21 @@ public class MinimumNQSCovariate implements Covariate {
|
||||||
windowReach = windowSize / 2; // integer division
|
windowReach = windowSize / 2; // integer division
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
|
||||||
|
|
||||||
// Loop over the list of base quality scores in the window and find the minimum
|
// Loop over the list of base quality scores in the window and find the minimum
|
||||||
int minQual = quals[offset];
|
int minQual = readDatum.quals[offset];
|
||||||
int minIndex = Math.max(offset - windowReach, 0);
|
int minIndex = Math.max(offset - windowReach, 0);
|
||||||
int maxIndex = Math.min(offset + windowReach, quals.length - 1);
|
int maxIndex = Math.min(offset + windowReach, readDatum.quals.length - 1);
|
||||||
for ( int iii = minIndex; iii < maxIndex; iii++ ) {
|
for ( int iii = minIndex; iii < maxIndex; iii++ ) {
|
||||||
if( quals[iii] < minQual ) {
|
if( readDatum.quals[iii] < minQual ) {
|
||||||
minQual = quals[iii];
|
minQual = readDatum.quals[iii];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return minQual;
|
return minQual;
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2009 The Broad Institute
|
* Copyright (c) 2009 The Broad Institute
|
||||||
|
|
@ -41,16 +42,23 @@ public class PositionCovariate implements Covariate {
|
||||||
public PositionCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public PositionCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
|
||||||
int cycle = offset;
|
int cycle = offset;
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
if( readDatum.isNegStrand ) {
|
||||||
cycle = bases.length - (offset + 1);
|
cycle = readDatum.bases.length - (offset + 1);
|
||||||
}
|
}
|
||||||
return cycle;
|
return cycle;
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public static Comparable revertToPositionAsCycle( final ReadHashDatum readDatum, final int offset ) { // called from CycleCovariate if platform was unrecognized
|
||||||
|
int cycle = offset;
|
||||||
|
if( readDatum.isNegStrand ) {
|
||||||
|
cycle = readDatum.bases.length - (offset + 1);
|
||||||
|
}
|
||||||
|
return cycle;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -44,9 +44,8 @@ public class PrimerRoundCovariate implements Covariate {
|
||||||
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public PrimerRoundCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
if( readDatum.platform.equalsIgnoreCase( "SOLID" ) ) {
|
||||||
if( platform.equalsIgnoreCase( "SOLID" ) ) {
|
|
||||||
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
return offset % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||||
} else {
|
} else {
|
||||||
return 1; // nothing to do here because it is always the same
|
return 1; // nothing to do here because it is always the same
|
||||||
|
|
@ -54,7 +53,7 @@ public class PrimerRoundCovariate implements Covariate {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,13 +40,12 @@ public class QualityScoreCovariate implements Covariate {
|
||||||
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
|
||||||
|
|
||||||
return (int)quals[offset];
|
return (int)(readDatum.quals[offset]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
return (int)Integer.parseInt( str ); // cast to primitive int (as opposed to Integer Object) is required so that the return value from the two getValue methods hash to same thing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,12 +40,11 @@ public class ReadGroupCovariate implements Covariate{
|
||||||
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, final String platform,
|
public final Comparable getValue( final ReadHashDatum readDatum, final int offset ) {
|
||||||
final byte[] quals, final byte[] bases) {
|
return readDatum.readGroup;
|
||||||
return readGroup;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Comparable getValue(final String str) {
|
public final Comparable getValue( final String str ) {
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,26 @@
|
||||||
|
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: rpoplin
|
||||||
|
* Date: Nov 14, 2009
|
||||||
|
*/
|
||||||
|
public class ReadHashDatum {
|
||||||
|
public String readGroup;
|
||||||
|
public String platform;
|
||||||
|
public byte[] quals;
|
||||||
|
public byte[] bases;
|
||||||
|
public boolean isNegStrand;
|
||||||
|
public int mappingQuality;
|
||||||
|
public int length;
|
||||||
|
|
||||||
|
public ReadHashDatum(String _readGroup, String _platform, byte[] _quals, byte[] _bases, boolean _isNegStrand, int _mappingQuality, int _length) {
|
||||||
|
readGroup = _readGroup;
|
||||||
|
platform = _platform;
|
||||||
|
quals = _quals;
|
||||||
|
bases = _bases;
|
||||||
|
isNegStrand = _isNegStrand;
|
||||||
|
mappingQuality = _mappingQuality;
|
||||||
|
length = _length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||||
|
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
import net.sf.samtools.SAMFileWriter;
|
import net.sf.samtools.SAMFileWriter;
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||||
|
|
@ -126,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if( foundDBSNP ) {
|
if( foundDBSNP ) {
|
||||||
Utils.warnUser("A dbSNP rod file was specified but this walker doesn't make use of it.");
|
Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read in the covariates that were used from the input file
|
// Read in the covariates that were used from the input file
|
||||||
|
|
@ -193,7 +194,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
} catch ( FileNotFoundException e ) {
|
} catch ( FileNotFoundException e ) {
|
||||||
Utils.scareUser("Can not find input file: " + RECAL_FILE);
|
Utils.scareUser("Can not find input file: " + RECAL_FILE);
|
||||||
} catch ( NumberFormatException e ) {
|
} catch ( NumberFormatException e ) {
|
||||||
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Was your table generated by CountCovariatesRefactored?");
|
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table generated by an older version of CovariateCounterWalker.");
|
||||||
}
|
}
|
||||||
logger.info( "...done!" );
|
logger.info( "...done!" );
|
||||||
|
|
||||||
|
|
@ -265,30 +266,31 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
}
|
}
|
||||||
byte[] recalQuals = originalQuals.clone();
|
byte[] recalQuals = originalQuals.clone();
|
||||||
|
|
||||||
// These calls are expensive so only do them once for each read
|
// These calls are all expensive so only do them once for each read
|
||||||
String readGroup = read.getReadGroup().getReadGroupId();
|
final SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||||
String platform = read.getReadGroup().getPlatform();
|
final String readGroupId = readGroup.getReadGroupId();
|
||||||
|
String platform = readGroup.getPlatform();
|
||||||
|
final boolean isNegStrand = read.getReadNegativeStrandFlag();
|
||||||
if( USE_SLX_PLATFORM ) {
|
if( USE_SLX_PLATFORM ) {
|
||||||
platform = "ILLUMINA";
|
platform = "ILLUMINA";
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] bases = read.getReadBases();
|
byte[] bases = read.getReadBases();
|
||||||
int startPos = 1;
|
int startPos = 1;
|
||||||
int stopPos = read.getReadLength();
|
int stopPos = bases.length;
|
||||||
|
|
||||||
if( read.getReadNegativeStrandFlag() ) {
|
if( isNegStrand ) { // DinucCovariate is responsible for getting the complement base if needed
|
||||||
bases = BaseUtils.simpleComplement( bases );
|
|
||||||
startPos = 0;
|
startPos = 0;
|
||||||
stopPos = read.getReadLength() - 1;
|
stopPos = bases.length - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ReadHashDatum readDatum = new ReadHashDatum( readGroupId, platform, originalQuals, bases, isNegStrand, read.getMappingQuality(), bases.length );
|
||||||
|
|
||||||
// For each base in the read
|
// For each base in the read
|
||||||
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
|
for( int iii = startPos; iii < stopPos; iii++ ) { // skip first or last base because there is no dinuc depending on the direction of the read
|
||||||
List<Comparable> key = new ArrayList<Comparable>();
|
List<Comparable> key = new ArrayList<Comparable>();
|
||||||
// Get the covariate values which make up the key
|
// Get the covariate values which make up the key
|
||||||
for( Covariate covariate : requestedCovariates ) {
|
for( Covariate covariate : requestedCovariates ) {
|
||||||
key.add( covariate.getValue( read, iii, readGroup, platform, originalQuals, bases ) ); // offset is zero based so passing iii is correct here
|
key.add( covariate.getValue( readDatum, iii ) ); // offset is zero based so passing iii is correct here
|
||||||
}
|
}
|
||||||
|
|
||||||
recalQuals[iii] = performSequentialQualityCalculation( key );
|
recalQuals[iii] = performSequentialQualityCalculation( key );
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue