Optimization - Added method to Covariates: void getValues( SAMRecord read, Comparable[] comparable ) which takes an array of size (at least) read.getReadLength() and fills it with covariate values for all positions in the given read. Made CovariateCounterWalker and TableRecalibrationWalker use this method instead of calling getValue(..) for each covariate and each offset.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2863 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
alecw 2010-02-22 17:35:25 +00:00
parent 32d14d988e
commit b236714c8a
18 changed files with 524 additions and 138 deletions

View File

@ -41,6 +41,9 @@ public interface Covariate {
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
//read-specific calculations to be done just once rather than for each offset.
} }
interface RequiredCovariate extends Covariate { interface RequiredCovariate extends Covariate {

View File

@ -1,23 +1,35 @@
package org.broadinstitute.sting.gatk.walkers.recalibration; package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.gatk.walkers.*; import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.walkers.ReadFilters;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.NestedHashMap;
import org.broadinstitute.sting.utils.PackageUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.Variation; import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import java.io.PrintStream; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.FileNotFoundException;
import java.util.*;
import net.sf.samtools.SAMRecord;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
@ -68,6 +80,13 @@ import net.sf.samtools.SAMRecord;
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> { public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
/////////////////////////////
// Constants
/////////////////////////////
private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped.
private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed.
private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord.
///////////////////////////// /////////////////////////////
// Shared Arguments // Shared Arguments
///////////////////////////// /////////////////////////////
@ -252,39 +271,54 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) { if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) {
numUnprocessed = 0; // Reset the counter because we are processing this very locus numUnprocessed = 0; // Reset the counter because we are processing this very locus
SAMRecord read; GATKSAMRecord gatkRead;
int offset; int offset;
byte refBase; byte refBase;
byte[] bases; byte[] bases;
// For each read at this locus // For each read at this locus
for( PileupElement p : context.getBasePileup() ) { for( PileupElement p : context.getBasePileup() ) {
read = p.getRead(); gatkRead = (GATKSAMRecord) p.getRead();
offset = p.getOffset(); offset = p.getOffset();
RecalDataManager.parseSAMRecord( read, RAC ); if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
continue;
}
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
{
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
RecalDataManager.parseSAMRecord( gatkRead, RAC );
// Skip over reads with no calls in the color space if the user requested it // Skip over reads with no calls in the color space if the user requested it
if( !RAC.IGNORE_NOCALL_COLORSPACE || !RecalDataManager.checkNoCallColorSpace( read ) ) { if( RAC.IGNORE_NOCALL_COLORSPACE && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
RecalDataManager.parseColorSpace( read ); gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
continue;
}
// Skip if base quality is zero RecalDataManager.parseColorSpace( gatkRead );
if( read.getBaseQualities()[offset] > 0 ) { gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
RecalDataManager.computeCovariates( gatkRead, requestedCovariates ));
}
bases = read.getReadBases();
// Skip this position if base quality is zero
if( gatkRead.getBaseQualities()[offset] > 0 ) {
bases = gatkRead.getReadBases();
refBase = (byte)ref.getBase(); refBase = (byte)ref.getBase();
// Skip if this base is an 'N' or etc. // Skip if this base is an 'N' or etc.
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap // This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase ); updateDataFromRead( gatkRead, offset, refBase );
} else { // calculate SOLID reference insertion rate } else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) { if( refBase == (char)bases[offset] ) {
solidInsertedReferenceBases++; solidInsertedReferenceBases++;
} else { } else {
otherColorSpaceInconsistency++; otherColorSpaceInconsistency++;
@ -293,7 +327,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
} }
} }
} }
}
countedSites++; countedSites++;
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus } else { // We skipped over the dbSNP site, and we are only processing every Nth locus
skippedSites++; skippedSites++;
@ -311,6 +344,8 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
return 1; // This value isn't actually used anywhere return 1; // This value isn't actually used anywhere
} }
/** /**
* Update the mismatch / total_base counts for a given class of loci. * Update the mismatch / total_base counts for a given class of loci.
* *
@ -358,29 +393,26 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* adding one to the number of observations and potentially one to the number of mismatches * adding one to the number of observations and potentially one to the number of mismatches
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
* because pulling things out of the SAMRecord is an expensive operation. * because pulling things out of the SAMRecord is an expensive operation.
* @param read The SAMRecord holding all the data for this read * @param gatkRead The SAMRecord holding all the data for this read
* @param offset The offset in the read for this locus * @param offset The offset in the read for this locus
* @param refBase The reference base at this locus * @param refBase The reference base at this locus
*/ */
private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) { private void updateDataFromRead(final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
final Object[] key = new Object[requestedCovariates.size()];
// Loop through the list of requested covariates and pick out the value from the read and offset final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
int iii = 0; final Object[] key = covars[offset];
for( Covariate covariate : requestedCovariates ) {
key[iii++] = covariate.getValue( read, offset );
}
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key ); final NestedHashMap data = dataManager.data; //optimization - create local reference
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method
dataManager.data.put( datum, (Object[])key ); data.put( datum, (Object[])key );
} }
// Need the bases to determine whether or not we have a mismatch // Need the bases to determine whether or not we have a mismatch
final byte base = read.getReadBases()[offset]; final byte base = gatkRead.getReadBases()[offset];
final long curMismatches = datum.getNumMismatches(); final long curMismatches = datum.getNumMismatches();
// Add one to the number of observations and potentially one to the number of mismatches // Add one to the number of observations and potentially one to the number of mismatches

View File

@ -1,10 +1,11 @@
package org.broadinstitute.sting.gatk.walkers.recalibration; package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -158,6 +159,106 @@ public class CycleCovariate implements StandardCovariate {
return cycle; return cycle;
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
//-----------------------------
// ILLUMINA and SOLID
//-----------------------------
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX"
read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID"
final int init;
final int increment;
if( !read.getReadNegativeStrandFlag() ) {
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
//second of pair, positive strand
init = -1;
increment = -1;
}
else
{
//first of pair, positive strand
init = 1;
increment = 1;
}
} else {
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
//second of pair, negative strand
init = -read.getReadLength();
increment = 1;
}
else
{
//first of pair, negative strand
init = read.getReadLength();
increment = -1;
}
}
int cycle = init;
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = cycle;
cycle += increment;
}
}
else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
final int readLength = read.getReadLength();
final byte[] bases = read.getReadBases();
// Differentiate between first and second of pair.
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
// Therefore the cycle covariate must differentiate between first and second of pair reads.
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
// the current sequential model would consider the effects independently instead of jointly.
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
int cycle = multiplyByNegative1 ? -1 : 1;
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
int iii = 0;
while( iii < readLength )
{
while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; }
while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; }
while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; }
while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; }
if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; }
if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; }
}
} else { // Negative direction
int iii = readLength-1;
while( iii >= 0 )
{
while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; }
while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; }
while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; }
while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; }
if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; }
if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; }
}
}
}
else {
throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform());
}
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
return Integer.parseInt( str ); return Integer.parseInt( str );

View File

@ -61,7 +61,7 @@ public class Dinuc implements Comparable<Dinuc>{
} }
public static int hashBytes(final byte byte1, final byte byte2) { public static int hashBytes(final byte byte1, final byte byte2) {
return byte1 << 16 + byte2 << 4; return byte1 << 8 + byte2;
} }
public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file

View File

@ -2,9 +2,10 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import java.util.HashMap; import java.util.HashMap;
import org.broadinstitute.sting.utils.BaseUtils;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
* *
@ -45,7 +46,7 @@ public class DinucCovariate implements StandardCovariate {
private static final byte NO_CALL = (byte)'N'; private static final byte NO_CALL = (byte)'N';
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
HashMap<Integer, Dinuc> dinucHashMap; private HashMap<Integer, Dinuc> dinucHashMap;
// Initialize any member variables using the command-line arguments passed to the walkers // Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) { public void initialize( final RecalibrationArgumentCollection RAC ) {
@ -93,6 +94,45 @@ public class DinucCovariate implements StandardCovariate {
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
} }
/**
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
*/
public void getValues( SAMRecord read, Comparable[] result ) {
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
final int readLength = read.getReadLength();
final boolean negativeStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
byte base;
byte prevBase;
int offset = 0;
// If this is a negative strand read then we need to reverse the direction for our previous base
if(negativeStrand) {
bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
}
result[0] = NO_DINUC; // No dinuc at the beginning of the read
prevBase = bases[0];
offset++;
while(offset < readLength) {
// Note: We are using the previous base in the read, not the
// previous base in the reference. This is done in part to be consistent with unmapped reads.
base = bases[offset];
if( BaseUtils.isRegularBase( prevBase ) ) {
result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) );
} else {
result[offset] = NO_DINUC;
}
offset++;
prevBase = base;
}
if(negativeStrand) {
reverse( result );
}
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
@ -100,7 +140,20 @@ public class DinucCovariate implements StandardCovariate {
return null; return null;
} }
return returnDinuc; return returnDinuc;
} }
/**
* Reverses the given array in place.
*
* @param array
*/
private static final void reverse(final Comparable[] array) {
final int arrayLength = array.length;
for(int l = 0, r = arrayLength - 1; l < r; l++, r--) {
final Comparable temp = array[l];
array[l] = array[r];
array[r] = temp;
}
}
} }

View File

@ -76,9 +76,15 @@ public class GCContentCovariate implements ExperimentalCovariate {
} }
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
} }

View File

@ -90,6 +90,10 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
return numAgree; return numAgree;
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
return Integer.parseInt( str ); return Integer.parseInt( str );

View File

@ -51,4 +51,8 @@ public class MappingQualityCovariate implements ExperimentalCovariate {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
} }

View File

@ -66,4 +66,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
} }

View File

@ -56,4 +56,8 @@ public class PositionCovariate implements ExperimentalCovariate {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
} }

View File

@ -62,4 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
throw new IllegalStateException("Not yet implemented");
}
} }

View File

@ -46,6 +46,13 @@ public class QualityScoreCovariate implements RequiredCovariate {
return (int)(read.getBaseQualities()[offset]); return (int)(read.getBaseQualities()[offset]);
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
byte[] baseQualities = read.getBaseQualities();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = (int) baseQualities[i];
}
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
return Integer.parseInt( str ); return Integer.parseInt( str );

View File

@ -48,6 +48,13 @@ public class ReadGroupCovariate implements RequiredCovariate{
return read.getReadGroup().getReadGroupId(); return read.getReadGroup().getReadGroupId();
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
final String readGroupId = read.getReadGroup().getReadGroupId();
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = readGroupId;
}
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker // Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) { public final Comparable getValue( final String str ) {
return str; return str;

View File

@ -547,4 +547,36 @@ public class RecalDataManager {
return false; return false;
} }
} }
/**
* Computes all requested covariates for every offset in the given read
* by calling covariate.getValues(..).
*
* @param gatkRead The read for which to compute covariate values.
* @param requestedCovariates The list of requested covariates.
* @return An array of covariate values where result[i][j] is the covariate
* value for the ith position in the read and the jth covariate in
* reqeustedCovariates list.
*/
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
//compute all covariates for this read
final List<Covariate> requestedCovariatesRef = requestedCovariates;
final int numRequestedCovariates = requestedCovariatesRef.size();
final int readLength = gatkRead.getReadLength();
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
for( int i = 0; i < numRequestedCovariates; i++ ) {
requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder );
for(int j = 0; j < readLength; j++) {
//copy values into a 2D array that allows all covar types to be extracted at once for
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
}
}
return covariateValues_offset_x_covar;
}
} }

View File

@ -1,24 +1,41 @@
package org.broadinstitute.sting.gatk.walkers.recalibration; package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.*; import java.io.File;
import net.sf.samtools.util.SequenceUtil; import java.io.FileNotFoundException;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import java.lang.reflect.Field;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.utils.cmdLine.*;
import org.broadinstitute.sting.utils.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.ResourceBundle; import java.util.ResourceBundle;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException; import net.sf.samtools.SAMFileHeader;
import java.lang.reflect.Field; import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMProgramRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMTag;
import net.sf.samtools.util.SequenceUtil;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.JVMUtils;
import org.broadinstitute.sting.utils.NestedHashMap;
import org.broadinstitute.sting.utils.PackageUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.TextFormattingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection;
import org.broadinstitute.sting.utils.cmdLine.ArgumentDefinition;
import org.broadinstitute.sting.utils.cmdLine.ArgumentSource;
import org.broadinstitute.sting.utils.cmdLine.ArgumentTypeDescriptor;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
@ -33,7 +50,7 @@ import java.lang.reflect.Field;
* conditions: * conditions:
* *
* The above copyright notice and this permission notice shall be * The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
@ -100,6 +117,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams
private long numReadsWithMalformedColorSpace = 0; private long numReadsWithMalformedColorSpace = 0;
/////////////////////////////
// Optimization
/////////////////////////////
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
// initialize // initialize
@ -277,7 +300,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
// Add that datum to all the collapsed tables which will be used in the sequential calculation // Add that datum to all the collapsed tables which will be used in the sequential calculation
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -311,19 +333,23 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases ); originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
} }
final Object[] fullCovariateKey = new Object[requestedCovariates.size()]; //compute all covariate values for this read
final Comparable[][] covariateValues_offset_x_covar =
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
// For each base in the read // For each base in the read
final int readLength = read.getReadLength(); for( int offset = 0; offset < read.getReadLength(); offset++ ) {
for( int offset = 0; offset < readLength; offset++ ) {
// Loop through the list of requested covariates and pick out the value from the read and offset final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
int iii = 0;
for( Covariate covariate : requestedCovariates ) { Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
fullCovariateKey[iii++] = covariate.getValue( read, offset ); if(qualityScore == null)
{
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
} }
recalQuals[offset] = performSequentialQualityCalculation( fullCovariateKey ); recalQuals[offset] = qualityScore;
} }
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low

View File

@ -59,4 +59,10 @@ public class TileCovariate implements ExperimentalCovariate {
return Integer.parseInt( str ); return Integer.parseInt( str );
} }
public void getValues(SAMRecord read, Comparable[] comparable) {
final Comparable tileCovariateValue = getValue(read, 0);
for(int i = 0; i < read.getReadLength(); i++) {
comparable[i] = tileCovariateValue;
}
}
} }

View File

@ -39,8 +39,9 @@ public class NestedHashMap{
public Object get( final Object... keys ) { public Object get( final Object... keys ) {
Map map = this.data; Map map = this.data;
for( int iii = 0; iii < keys.length; iii++ ) { final int keysLength = keys.length;
if( iii == keys.length - 1 ) { for( int iii = 0; iii < keysLength; iii++ ) {
if( iii == keysLength - 1 ) {
return map.get(keys[iii]); return map.get(keys[iii]);
} else { } else {
map = (Map) map.get(keys[iii]); map = (Map) map.get(keys[iii]);
@ -54,8 +55,9 @@ public class NestedHashMap{
public void put( final Object value, final Object... keys ) { public void put( final Object value, final Object... keys ) {
Map map = this.data; Map map = this.data;
for( int iii = 0; iii < keys.length; iii++ ) { final int keysLength = keys.length;
if( iii == keys.length - 1 ) { for( int iii = 0; iii < keysLength; iii++ ) {
if( iii == keysLength - 1 ) {
map.put(keys[iii], value); map.put(keys[iii], value);
} else { } else {
Map tmp = (Map) map.get(keys[iii]); Map tmp = (Map) map.get(keys[iii]);

View File

@ -1,8 +1,15 @@
package org.broadinstitute.sting.utils.sam; package org.broadinstitute.sting.utils.sam;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import net.sf.samtools.*; import net.sf.samtools.AlignmentBlock;
import net.sf.samtools.Cigar;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMValidationError;
/** /**
* @author ebanks * @author ebanks
@ -19,23 +26,33 @@ import net.sf.samtools.*;
public class GATKSAMRecord extends SAMRecord { public class GATKSAMRecord extends SAMRecord {
// the underlying SAMRecord which we are wrapping // the underlying SAMRecord which we are wrapping
private SAMRecord mRecord; private final SAMRecord mRecord;
// the SAMRecord data we're caching // the SAMRecord data we're caching
private String mReadString = null; private String mReadString = null;
private SAMReadGroupRecord mReadGroup = null; private SAMReadGroupRecord mReadGroup = null;
private Boolean mNegativeStrandFlag = null; private boolean mNegativeStrandFlag;
private Boolean mUnmappedFlag = null; private boolean mUnmappedFlag;
private Boolean mSecondOfPairFlag = null;
// because some values can be null, we don't want to duplicate effort // because some values can be null, we don't want to duplicate effort
private boolean retrievedReadGroup = false; private boolean retrievedReadGroup = false;
// These temporary attributes were added here to make life easier for
// certain algorithms by providing a way to label or attach arbitrary data to
// individual GATKSAMRecords.
// These attributes exist in memory only, and are never written to disk.
private Map<Object, Object> temporaryAttributes;
public GATKSAMRecord(SAMRecord record) { public GATKSAMRecord(SAMRecord record) {
super(null); // it doesn't matter - this isn't used super(null); // it doesn't matter - this isn't used
if ( record == null ) if ( record == null )
throw new IllegalArgumentException("The SAMRecord argument cannot be null"); throw new IllegalArgumentException("The SAMRecord argument cannot be null");
mRecord = record; mRecord = record;
mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag();
mUnmappedFlag = mRecord.getReadUnmappedFlag();
// because attribute methods are declared to be final (and we can't overload them), // because attribute methods are declared to be final (and we can't overload them),
// we need to actually set all of the attributes here // we need to actually set all of the attributes here
@ -79,8 +96,6 @@ public class GATKSAMRecord extends SAMRecord {
} }
public boolean getReadUnmappedFlag() { public boolean getReadUnmappedFlag() {
if ( mUnmappedFlag == null )
mUnmappedFlag = mRecord.getReadUnmappedFlag();
return mUnmappedFlag; return mUnmappedFlag;
} }
@ -90,8 +105,6 @@ public class GATKSAMRecord extends SAMRecord {
} }
public boolean getReadNegativeStrandFlag() { public boolean getReadNegativeStrandFlag() {
if ( mNegativeStrandFlag == null )
mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag();
return mNegativeStrandFlag; return mNegativeStrandFlag;
} }
@ -100,6 +113,90 @@ public class GATKSAMRecord extends SAMRecord {
mNegativeStrandFlag = b; mNegativeStrandFlag = b;
} }
public boolean getSecondOfPairFlag() {
if( mSecondOfPairFlag == null ) {
//not done in constructor because this method can't be called for
//all SAMRecords.
mSecondOfPairFlag = mRecord.getSecondOfPairFlag();
}
return mSecondOfPairFlag;
}
public void setSecondOfPairFlag(boolean b) {
mRecord.setSecondOfPairFlag(b);
mSecondOfPairFlag = b;
}
/**
* Checks whether an attribute has been set for the given key.
*
* Temporary attributes provide a way to label or attach arbitrary data to
* individual GATKSAMRecords. These attributes exist in memory only,
* and are never written to disk.
*
* @param key
* @return True if an attribute has been set for this key.
*/
public boolean containsTemporaryAttribute(Object key) {
if(temporaryAttributes != null) {
return temporaryAttributes.containsKey(key);
}
return false;
}
/**
* Sets the key to the given value, replacing any previous value. The previous
* value is returned.
*
* Temporary attributes provide a way to label or attach arbitrary data to
* individual GATKSAMRecords. These attributes exist in memory only,
* and are never written to disk.
*
* @param key
* @param value
*/
public Object setTemporaryAttribute(Object key, Object value) {
if(temporaryAttributes == null) {
temporaryAttributes = new HashMap<Object, Object>();
}
return temporaryAttributes.put(key, value);
}
/**
* Looks up the value associated with the given key.
*
* Temporary attributes provide a way to label or attach arbitrary data to
* individual GATKSAMRecords. These attributes exist in memory only,
* and are never written to disk.
*
* @param key
* @return The value, or null.
*/
public Object getTemporaryAttribute(Object key) {
if(temporaryAttributes != null) {
return temporaryAttributes.get(key);
}
return null;
}
/**
* Removes the attribute that has the given key.
*
* Temporary attributes provide a way to label or attach arbitrary data to
* individual GATKSAMRecords. These attributes exist in memory only,
* and are never written to disk.
*
* @param key
* @return The value that was associated with this key, or null.
*/
public Object removeTemporaryAttribute(Object key) {
if(temporaryAttributes != null) {
return temporaryAttributes.remove(key);
}
return null;
}
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// *** The following methods are final and cannot be overridden ***// // *** The following methods are final and cannot be overridden ***//
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@ -209,8 +306,6 @@ public class GATKSAMRecord extends SAMRecord {
public boolean getFirstOfPairFlag() { return mRecord.getFirstOfPairFlag(); } public boolean getFirstOfPairFlag() { return mRecord.getFirstOfPairFlag(); }
public boolean getSecondOfPairFlag() { return mRecord.getSecondOfPairFlag(); }
public boolean getNotPrimaryAlignmentFlag() { return mRecord.getNotPrimaryAlignmentFlag(); } public boolean getNotPrimaryAlignmentFlag() { return mRecord.getNotPrimaryAlignmentFlag(); }
public boolean getReadFailsVendorQualityCheckFlag() { return mRecord.getReadFailsVendorQualityCheckFlag(); } public boolean getReadFailsVendorQualityCheckFlag() { return mRecord.getReadFailsVendorQualityCheckFlag(); }
@ -227,8 +322,6 @@ public class GATKSAMRecord extends SAMRecord {
public void setFirstOfPairFlag(boolean b) { mRecord.setFirstOfPairFlag(b); } public void setFirstOfPairFlag(boolean b) { mRecord.setFirstOfPairFlag(b); }
public void setSecondOfPairFlag(boolean b) { mRecord.setSecondOfPairFlag(b); }
public void setNotPrimaryAlignmentFlag(boolean b) { mRecord.setNotPrimaryAlignmentFlag(b); } public void setNotPrimaryAlignmentFlag(boolean b) { mRecord.setNotPrimaryAlignmentFlag(b); }
public void setReadFailsVendorQualityCheckFlag(boolean b) { mRecord.setReadFailsVendorQualityCheckFlag(b); } public void setReadFailsVendorQualityCheckFlag(boolean b) { mRecord.setReadFailsVendorQualityCheckFlag(b); }