diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 70b8d5fcf..996528a6c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -41,6 +41,9 @@ public interface Covariate { public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate + //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows + //read-specific calculations to be done just once rather than for each offset. } interface RequiredCovariate extends Covariate { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 2c86b835e..db3b7fae9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -1,23 +1,35 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.gatk.walkers.*; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.NestedHashMap; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.genotype.Variation; - -import java.io.PrintStream; -import java.io.FileNotFoundException; -import java.util.*; - -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -68,6 +80,13 @@ import net.sf.samtools.SAMRecord; @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { + ///////////////////////////// + // Constants + ///////////////////////////// + private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped. + private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed. + private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord. + ///////////////////////////// // Shared Arguments ///////////////////////////// @@ -139,7 +158,7 @@ public class CovariateCounterWalker extends LocusWalker { out.println( covClass.getSimpleName() ); } out.println(); - + System.exit( 0 ); // Early exit here because user requested it } @@ -252,43 +271,57 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) { numUnprocessed = 0; // Reset the counter because we are processing this very locus - SAMRecord read; + GATKSAMRecord gatkRead; int offset; byte refBase; byte[] bases; // For each read at this locus for( PileupElement p : context.getBasePileup() ) { - read = p.getRead(); + gatkRead = (GATKSAMRecord) p.getRead(); offset = p.getOffset(); - RecalDataManager.parseSAMRecord( read, RAC ); + if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { + continue; + } - // Skip over reads with no calls in the color space if the user requested it - if( !RAC.IGNORE_NOCALL_COLORSPACE || !RecalDataManager.checkNoCallColorSpace( read ) ) { - RecalDataManager.parseColorSpace( read ); + if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) + { + gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); + RecalDataManager.parseSAMRecord( gatkRead, RAC ); - // Skip if base quality is zero - if( read.getBaseQualities()[offset] > 0 ) { + // Skip over reads with no calls in the color space if the user requested it + if( RAC.IGNORE_NOCALL_COLORSPACE && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { + gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); + continue; + } - bases = read.getReadBases(); - refBase = (byte)ref.getBase(); + RecalDataManager.parseColorSpace( gatkRead ); + gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, + RecalDataManager.computeCovariates( gatkRead, requestedCovariates )); + } - // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { + // Skip this position if base quality is zero + if( gatkRead.getBaseQualities()[offset] > 0 ) { - // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( read, offset, refBase ); + bases = gatkRead.getReadBases(); + refBase = (byte)ref.getBase(); - } else { // calculate SOLID reference insertion rate - if( ref.getBase() == (char)bases[offset] ) { - solidInsertedReferenceBases++; - } else { - otherColorSpaceInconsistency++; - } + // Skip if this base is an 'N' or etc. + if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { + + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + + // This base finally passed all the checks for a good base, so add it to the big data hashmap + updateDataFromRead( gatkRead, offset, refBase ); + + } else { // calculate SOLID reference insertion rate + if( refBase == (char)bases[offset] ) { + solidInsertedReferenceBases++; + } else { + otherColorSpaceInconsistency++; } } } @@ -311,6 +344,8 @@ public class CovariateCounterWalker extends LocusWalker { return 1; // This value isn't actually used anywhere } + + /** * Update the mismatch / total_base counts for a given class of loci. * @@ -343,7 +378,7 @@ public class CovariateCounterWalker extends LocusWalker { final double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second; final double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second; - + if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); @@ -358,29 +393,26 @@ public class CovariateCounterWalker extends LocusWalker { * adding one to the number of observations and potentially one to the number of mismatches * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * because pulling things out of the SAMRecord is an expensive operation. - * @param read The SAMRecord holding all the data for this read + * @param gatkRead The SAMRecord holding all the data for this read * @param offset The offset in the read for this locus * @param refBase The reference base at this locus */ - private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) { + private void updateDataFromRead(final GATKSAMRecord gatkRead, final int offset, final byte refBase) { - final Object[] key = new Object[requestedCovariates.size()]; - - // Loop through the list of requested covariates and pick out the value from the read and offset - int iii = 0; - for( Covariate covariate : requestedCovariates ) { - key[iii++] = covariate.getValue( read, offset ); - } - // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap - RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key ); + final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); + final Object[] key = covars[offset]; + + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + final NestedHashMap data = dataManager.data; //optimization - create local reference + RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method - dataManager.data.put( datum, (Object[])key ); + data.put( datum, (Object[])key ); } - + // Need the bases to determine whether or not we have a mismatch - final byte base = read.getReadBases()[offset]; + final byte base = gatkRead.getReadBases()[offset]; final long curMismatches = datum.getNumMismatches(); // Add one to the number of observations and potentially one to the number of mismatches @@ -427,7 +459,7 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "Writing raw recalibration data..." ); outputToCSV( recalTableStream ); logger.info( "...done!" ); - + recalTableStream.close(); } @@ -436,7 +468,7 @@ public class CovariateCounterWalker extends LocusWalker { * @param recalTableStream The PrintStream to write out to */ private void outputToCSV( final PrintStream recalTableStream ) { - + recalTableStream.printf("# Counted Sites %d%n", countedSites); recalTableStream.printf("# Counted Bases %d%n", countedBases); recalTableStream.printf("# Skipped Sites %d%n", skippedSites); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 7b0859390..c460dadc5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -1,10 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; + /* * Copyright (c) 2009 The Broad Institute * @@ -72,9 +73,9 @@ public class CycleCovariate implements StandardCovariate { if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" cycle = offset + 1; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - offset; - } + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - offset; + } } //----------------------------- @@ -119,10 +120,10 @@ public class CycleCovariate implements StandardCovariate { //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf // int pos = offset + 1; - // if( read.getReadNegativeStrandFlag() ) { - // pos = read.getReadLength() - offset; - // } - // cycle = pos / 5; // integer division + // if( read.getReadNegativeStrandFlag() ) { + // pos = read.getReadLength() - offset; + // } + // cycle = pos / 5; // integer division //} //----------------------------- @@ -158,6 +159,106 @@ public class CycleCovariate implements StandardCovariate { return cycle; } + + public void getValues(SAMRecord read, Comparable[] comparable) { + + //----------------------------- + // ILLUMINA and SOLID + //----------------------------- + + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" + read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" + + final int init; + final int increment; + if( !read.getReadNegativeStrandFlag() ) { + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, positive strand + init = -1; + increment = -1; + } + else + { + //first of pair, positive strand + init = 1; + increment = 1; + } + + } else { + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, negative strand + init = -read.getReadLength(); + increment = 1; + } + else + { + //first of pair, negative strand + init = read.getReadLength(); + increment = -1; + } + } + + int cycle = init; + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = cycle; + cycle += increment; + } + } + else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" + + final int readLength = read.getReadLength(); + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? -1 : 1; + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + int iii = 0; + while( iii < readLength ) + { + while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } + if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } + + } + } else { // Negative direction + int iii = readLength-1; + while( iii >= 0 ) + { + while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } + if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } + } + } + } + else { + throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); + } + + + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java index f2428e808..9e1c2fe1f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java @@ -61,7 +61,7 @@ public class Dinuc implements Comparable{ } public static int hashBytes(final byte byte1, final byte byte2) { - return byte1 << 16 + byte2 << 4; + return byte1 << 8 + byte2; } public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index aa4d50b0a..c7be5057a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -2,9 +2,10 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import java.util.HashMap; -import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; + /* * Copyright (c) 2009 The Broad Institute * @@ -45,7 +46,7 @@ public class DinucCovariate implements StandardCovariate { private static final byte NO_CALL = (byte)'N'; private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); - HashMap dinucHashMap; + private HashMap dinucHashMap; // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { @@ -89,10 +90,49 @@ public class DinucCovariate implements StandardCovariate { if( !BaseUtils.isRegularBase( prevBase ) ) { return NO_DINUC; } - + return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); } + + /** + * Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read. + */ + public void getValues( SAMRecord read, Comparable[] result ) { + final HashMap dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap + final int readLength = read.getReadLength(); + final boolean negativeStrand = read.getReadNegativeStrandFlag(); + byte[] bases = read.getReadBases(); + byte base; + byte prevBase; + int offset = 0; + // If this is a negative strand read then we need to reverse the direction for our previous base + + if(negativeStrand) { + bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place + } + result[0] = NO_DINUC; // No dinuc at the beginning of the read + + prevBase = bases[0]; + offset++; + while(offset < readLength) { + // Note: We are using the previous base in the read, not the + // previous base in the reference. This is done in part to be consistent with unmapped reads. + base = bases[offset]; + if( BaseUtils.isRegularBase( prevBase ) ) { + result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) ); + } else { + result[offset] = NO_DINUC; + } + + offset++; + prevBase = base; + } + if(negativeStrand) { + reverse( result ); + } + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); @@ -100,7 +140,20 @@ public class DinucCovariate implements StandardCovariate { return null; } return returnDinuc; - } -} \ No newline at end of file + + /** + * Reverses the given array in place. + * + * @param array + */ + private static final void reverse(final Comparable[] array) { + final int arrayLength = array.length; + for(int l = 0, r = arrayLength - 1; l < r; l++, r--) { + final Comparable temp = array[l]; + array[l] = array[r]; + array[r] = temp; + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java index 86b5e9e7d..a66acdba8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java @@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord; * Date: Jan 29, 2010 * * The number of previous N bases (along the direction of the read) that contain G's and C's. The goal is to correct for dye slippage. - * Only valid for Illumina reads. Otherwise return -1. + * Only valid for Illumina reads. Otherwise return -1. */ public class GCContentCovariate implements ExperimentalCovariate { @@ -69,16 +69,22 @@ public class GCContentCovariate implements ExperimentalCovariate { numGC++; } } - + return numGC; } else { // This effect is specific to the Illumina platform return -1; } } + + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); } + + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index cfeebdfd1..4ed1a6a27 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -90,6 +90,10 @@ public class HomopolymerCovariate implements ExperimentalCovariate { return numAgree; } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index 398884dbb..e7147cbca 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -42,8 +42,8 @@ public class MappingQualityCovariate implements ExperimentalCovariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getMappingQuality(); + public final Comparable getValue( final SAMRecord read, final int offset ) { + return read.getMappingQuality(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @@ -51,4 +51,8 @@ public class MappingQualityCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 355e4bb5c..6b8100e17 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -47,8 +47,8 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { // Used to pick out the covariate's value from attributes of the read public final Comparable getValue( final SAMRecord read, final int offset ) { - - // Loop over the list of base quality scores in the window and find the minimum + + // Loop over the list of base quality scores in the window and find the minimum final byte[] quals = read.getBaseQualities(); int minQual = quals[offset]; final int minIndex = Math.max(offset - windowReach, 0); @@ -66,4 +66,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 21a3bbef9..5242d05fc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -56,4 +56,8 @@ public class PositionCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 2613d6248..684c149f9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * * The Primer Round covariate. * For Solexa and 454 this is the same value of the length of the read. - * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf */ public class PrimerRoundCovariate implements ExperimentalCovariate { @@ -47,10 +47,10 @@ public class PrimerRoundCovariate implements ExperimentalCovariate { public final Comparable getValue( final SAMRecord read, final int offset ) { if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { int pos = offset; - if( read.getReadNegativeStrandFlag() ) { - pos = read.getReadLength() - (offset + 1); - } - return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + if( read.getReadNegativeStrandFlag() ) { + pos = read.getReadLength() - (offset + 1); + } + return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf } else { return 1; // nothing to do here because it is always the same } @@ -62,4 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index 337fcf4b5..bd1f957bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -42,8 +42,15 @@ public class QualityScoreCovariate implements RequiredCovariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return (int)(read.getBaseQualities()[offset]); + public final Comparable getValue( final SAMRecord read, final int offset ) { + return (int)(read.getBaseQualities()[offset]); + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + byte[] baseQualities = read.getBaseQualities(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = (int) baseQualities[i]; + } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 4a87c1f8d..f1ae7b24b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -45,12 +45,19 @@ public class ReadGroupCovariate implements RequiredCovariate{ // Used to pick out the covariate's value from attributes of the read public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getReadGroup().getReadGroupId(); + return read.getReadGroup().getReadGroupId(); + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + final String readGroupId = read.getReadGroup().getReadGroupId(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = readGroupId; + } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { - return str; + return str; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 23e200ed2..7c0f25ae6 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -44,7 +44,7 @@ import net.sf.samtools.SAMReadGroupRecord; */ public class RecalDataManager { - + public final NestedHashMap data; // The full dataset private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed @@ -58,14 +58,14 @@ public class RecalDataManager { private static boolean warnUserNullPlatform = false; RecalDataManager() { - data = new NestedHashMap(); + data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; } RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker data = null; dataCollapsedReadGroup = new NestedHashMap(); dataCollapsedQualityScore = new NestedHashMap(); @@ -78,9 +78,9 @@ public class RecalDataManager { dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; - } + } } - + /** * Add the given mapping to all of the collapsed hash tables * @param key The list of comparables that is the key for this mapping @@ -140,7 +140,7 @@ public class RecalDataManager { * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score * that will be used in the sequential calculation in TableRecalibrationWalker * @param smoothing The smoothing paramter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores + * @param maxQual At which value to cap the quality scores */ public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { @@ -349,7 +349,7 @@ public class RecalDataManager { originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, coinFlip); - } + } } else { throw new StingException("Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + @@ -542,9 +542,41 @@ public class RecalDataManager { // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); // } //} - + } else { // No inconsistency array, so nothing is inconsistent return false; } } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * @param gatkRead The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @return An array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + */ + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { + //compute all covariates for this read + final List requestedCovariatesRef = requestedCovariates; + final int numRequestedCovariates = requestedCovariatesRef.size(); + final int readLength = gatkRead.getReadLength(); + + final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; + final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; + + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for( int i = 0; i < numRequestedCovariates; i++ ) { + requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder ); + for(int j = 0; j < readLength; j++) { + //copy values into a 2D array that allows all covar types to be extracted at once for + //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; + } + } + + return covariateValues_offset_x_covar; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 6e4788037..7cfd069e5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,24 +1,41 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.*; -import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.cmdLine.*; -import org.broadinstitute.sting.utils.*; - +import java.io.File; +import java.io.FileNotFoundException; +import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; import java.util.Random; import java.util.ResourceBundle; import java.util.regex.Pattern; -import java.io.File; -import java.io.FileNotFoundException; -import java.lang.reflect.Field; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMTag; +import net.sf.samtools.util.SequenceUtil; + +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.JVMUtils; +import org.broadinstitute.sting.utils.NestedHashMap; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.TextFormattingUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; +import org.broadinstitute.sting.utils.cmdLine.ArgumentDefinition; +import org.broadinstitute.sting.utils.cmdLine.ArgumentSource; +import org.broadinstitute.sting.utils.cmdLine.ArgumentTypeDescriptor; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -33,7 +50,7 @@ import java.lang.reflect.Field; * conditions: * * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES @@ -51,7 +68,7 @@ import java.lang.reflect.Field; * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. * This walker then outputs a new bam file with these updated (recalibrated) reads. - * + * * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. * @@ -81,7 +98,7 @@ public class TableRecalibrationWalker extends ReadWalker temporaryAttributes; + public GATKSAMRecord(SAMRecord record) { super(null); // it doesn't matter - this isn't used if ( record == null ) throw new IllegalArgumentException("The SAMRecord argument cannot be null"); mRecord = record; + mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); + mUnmappedFlag = mRecord.getReadUnmappedFlag(); + // because attribute methods are declared to be final (and we can't overload them), // we need to actually set all of the attributes here @@ -79,8 +96,6 @@ public class GATKSAMRecord extends SAMRecord { } public boolean getReadUnmappedFlag() { - if ( mUnmappedFlag == null ) - mUnmappedFlag = mRecord.getReadUnmappedFlag(); return mUnmappedFlag; } @@ -90,8 +105,6 @@ public class GATKSAMRecord extends SAMRecord { } public boolean getReadNegativeStrandFlag() { - if ( mNegativeStrandFlag == null ) - mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); return mNegativeStrandFlag; } @@ -100,6 +113,90 @@ public class GATKSAMRecord extends SAMRecord { mNegativeStrandFlag = b; } + public boolean getSecondOfPairFlag() { + if( mSecondOfPairFlag == null ) { + //not done in constructor because this method can't be called for + //all SAMRecords. + mSecondOfPairFlag = mRecord.getSecondOfPairFlag(); + } + return mSecondOfPairFlag; + } + + public void setSecondOfPairFlag(boolean b) { + mRecord.setSecondOfPairFlag(b); + mSecondOfPairFlag = b; + } + + + /** + * Checks whether an attribute has been set for the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return True if an attribute has been set for this key. + */ + public boolean containsTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.containsKey(key); + } + return false; + } + + /** + * Sets the key to the given value, replacing any previous value. The previous + * value is returned. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @param value + */ + public Object setTemporaryAttribute(Object key, Object value) { + if(temporaryAttributes == null) { + temporaryAttributes = new HashMap(); + } + return temporaryAttributes.put(key, value); + } + + /** + * Looks up the value associated with the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return The value, or null. + */ + public Object getTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.get(key); + } + return null; + } + + /** + * Removes the attribute that has the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return The value that was associated with this key, or null. + */ + public Object removeTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.remove(key); + } + return null; + } + ///////////////////////////////////////////////////////////////////// // *** The following methods are final and cannot be overridden ***// ///////////////////////////////////////////////////////////////////// @@ -209,8 +306,6 @@ public class GATKSAMRecord extends SAMRecord { public boolean getFirstOfPairFlag() { return mRecord.getFirstOfPairFlag(); } - public boolean getSecondOfPairFlag() { return mRecord.getSecondOfPairFlag(); } - public boolean getNotPrimaryAlignmentFlag() { return mRecord.getNotPrimaryAlignmentFlag(); } public boolean getReadFailsVendorQualityCheckFlag() { return mRecord.getReadFailsVendorQualityCheckFlag(); } @@ -227,8 +322,6 @@ public class GATKSAMRecord extends SAMRecord { public void setFirstOfPairFlag(boolean b) { mRecord.setFirstOfPairFlag(b); } - public void setSecondOfPairFlag(boolean b) { mRecord.setSecondOfPairFlag(b); } - public void setNotPrimaryAlignmentFlag(boolean b) { mRecord.setNotPrimaryAlignmentFlag(b); } public void setReadFailsVendorQualityCheckFlag(boolean b) { mRecord.setReadFailsVendorQualityCheckFlag(b); }