From b236714c8a32ee64caf792a5a1c404e9608c424a Mon Sep 17 00:00:00 2001 From: alecw Date: Mon, 22 Feb 2010 17:35:25 +0000 Subject: [PATCH] Optimization - Added method to Covariates: void getValues( SAMRecord read, Comparable[] comparable ) which takes an array of size (at least) read.getReadLength() and fills it with covariate values for all positions in the given read. Made CovariateCounterWalker and TableRecalibrationWalker use this method instead of calling getValue(..) for each covariate and each offset. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2863 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/recalibration/Covariate.java | 3 + .../recalibration/CovariateCounterWalker.java | 138 +++++++++++------- .../walkers/recalibration/CycleCovariate.java | 121 +++++++++++++-- .../gatk/walkers/recalibration/Dinuc.java | 2 +- .../walkers/recalibration/DinucCovariate.java | 63 +++++++- .../recalibration/GCContentCovariate.java | 10 +- .../recalibration/HomopolymerCovariate.java | 4 + .../MappingQualityCovariate.java | 8 +- .../recalibration/MinimumNQSCovariate.java | 7 +- .../recalibration/PositionCovariate.java | 4 + .../recalibration/PrimerRoundCovariate.java | 13 +- .../recalibration/QualityScoreCovariate.java | 11 +- .../recalibration/ReadGroupCovariate.java | 11 +- .../recalibration/RecalDataManager.java | 48 +++++- .../TableRecalibrationWalker.java | 84 +++++++---- .../walkers/recalibration/TileCovariate.java | 8 +- .../sting/utils/NestedHashMap.java | 10 +- .../sting/utils/sam/GATKSAMRecord.java | 117 +++++++++++++-- 18 files changed, 524 insertions(+), 138 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java index 70b8d5fcf..996528a6c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Covariate.java @@ -41,6 +41,9 @@ public interface Covariate { public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers public Comparable getValue( SAMRecord read, int offset ); // Used to pick out the covariate's value from attributes of the read public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate + //values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows + //read-specific calculations to be done just once rather than for each offset. } interface RequiredCovariate extends Covariate { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 2c86b835e..db3b7fae9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -1,23 +1,35 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.gatk.walkers.*; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.walkers.By; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.NestedHashMap; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.genotype.Variation; - -import java.io.PrintStream; -import java.io.FileNotFoundException; -import java.util.*; - -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -68,6 +80,13 @@ import net.sf.samtools.SAMRecord; @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { + ///////////////////////////// + // Constants + ///////////////////////////// + private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped. + private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed. + private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord. + ///////////////////////////// // Shared Arguments ///////////////////////////// @@ -139,7 +158,7 @@ public class CovariateCounterWalker extends LocusWalker { out.println( covClass.getSimpleName() ); } out.println(); - + System.exit( 0 ); // Early exit here because user requested it } @@ -252,43 +271,57 @@ public class CovariateCounterWalker extends LocusWalker { if( !isSNP && ( ++numUnprocessed >= PROCESS_EVERY_NTH_LOCUS ) ) { numUnprocessed = 0; // Reset the counter because we are processing this very locus - SAMRecord read; + GATKSAMRecord gatkRead; int offset; byte refBase; byte[] bases; // For each read at this locus for( PileupElement p : context.getBasePileup() ) { - read = p.getRead(); + gatkRead = (GATKSAMRecord) p.getRead(); offset = p.getOffset(); - RecalDataManager.parseSAMRecord( read, RAC ); + if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) { + continue; + } - // Skip over reads with no calls in the color space if the user requested it - if( !RAC.IGNORE_NOCALL_COLORSPACE || !RecalDataManager.checkNoCallColorSpace( read ) ) { - RecalDataManager.parseColorSpace( read ); + if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) ) + { + gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true ); + RecalDataManager.parseSAMRecord( gatkRead, RAC ); - // Skip if base quality is zero - if( read.getBaseQualities()[offset] > 0 ) { + // Skip over reads with no calls in the color space if the user requested it + if( RAC.IGNORE_NOCALL_COLORSPACE && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) { + gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true); + continue; + } - bases = read.getReadBases(); - refBase = (byte)ref.getBase(); + RecalDataManager.parseColorSpace( gatkRead ); + gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE, + RecalDataManager.computeCovariates( gatkRead, requestedCovariates )); + } - // Skip if this base is an 'N' or etc. - if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { + // Skip this position if base quality is zero + if( gatkRead.getBaseQualities()[offset] > 0 ) { - // This base finally passed all the checks for a good base, so add it to the big data hashmap - updateDataFromRead( read, offset, refBase ); + bases = gatkRead.getReadBases(); + refBase = (byte)ref.getBase(); - } else { // calculate SOLID reference insertion rate - if( ref.getBase() == (char)bases[offset] ) { - solidInsertedReferenceBases++; - } else { - otherColorSpaceInconsistency++; - } + // Skip if this base is an 'N' or etc. + if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { + + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) { + + // This base finally passed all the checks for a good base, so add it to the big data hashmap + updateDataFromRead( gatkRead, offset, refBase ); + + } else { // calculate SOLID reference insertion rate + if( refBase == (char)bases[offset] ) { + solidInsertedReferenceBases++; + } else { + otherColorSpaceInconsistency++; } } } @@ -311,6 +344,8 @@ public class CovariateCounterWalker extends LocusWalker { return 1; // This value isn't actually used anywhere } + + /** * Update the mismatch / total_base counts for a given class of loci. * @@ -343,7 +378,7 @@ public class CovariateCounterWalker extends LocusWalker { final double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second; final double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second; - + if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) { Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) ); @@ -358,29 +393,26 @@ public class CovariateCounterWalker extends LocusWalker { * adding one to the number of observations and potentially one to the number of mismatches * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls * because pulling things out of the SAMRecord is an expensive operation. - * @param read The SAMRecord holding all the data for this read + * @param gatkRead The SAMRecord holding all the data for this read * @param offset The offset in the read for this locus * @param refBase The reference base at this locus */ - private void updateDataFromRead(final SAMRecord read, final int offset, final byte refBase) { + private void updateDataFromRead(final GATKSAMRecord gatkRead, final int offset, final byte refBase) { - final Object[] key = new Object[requestedCovariates.size()]; - - // Loop through the list of requested covariates and pick out the value from the read and offset - int iii = 0; - for( Covariate covariate : requestedCovariates ) { - key[iii++] = covariate.getValue( read, offset ); - } - // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap - RecalDatumOptimized datum = (RecalDatumOptimized) dataManager.data.get( key ); + final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE); + final Object[] key = covars[offset]; + + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + final NestedHashMap data = dataManager.data; //optimization - create local reference + RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatumOptimized(); // initialized with zeros, will be incremented at end of method - dataManager.data.put( datum, (Object[])key ); + data.put( datum, (Object[])key ); } - + // Need the bases to determine whether or not we have a mismatch - final byte base = read.getReadBases()[offset]; + final byte base = gatkRead.getReadBases()[offset]; final long curMismatches = datum.getNumMismatches(); // Add one to the number of observations and potentially one to the number of mismatches @@ -427,7 +459,7 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "Writing raw recalibration data..." ); outputToCSV( recalTableStream ); logger.info( "...done!" ); - + recalTableStream.close(); } @@ -436,7 +468,7 @@ public class CovariateCounterWalker extends LocusWalker { * @param recalTableStream The PrintStream to write out to */ private void outputToCSV( final PrintStream recalTableStream ) { - + recalTableStream.printf("# Counted Sites %d%n", countedSites); recalTableStream.printf("# Counted Bases %d%n", countedBases); recalTableStream.printf("# Skipped Sites %d%n", skippedSites); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 7b0859390..c460dadc5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -1,10 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.Utils; + /* * Copyright (c) 2009 The Broad Institute * @@ -72,9 +73,9 @@ public class CycleCovariate implements StandardCovariate { if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" cycle = offset + 1; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - offset; - } + if( read.getReadNegativeStrandFlag() ) { + cycle = read.getReadLength() - offset; + } } //----------------------------- @@ -119,10 +120,10 @@ public class CycleCovariate implements StandardCovariate { //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf // int pos = offset + 1; - // if( read.getReadNegativeStrandFlag() ) { - // pos = read.getReadLength() - offset; - // } - // cycle = pos / 5; // integer division + // if( read.getReadNegativeStrandFlag() ) { + // pos = read.getReadLength() - offset; + // } + // cycle = pos / 5; // integer division //} //----------------------------- @@ -158,6 +159,106 @@ public class CycleCovariate implements StandardCovariate { return cycle; } + + public void getValues(SAMRecord read, Comparable[] comparable) { + + //----------------------------- + // ILLUMINA and SOLID + //----------------------------- + + if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" + read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" + + final int init; + final int increment; + if( !read.getReadNegativeStrandFlag() ) { + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, positive strand + init = -1; + increment = -1; + } + else + { + //first of pair, positive strand + init = 1; + increment = 1; + } + + } else { + if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { + //second of pair, negative strand + init = -read.getReadLength(); + increment = 1; + } + else + { + //first of pair, negative strand + init = read.getReadLength(); + increment = -1; + } + } + + int cycle = init; + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = cycle; + cycle += increment; + } + } + else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" + + final int readLength = read.getReadLength(); + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? -1 : 1; + + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + int iii = 0; + while( iii < readLength ) + { + while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; } + while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; } + if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; } + + } + } else { // Negative direction + int iii = readLength-1; + while( iii >= 0 ) + { + while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; } + while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; } + if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; } + if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; } + } + } + } + else { + throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); + } + + + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java index f2428e808..9e1c2fe1f 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/Dinuc.java @@ -61,7 +61,7 @@ public class Dinuc implements Comparable{ } public static int hashBytes(final byte byte1, final byte byte2) { - return byte1 << 16 + byte2 << 4; + return byte1 << 8 + byte2; } public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java index aa4d50b0a..c7be5057a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/DinucCovariate.java @@ -2,9 +2,10 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import java.util.HashMap; -import org.broadinstitute.sting.utils.BaseUtils; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; + /* * Copyright (c) 2009 The Broad Institute * @@ -45,7 +46,7 @@ public class DinucCovariate implements StandardCovariate { private static final byte NO_CALL = (byte)'N'; private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL); - HashMap dinucHashMap; + private HashMap dinucHashMap; // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { @@ -89,10 +90,49 @@ public class DinucCovariate implements StandardCovariate { if( !BaseUtils.isRegularBase( prevBase ) ) { return NO_DINUC; } - + return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); } + + /** + * Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read. + */ + public void getValues( SAMRecord read, Comparable[] result ) { + final HashMap dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap + final int readLength = read.getReadLength(); + final boolean negativeStrand = read.getReadNegativeStrandFlag(); + byte[] bases = read.getReadBases(); + byte base; + byte prevBase; + int offset = 0; + // If this is a negative strand read then we need to reverse the direction for our previous base + + if(negativeStrand) { + bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place + } + result[0] = NO_DINUC; // No dinuc at the beginning of the read + + prevBase = bases[0]; + offset++; + while(offset < readLength) { + // Note: We are using the previous base in the read, not the + // previous base in the reference. This is done in part to be consistent with unmapped reads. + base = bases[offset]; + if( BaseUtils.isRegularBase( prevBase ) ) { + result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) ); + } else { + result[offset] = NO_DINUC; + } + + offset++; + prevBase = base; + } + if(negativeStrand) { + reverse( result ); + } + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); @@ -100,7 +140,20 @@ public class DinucCovariate implements StandardCovariate { return null; } return returnDinuc; - } -} \ No newline at end of file + + /** + * Reverses the given array in place. + * + * @param array + */ + private static final void reverse(final Comparable[] array) { + final int arrayLength = array.length; + for(int l = 0, r = arrayLength - 1; l < r; l++, r--) { + final Comparable temp = array[l]; + array[l] = array[r]; + array[r] = temp; + } + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java index 86b5e9e7d..a66acdba8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/GCContentCovariate.java @@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord; * Date: Jan 29, 2010 * * The number of previous N bases (along the direction of the read) that contain G's and C's. The goal is to correct for dye slippage. - * Only valid for Illumina reads. Otherwise return -1. + * Only valid for Illumina reads. Otherwise return -1. */ public class GCContentCovariate implements ExperimentalCovariate { @@ -69,16 +69,22 @@ public class GCContentCovariate implements ExperimentalCovariate { numGC++; } } - + return numGC; } else { // This effect is specific to the Illumina platform return -1; } } + + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); } + + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java index cfeebdfd1..4ed1a6a27 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -90,6 +90,10 @@ public class HomopolymerCovariate implements ExperimentalCovariate { return numAgree; } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { return Integer.parseInt( str ); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java index 398884dbb..e7147cbca 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MappingQualityCovariate.java @@ -42,8 +42,8 @@ public class MappingQualityCovariate implements ExperimentalCovariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getMappingQuality(); + public final Comparable getValue( final SAMRecord read, final int offset ) { + return read.getMappingQuality(); } // Used to get the covariate's value from input csv file in TableRecalibrationWalker @@ -51,4 +51,8 @@ public class MappingQualityCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java index 355e4bb5c..6b8100e17 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/MinimumNQSCovariate.java @@ -47,8 +47,8 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { // Used to pick out the covariate's value from attributes of the read public final Comparable getValue( final SAMRecord read, final int offset ) { - - // Loop over the list of base quality scores in the window and find the minimum + + // Loop over the list of base quality scores in the window and find the minimum final byte[] quals = read.getBaseQualities(); int minQual = quals[offset]; final int minIndex = Math.max(offset - windowReach, 0); @@ -66,4 +66,7 @@ public class MinimumNQSCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java index 21a3bbef9..5242d05fc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PositionCovariate.java @@ -56,4 +56,8 @@ public class PositionCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } + } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java index 2613d6248..684c149f9 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/PrimerRoundCovariate.java @@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord; * * The Primer Round covariate. * For Solexa and 454 this is the same value of the length of the read. - * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + * For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf */ public class PrimerRoundCovariate implements ExperimentalCovariate { @@ -47,10 +47,10 @@ public class PrimerRoundCovariate implements ExperimentalCovariate { public final Comparable getValue( final SAMRecord read, final int offset ) { if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { int pos = offset; - if( read.getReadNegativeStrandFlag() ) { - pos = read.getReadLength() - (offset + 1); - } - return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf + if( read.getReadNegativeStrandFlag() ) { + pos = read.getReadLength() - (offset + 1); + } + return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf } else { return 1; // nothing to do here because it is always the same } @@ -62,4 +62,7 @@ public class PrimerRoundCovariate implements ExperimentalCovariate { return Integer.parseInt( str ); } + public void getValues(SAMRecord read, Comparable[] comparable) { + throw new IllegalStateException("Not yet implemented"); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java index 337fcf4b5..bd1f957bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/QualityScoreCovariate.java @@ -42,8 +42,15 @@ public class QualityScoreCovariate implements RequiredCovariate { } // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - return (int)(read.getBaseQualities()[offset]); + public final Comparable getValue( final SAMRecord read, final int offset ) { + return (int)(read.getBaseQualities()[offset]); + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + byte[] baseQualities = read.getBaseQualities(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = (int) baseQualities[i]; + } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java index 4a87c1f8d..f1ae7b24b 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/ReadGroupCovariate.java @@ -45,12 +45,19 @@ public class ReadGroupCovariate implements RequiredCovariate{ // Used to pick out the covariate's value from attributes of the read public final Comparable getValue( final SAMRecord read, final int offset ) { - return read.getReadGroup().getReadGroupId(); + return read.getReadGroup().getReadGroupId(); + } + + public void getValues(SAMRecord read, Comparable[] comparable) { + final String readGroupId = read.getReadGroup().getReadGroupId(); + for(int i = 0; i < read.getReadLength(); i++) { + comparable[i] = readGroupId; + } } // Used to get the covariate's value from input csv file in TableRecalibrationWalker public final Comparable getValue( final String str ) { - return str; + return str; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index 23e200ed2..7c0f25ae6 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -44,7 +44,7 @@ import net.sf.samtools.SAMReadGroupRecord; */ public class RecalDataManager { - + public final NestedHashMap data; // The full dataset private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed @@ -58,14 +58,14 @@ public class RecalDataManager { private static boolean warnUserNullPlatform = false; RecalDataManager() { - data = new NestedHashMap(); + data = new NestedHashMap(); dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; } RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) { - if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker + if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker data = null; dataCollapsedReadGroup = new NestedHashMap(); dataCollapsedQualityScore = new NestedHashMap(); @@ -78,9 +78,9 @@ public class RecalDataManager { dataCollapsedReadGroup = null; dataCollapsedQualityScore = null; dataCollapsedByCovariate = null; - } + } } - + /** * Add the given mapping to all of the collapsed hash tables * @param key The list of comparables that is the key for this mapping @@ -140,7 +140,7 @@ public class RecalDataManager { * Loop over all the collapsed tables and turn the recalDatums found there into an empricial quality score * that will be used in the sequential calculation in TableRecalibrationWalker * @param smoothing The smoothing paramter that goes into empirical quality score calculation - * @param maxQual At which value to cap the quality scores + * @param maxQual At which value to cap the quality scores */ public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) { @@ -349,7 +349,7 @@ public class RecalDataManager { originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN); } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, coinFlip); - } + } } else { throw new StingException("Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + @@ -542,9 +542,41 @@ public class RecalDataManager { // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); // } //} - + } else { // No inconsistency array, so nothing is inconsistent return false; } } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * @param gatkRead The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @return An array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + */ + public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List requestedCovariates) { + //compute all covariates for this read + final List requestedCovariatesRef = requestedCovariates; + final int numRequestedCovariates = requestedCovariatesRef.size(); + final int readLength = gatkRead.getReadLength(); + + final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates]; + final Comparable[] tempCovariateValuesHolder = new Comparable[readLength]; + + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for( int i = 0; i < numRequestedCovariates; i++ ) { + requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder ); + for(int j = 0; j < readLength; j++) { + //copy values into a 2D array that allows all covar types to be extracted at once for + //an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types. + covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; + } + } + + return covariateValues_offset_x_covar; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 6e4788037..7cfd069e5 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -1,24 +1,41 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; -import net.sf.samtools.*; -import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.WalkerName; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.cmdLine.*; -import org.broadinstitute.sting.utils.*; - +import java.io.File; +import java.io.FileNotFoundException; +import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; import java.util.Random; import java.util.ResourceBundle; import java.util.regex.Pattern; -import java.io.File; -import java.io.FileNotFoundException; -import java.lang.reflect.Field; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMTag; +import net.sf.samtools.util.SequenceUtil; + +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.JVMUtils; +import org.broadinstitute.sting.utils.NestedHashMap; +import org.broadinstitute.sting.utils.PackageUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.TextFormattingUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.ArgumentCollection; +import org.broadinstitute.sting.utils.cmdLine.ArgumentDefinition; +import org.broadinstitute.sting.utils.cmdLine.ArgumentSource; +import org.broadinstitute.sting.utils.cmdLine.ArgumentTypeDescriptor; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -33,7 +50,7 @@ import java.lang.reflect.Field; * conditions: * * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES @@ -51,7 +68,7 @@ import java.lang.reflect.Field; * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read. * This walker then outputs a new bam file with these updated (recalibrated) reads. - * + * * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker. * Note: This walker is designed to be used in conjunction with CovariateCounterWalker. * @@ -81,7 +98,7 @@ public class TableRecalibrationWalker extends ReadWalker temporaryAttributes; + public GATKSAMRecord(SAMRecord record) { super(null); // it doesn't matter - this isn't used if ( record == null ) throw new IllegalArgumentException("The SAMRecord argument cannot be null"); mRecord = record; + mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); + mUnmappedFlag = mRecord.getReadUnmappedFlag(); + // because attribute methods are declared to be final (and we can't overload them), // we need to actually set all of the attributes here @@ -79,8 +96,6 @@ public class GATKSAMRecord extends SAMRecord { } public boolean getReadUnmappedFlag() { - if ( mUnmappedFlag == null ) - mUnmappedFlag = mRecord.getReadUnmappedFlag(); return mUnmappedFlag; } @@ -90,8 +105,6 @@ public class GATKSAMRecord extends SAMRecord { } public boolean getReadNegativeStrandFlag() { - if ( mNegativeStrandFlag == null ) - mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); return mNegativeStrandFlag; } @@ -100,6 +113,90 @@ public class GATKSAMRecord extends SAMRecord { mNegativeStrandFlag = b; } + public boolean getSecondOfPairFlag() { + if( mSecondOfPairFlag == null ) { + //not done in constructor because this method can't be called for + //all SAMRecords. + mSecondOfPairFlag = mRecord.getSecondOfPairFlag(); + } + return mSecondOfPairFlag; + } + + public void setSecondOfPairFlag(boolean b) { + mRecord.setSecondOfPairFlag(b); + mSecondOfPairFlag = b; + } + + + /** + * Checks whether an attribute has been set for the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return True if an attribute has been set for this key. + */ + public boolean containsTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.containsKey(key); + } + return false; + } + + /** + * Sets the key to the given value, replacing any previous value. The previous + * value is returned. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @param value + */ + public Object setTemporaryAttribute(Object key, Object value) { + if(temporaryAttributes == null) { + temporaryAttributes = new HashMap(); + } + return temporaryAttributes.put(key, value); + } + + /** + * Looks up the value associated with the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return The value, or null. + */ + public Object getTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.get(key); + } + return null; + } + + /** + * Removes the attribute that has the given key. + * + * Temporary attributes provide a way to label or attach arbitrary data to + * individual GATKSAMRecords. These attributes exist in memory only, + * and are never written to disk. + * + * @param key + * @return The value that was associated with this key, or null. + */ + public Object removeTemporaryAttribute(Object key) { + if(temporaryAttributes != null) { + return temporaryAttributes.remove(key); + } + return null; + } + ///////////////////////////////////////////////////////////////////// // *** The following methods are final and cannot be overridden ***// ///////////////////////////////////////////////////////////////////// @@ -209,8 +306,6 @@ public class GATKSAMRecord extends SAMRecord { public boolean getFirstOfPairFlag() { return mRecord.getFirstOfPairFlag(); } - public boolean getSecondOfPairFlag() { return mRecord.getSecondOfPairFlag(); } - public boolean getNotPrimaryAlignmentFlag() { return mRecord.getNotPrimaryAlignmentFlag(); } public boolean getReadFailsVendorQualityCheckFlag() { return mRecord.getReadFailsVendorQualityCheckFlag(); } @@ -227,8 +322,6 @@ public class GATKSAMRecord extends SAMRecord { public void setFirstOfPairFlag(boolean b) { mRecord.setFirstOfPairFlag(b); } - public void setSecondOfPairFlag(boolean b) { mRecord.setSecondOfPairFlag(b); } - public void setNotPrimaryAlignmentFlag(boolean b) { mRecord.setNotPrimaryAlignmentFlag(b); } public void setReadFailsVendorQualityCheckFlag(boolean b) { mRecord.setReadFailsVendorQualityCheckFlag(b); }