diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index 486fb9651..7ac202ce6 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -38,7 +38,7 @@ import net.sf.samtools.SAMRecord; */ public interface Covariate { - public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); // used to pick out the value from attributes of the read + public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, byte[] bases); // used to pick out the value from attributes of the read public Comparable getValue(String str); // used to get value from input file public int estimatedNumberOfBins(); // used to estimate the amount space required for the HashMap } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index f39d136cb..22f51474c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.*; @@ -57,27 +58,26 @@ import net.sf.samtools.SAMRecord; * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. */ -@WalkerName("CountCovariatesRefactored") +@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file +@WalkerName( "CountCovariatesRefactored" ) +@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality +@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) private Boolean LIST_ONLY = false; @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) private String[] COVARIATES = null; - @Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score") - private int MIN_MAPPING_QUALITY = 1; @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) private boolean USE_ORIGINAL_QUALS = false; @Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false) private String PLATFORM = "SLX"; - @Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) private int WINDOW_SIZE = 3; @Argument(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the outputted covariates table recalibration file") private String RECAL_FILE = "output.recal_data.csv"; - @Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file") + @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.") private boolean NO_PRINT_HEADER = false; - @Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="Depricated, no longer replicates previous behavior exactly due to clean up of structure of code. If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker") - private boolean validateOldRecalibrator = false; private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested @@ -101,8 +101,7 @@ public class CovariateCounterWalker extends LocusWalker { // Get a list of all available covariates final List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); - //int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate - + // Print and exit if that's what was requested if ( LIST_ONLY ) { out.println( "Available covariates:" ); @@ -121,60 +120,75 @@ public class CovariateCounterWalker extends LocusWalker { // Initialize the requested covariates by parsing the -cov argument requestedCovariates = new ArrayList(); - if( validateOldRecalibrator ) { - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new CycleCovariate( PLATFORM ) ); // unfortunately the ordering here is different to match the output of the old recalibrator - requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new DinucCovariate() ); - //estimatedCapacity = 300 * 200 * 40 * 16; - } else if( COVARIATES != null ) { - int covNumber = 1; - for( String requestedCovariateString : COVARIATES ) { - boolean foundClass = false; + int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one + if( COVARIATES != null ) { + if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used + requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found + requestedCovariates.add( new QualityScoreCovariate() ); for( Class covClass : classes ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class - foundClass = true; - // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list - if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) || - (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) { - throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" ); - } - covNumber++; - try { - // Now that we've found a matching class, try to instantiate it - Covariate covariate = (Covariate)covClass.newInstance(); - // some covariates need parameters (user supplied command line arguments) passed to them - if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); } - else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); } - else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + try { + Covariate covariate = (Covariate)covClass.newInstance(); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + // Some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); } + else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); } + else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + if( !( covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate ) ) { // these were already added so don't add them again requestedCovariates.add( covariate ); - //estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity - } catch ( InstantiationException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); - } catch ( IllegalAccessException e ) { - throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } + } + } else { // The user has specified a list of several covariates + int covNumber = 1; + for( String requestedCovariateString : COVARIATES ) { + boolean foundClass = false; + for( Class covClass : classes ) { + if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class + foundClass = true; + // Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list + if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) || + (covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) { + throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" ); + } + covNumber++; + try { + // Now that we've found a matching class, try to instantiate it + Covariate covariate = (Covariate)covClass.newInstance(); + estimatedCapacity *= covariate.estimatedNumberOfBins(); + // Some covariates need parameters (user supplied command line arguments) passed to them + if( covariate instanceof CycleCovariate ) { covariate = new CycleCovariate( PLATFORM ); } + else if( covariate instanceof PrimerRoundCovariate ) { covariate = new PrimerRoundCovariate( PLATFORM ); } + else if( covariate instanceof MinimumNQSCovariate ) { covariate = new MinimumNQSCovariate( WINDOW_SIZE ); } + requestedCovariates.add( covariate ); + } catch ( InstantiationException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); + } catch ( IllegalAccessException e ) { + throw new StingException( String.format("Can not instantiate covariate class '%s': must have no-arg constructor.", covClass.getSimpleName()) ); + } } } - } - if( !foundClass ) { - throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); + if( !foundClass ) { + throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." ); + } } } - } else { // Not validating old recalibrator and no covariates were specified by the user so add the default ones - logger.info( "Note: Using default set of covariates because none were specified." ); + } else { // No covariates were specified by the user so add the default, required ones + Utils.warnUser( "Using default set of covariates because none were specified. Using ReadGroupCovariate and QualityScoreCovariate only." ); requestedCovariates.add( new ReadGroupCovariate() ); requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new CycleCovariate( PLATFORM ) ); - requestedCovariates.add( new DinucCovariate() ); - //estimatedCapacity = 300 * 40 * 200 * 16; + estimatedCapacity = 300 * 40; } logger.info( "The covariates being used here: " ); logger.info( requestedCovariates ); - //dataManager = new RecalDataManager( estimatedCapacity ); - dataManager = new RecalDataManager(); + if(estimatedCapacity > 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + dataManager = new RecalDataManager( estimatedCapacity ); //readGroupHashMap = new HashMap( 50000000, 0.97f ); } @@ -195,7 +209,7 @@ public class CovariateCounterWalker extends LocusWalker { */ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); + final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP( tracker.getTrackData("dbsnp", null) ); // Only use data from non-dbsnp sites // Assume every mismatch at a non-dbsnp site is indicitive of poor quality @@ -206,61 +220,65 @@ public class CovariateCounterWalker extends LocusWalker { int offset; String readGroup; byte[] quals; - char[] bases; - char refBase; - char prevBase; + byte[] bases; + byte refBase; + byte prevBase; final int numReads = reads.size(); // For each read at this locus for( int iii = 0; iii < numReads; iii++ ) { read = reads.get(iii); - - // Only use data from reads with mapping quality above specified quality value - if( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { - - //readGroup = readGroupHashMap.get( read ); - //if( readGroup == null ) { // read is not in the hashmap so add it - // readGroup = read.getReadGroup().getReadGroupId(); - // readGroupHashMap.put( read, readGroup ); - //} - - offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct - - // skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases - if( offset > 0 && offset < read.getReadLength() - 1) { - - quals = read.getBaseQualities(); - // Check if we need to use the original quality scores instead - if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + + //readGroup = readGroupHashMap.get( read ); + //if( readGroup == null ) { // read is not in the hashmap so add it + // readGroup = read.getReadGroup().getReadGroupId(); + // readGroupHashMap.put( read, readGroup ); + //} + + offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct + + // skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases + if( offset > 0 && offset < read.getReadLength() - 1 ) { + + quals = read.getBaseQualities(); + // Check if we need to use the original quality scores instead + if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + quals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + } + } + + // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them + byte[] colorSpaceQuals = null; + if( PLATFORM.equalsIgnoreCase("SOLID") ) { + colorSpaceQuals = (byte[])read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG); + } + if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet + { + // skip if base quality is zero + if( quals[offset] > 0 ) { + bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A' + refBase = (byte)ref.getBase(); + prevBase = bases[offset-1]; + + // Get the complement base strand if we are a negative strand read + if( read.getReadNegativeStrandFlag() ) { + bases = BaseUtils.simpleComplement( bases ); // this is an expensive call + refBase = (byte)BaseUtils.simpleComplement( ref.getBase() ); + prevBase = bases[offset+1]; + } + + // skip if this base or the previous one was an 'N' or etc. + if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) { + readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call + updateDataFromRead( read, offset, readGroup, quals, bases, refBase ); } } - - // skip if base quality is zero - if( quals[offset] > 0 ) { - bases = read.getReadString().toCharArray(); - refBase = ref.getBase(); - prevBase = bases[offset-1]; - // Get the complement base strand if we are a negative strand read - if( read.getReadNegativeStrandFlag() ) { - bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray(); // this is an expensive call - refBase = BaseUtils.simpleComplement( refBase ); - prevBase = bases[offset+1]; - } - - // skip if this base or the previous one was an 'N' or etc. - if( BaseUtils.isRegularBase(prevBase) && BaseUtils.isRegularBase(bases[offset]) ) { - readGroup = read.getReadGroup().getReadGroupId(); // this is an expensive call - updateDataFromRead( read, offset, readGroup, quals, bases, refBase ); - } - } - } + } } - } countedSites++; @@ -285,31 +303,27 @@ public class CovariateCounterWalker extends LocusWalker { * @param bases The bases which make up the read * @param refBase The reference base at this locus */ - private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) { + private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final byte[] bases, final byte refBase) { List key = new ArrayList(); // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( read, offset, readGroup, quals, bases, refBase ) ); + key.add( covariate.getValue( read, offset, readGroup, quals, bases ) ); } - + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap RecalDatum datum = dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - if( validateOldRecalibrator ) { - dataManager.data.myPut( key, datum ); - } else { - dataManager.data.put( key, datum ); - } + dataManager.data.put( key, datum ); } // Need the bases to determine whether or not we have a mismatch - char base = bases[offset]; + byte base = bases[offset]; // Add one to the number of observations and potentially one to the number of mismatches - datum.increment( base, refBase ); + datum.increment( (char)base, (char)refBase ); // dangerous: if you don't cast to char than the bytes default to the (long, long) version of the increment method which is really bad countedBases++; } @@ -360,49 +374,27 @@ public class CovariateCounterWalker extends LocusWalker { */ private void outputToCSV( final PrintStream recalTableStream ) { - if( validateOldRecalibrator ) { - final boolean collapsePos = false; - final boolean collapseDinuc = false; - recalTableStream.printf("# collapsed_pos %b%n", collapsePos); - recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc); - recalTableStream.printf("# counted_sites %d%n", countedSites); - recalTableStream.printf("# counted_bases %d%n", countedBases); - recalTableStream.printf("# skipped_sites %d%n", skippedSites); - recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp"); - for(Pair, RecalDatum> entry : dataManager.data.entrySetSorted4() ) { - // For each Covariate in the key - for( Comparable comp : entry.getFirst() ) { - // Output the Covariate's value - recalTableStream.print( comp + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( entry.getSecond().outputToCSV() ); - } - } else { - if( !NO_PRINT_HEADER ) { - recalTableStream.printf("# Counted Sites %d%n", countedSites); - recalTableStream.printf("# Counted Bases %d%n", countedBases); - recalTableStream.printf("# Skipped Sites %d%n", skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - for( Covariate cov : requestedCovariates ) { - // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name - recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); - } - } - // For each entry in the data hashmap - for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { - // For each Covariate in the key - for( Comparable comp : entry.getKey() ) { - // Output the Covariate's value - if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG - recalTableStream.print( comp + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( entry.getValue().outputToCSV() ); + if( !NO_PRINT_HEADER ) { + recalTableStream.printf("# Counted Sites %d%n", countedSites); + recalTableStream.printf("# Counted Bases %d%n", countedBases); + recalTableStream.printf("# Skipped Sites %d%n", skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + for( Covariate cov : requestedCovariates ) { + // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name + recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); } } + // For each entry in the data hashmap + for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { + // For each Covariate in the key + for( Comparable comp : entry.getKey() ) { + // Output the Covariate's value + if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG + recalTableStream.print( comp + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( entry.getValue().outputToCSV() ); + } } - } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java index 1bc050172..e1965434b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java @@ -55,7 +55,7 @@ public class CycleCovariate implements Covariate { } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { if( platform.equalsIgnoreCase( "SLX" ) ) { int cycle = offset; if( read.getReadNegativeStrandFlag() ) { @@ -64,7 +64,7 @@ public class CycleCovariate implements Covariate { return cycle; } else if( platform.equalsIgnoreCase( "454" ) ) { int cycle = 0; - char prevBase = bases[0]; + byte prevBase = bases[0]; for( int iii = 1; iii <= offset; iii++ ) { if(bases[iii] != prevBase) { // this base doesn't match the previous one so it is a new cycle cycle++; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java new file mode 100755 index 000000000..7b07a0ec2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Dinuc.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 16, 2009 + */ +public class Dinuc implements Comparable{ + private byte first; + private byte second; + + public Dinuc() { + first = 0; + second = 0; + } + + public Dinuc(final byte _first, final byte _second) { + first = _first; + second = _second; + } + + public final void setValues(final byte _first, final byte _second) { + first = _first; + second = _second; + } + + public int compareTo(final Dinuc that) { + if( this.first > that.first ) { return 1; } + else if( this.first < that.first ) { return -1; } + else { //this.first equals that.first + if( this.second > that.second ) { return 1; } + else if( this.second < that.second ) { return -1; } + else { return 0; } + } + + } + + public static int hashBytes(final byte byte1, final byte byte2) { + return byte1 << 16 + byte2 << 4; + } + + public String toString() { // This method call is how the Dinuc will get written out to the table recalibration file + byte[] byteArray = {first,second}; + return new String(byteArray); + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 34623c9ce..4a7045563 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -2,6 +2,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; + /* * Copyright (c) 2009 The Broad Institute * @@ -39,28 +43,37 @@ import net.sf.samtools.SAMRecord; public class DinucCovariate implements Covariate { - private String returnString; - + HashMap dinucHashMap; + public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker + + final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' }; + dinucHashMap = new HashMap(); + for(byte byte1 : BASES) { + for(byte byte2: BASES) { + dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); + } + } } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { - char base = bases[offset]; - char prevBase = bases[offset - 1]; - // If it is a negative strand read then we need to reverse the direction for our previous base + byte base = bases[offset]; + byte prevBase = bases[offset - 1]; + // If this is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { prevBase = bases[offset + 1]; - } - final char[] charArray = {prevBase, base}; - returnString = String.valueOf(charArray); - return returnString; + } + //char[] charArray = {(char)prevBase, (char)base}; + //return new String( charArray ); // This is an expensive call + return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) ); //return String.format("%c%c", prevBase, base); // This return statement is too slow } public final Comparable getValue(final String str) { - return str; + //return str; + return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) ); } public final int estimatedNumberOfBins() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java index 340330bdb..287c23667 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java @@ -41,7 +41,7 @@ public class MappingQualityCovariate implements Covariate { } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { return read.getMappingQuality(); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java index 704fc0d61..616682c4a 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java @@ -49,7 +49,7 @@ public class MinimumNQSCovariate implements Covariate { } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { // Loop over the list of base quality scores in the window and find the minimum int minQual = quals[offset]; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java index 0ddf07ace..e44e1f1bf 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java @@ -1,8 +1,5 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; -import org.broadinstitute.sting.utils.Pair; -import org.broadinstitute.sting.utils.StingException; - import java.util.*; /* @@ -36,84 +33,18 @@ import java.util.*; * Date: Oct 30, 2009 * * A HashMap that maps a list of comparables to any object . - * There is functionality for the mappings to be given back to you in sorted order. */ public class NHashMap extends HashMap, T> { private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse - private ArrayList> keyLists; public NHashMap() { super(); - keyLists = null; } public NHashMap( int initialCapacity, float loadingFactor ) { super( initialCapacity, loadingFactor ); - keyLists = null; - } - - - // This method is here only to help facilitate direct comparison of old and refactored recalibrator. - // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. - public T myPut(List key, T value) { - - if( keyLists == null ) { - keyLists = new ArrayList>(); - for( Comparable comp : key ) { - keyLists.add( new ArrayList() ); - } - } - - ArrayList thisList; - for( int iii = 0; iii < key.size(); iii++ ) { - thisList = keyLists.get( iii ); - if( thisList == null ) { - thisList = new ArrayList(); - } - if( !thisList.contains( key.get( iii ) ) ) { - thisList.add( key.get(iii ) ); - } - } - return super.put( key, value ); - } - - // This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator. - // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. - @SuppressWarnings(value = "unchecked") - public ArrayList, T>> entrySetSorted4() { - - ArrayList, T>> theSet = new ArrayList, T>>(); - - for( ArrayList list : keyLists ) { - Collections.sort(list); - } - - if( keyLists.size() != 4 ) { - throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()"); - } - - ArrayList newKey = null; - for( Comparable c0 : keyLists.get(0) ) { - for( Comparable c1 : keyLists.get(1) ) { - for( Comparable c2 : keyLists.get(2) ) { - for( Comparable c3 : keyLists.get(3) ) { - newKey = new ArrayList(); - newKey.add(c0); - newKey.add(c1); - newKey.add(c2); - newKey.add(c3); - T value = this.get( newKey ); - if( value!= null ) { - theSet.add(new Pair,T>( newKey, value ) ); - } - } - } - } - } - - return theSet; } public static List makeList(T... args) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java index da5ac7345..058286537 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/PrimerRoundCovariate.java @@ -52,7 +52,7 @@ public class PrimerRoundCovariate implements Covariate { } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { if( platform.equalsIgnoreCase( "SLX" ) ) { return 1; // nothing to do here because it is always the same } else if( platform.equalsIgnoreCase( "454" ) ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java index 9159656fc..02531c16d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java @@ -41,7 +41,7 @@ public class QualityScoreCovariate implements Covariate { } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { return (int)quals[offset]; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java index 7dbfe4e6d..4824b7078 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java @@ -41,7 +41,7 @@ public class ReadGroupCovariate implements Covariate{ } public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, - final byte[] quals, final char[] bases, final char refBase) { + final byte[] quals, final byte[] bases) { return readGroup; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index 05d5abfde..5d03c2972 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -46,7 +46,8 @@ public class RecalDataManager { private boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; // table used to calculate the overall aggregate quality score in which everything except read group is collapsed - public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag in a BAM file that holds the original quality scores + public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // the tag that holds the original quality scores + public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // the tag that holds the color space quality scores for SOLID bams RecalDataManager() { data = new NHashMap(); @@ -54,8 +55,8 @@ public class RecalDataManager { } RecalDataManager( int estimatedCapacity ) { - data = new NHashMap( estimatedCapacity, 0.95f ); // second arg is the 'loading factor', - // a number to monkey around with when optimizing performace of the HashMap + data = new NHashMap( estimatedCapacity, 0.8f ); // second arg is the 'loading factor', + // a number to monkey around with when optimizing performace of the HashMap collapsedTablesCreated = false; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index 9f519cb2e..c0cbd8c69 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -4,6 +4,8 @@ import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.*; @@ -54,19 +56,22 @@ import java.io.FileNotFoundException; */ @WalkerName("TableRecalibrationRefactored") +@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the + // GATK to not actually spend time giving me the refBase array since I don't use it public class TableRecalibrationWalker extends ReadWalker { @Argument(fullName="recal_file", shortName="recalFile", doc="Input recalibration table file generated by CountCovariates", required=true) - public String RECAL_FILE; - @Argument(fullName="outputBam", shortName="outputBam", doc="output BAM file", required=false) - public SAMFileWriter OUTPUT_BAM = null; - @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) - public int PRESERVE_QSCORES_LESS_THAN = 5; + private String RECAL_FILE = null; + @Argument(fullName="output_bam", shortName="outputBam", doc="output BAM file", required=false) + private SAMFileWriter OUTPUT_BAM = null; + @Argument(fullName="preserve_qscores_less_than", shortName="pQ", + doc="If provided, bases with quality scores less than this threshold won't be recalibrated. In general its unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false) + private int PRESERVE_QSCORES_LESS_THAN = 5; @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) private boolean USE_ORIGINAL_QUALS = false; @Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false) private String PLATFORM = "SLX"; - @Argument(fullName = "windowSizeNQS", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="How big of a window should the MinimumNQSCovariate use for its calculation", required=false) private int WINDOW_SIZE = 3; @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points") public int SMOOTHING = 1; @@ -77,7 +82,7 @@ public class TableRecalibrationWalker extends ReadWalker(); @@ -136,7 +143,7 @@ public class TableRecalibrationWalker extends ReadWalker 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception + dataManager = new RecalDataManager( estimatedCapacity ); } addCSVData(line); // parse the line and add the data to the HashMap @@ -211,9 +218,12 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); for( Covariate covariate : requestedCovariates ) { - key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases, myRefBases.charAt(iii) ) ); // offset is zero based so passing iii is correct here // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator + key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases ) ); // offset is zero based so passing iii is correct here } if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { @@ -265,18 +271,24 @@ public class TableRecalibrationWalker extends ReadWalker newKey; - + + // The global quality shift (over the read group only) newKey = new ArrayList(); newKey.add( key.get(0) ); // read group RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey ); @@ -304,7 +317,7 @@ public class TableRecalibrationWalker extends ReadWalker(); newKey.add( key.get(0) ); // read group newKey.add( key.get(1) ); // quality score @@ -314,7 +327,7 @@ public class TableRecalibrationWalker extends ReadWalker