diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index 9f1b99046..a9a63c400 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -38,7 +38,8 @@ import net.sf.samtools.SAMRecord; public interface Covariate { public static final String COVARIATE_ERROR = "COVARIATE_ERROR"; public static final String COVARIATE_NULL = "COVARITATE_NULL"; - public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases + + public Comparable getValue(SAMRecord read, int offset, String readGroup, byte[] quals, char[] bases, char refBase); //used to pick out the value from attributes of the read public Comparable getValue(String str); // used to get value from input file public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index a4c50e15d..eaa0dcbef 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -64,24 +64,29 @@ import net.sf.samtools.SAMRecord; public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false) - protected Boolean LIST_ONLY = false; + private Boolean LIST_ONLY = false; @Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are already added for you.", required=false) - protected String[] COVARIATES = null; + private String[] COVARIATES = null; @Argument(fullName="min_mapping_quality", shortName="minmap", required=false, doc="Only use reads with at least this mapping quality score") - public int MIN_MAPPING_QUALITY = 1; + private int MIN_MAPPING_QUALITY = 1; @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) - public boolean USE_ORIGINAL_QUALS = false; + private boolean USE_ORIGINAL_QUALS = false; // BUGBUG need to pass this value to the proper covariates + @Argument(fullName = "platform", shortName="pl", doc="Which sequencing technology was used? This is important for the cycle covariate. Options are SLX, 454, and SOLID.", required=false) + private String PLATFORM = "SLX"; @Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file") - public String RECAL_FILE = "output.recal_data.csv"; - @Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker") - public boolean validateOldRecalibrator = false; + private String RECAL_FILE = "output.recal_data.csv"; + @Argument(fullName="noPrintHeader", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file") + private boolean NO_PRINT_HEADER = false; + @Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="Depricated, no longer replicates previous behavior exactly due to clean up of structure of code. If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker") + private boolean validateOldRecalibrator = false; - protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - protected static ArrayList requestedCovariates; // A list to hold the covariate objects that were requested - - private long countedSites = 0; // used for reporting at the end - private long countedBases = 0; // used for reporting at the end - private long skippedSites = 0; // used for reporting at the end + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested + //private HashMap readGroupHashMap; // A hash map that hashes the read object itself into the read group name + // This is done for optimization purposes because pulling the read group out of the SAMRecord is expensive + private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file + private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file + private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file //--------------------------------------------------------------------------------------------------------------- // @@ -96,8 +101,8 @@ public class CovariateCounterWalker extends LocusWalker { public void initialize() { // Get a list of all available covariates - List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); - int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate + final List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); + //int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate // Print and exit if that's what was requested if ( LIST_ONLY ) { @@ -117,7 +122,7 @@ public class CovariateCounterWalker extends LocusWalker { requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new DinucCovariate() ); - estimatedCapacity = 300 * 200 * 40 * 16; + //estimatedCapacity = 300 * 200 * 40 * 16; } else if( COVARIATES != null ) { int covNumber = 1; for( String requestedCovariateString : COVARIATES ) { @@ -135,7 +140,7 @@ public class CovariateCounterWalker extends LocusWalker { // Now that we've found a matching class, try to instantiate it Covariate covariate = (Covariate)covClass.newInstance(); requestedCovariates.add( covariate ); - estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity + //estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity } catch ( InstantiationException e ) { throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) ); } catch ( IllegalAccessException e ) { @@ -154,13 +159,15 @@ public class CovariateCounterWalker extends LocusWalker { requestedCovariates.add( new QualityScoreCovariate() ); requestedCovariates.add( new CycleCovariate() ); requestedCovariates.add( new DinucCovariate() ); - estimatedCapacity = 300 * 40 * 200 * 16; + //estimatedCapacity = 300 * 40 * 200 * 16; } logger.info( "The covariates being used here: " ); logger.info( requestedCovariates ); - dataManager = new RecalDataManager( estimatedCapacity ); + //dataManager = new RecalDataManager( estimatedCapacity ); + dataManager = new RecalDataManager(); + //readGroupHashMap = new HashMap( 50000000, 0.97f ); } @@ -180,29 +187,60 @@ public class CovariateCounterWalker extends LocusWalker { */ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); + final rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); // Only use data from non-dbsnp sites // Assume every mismatch at a non-dbsnp site is indicitive of poor quality if( dbsnp == null ) { - List reads = context.getReads(); - List offsets = context.getOffsets(); + final List reads = context.getReads(); + final List offsets = context.getOffsets(); SAMRecord read; int offset; - + String readGroup; + byte[] quals; + char[] bases; + char refBase; + char prevBase; + + final int numReads = reads.size(); // For each read at this locus - for( int iii = 0; iii < reads.size(); iii++ ) { + for( int iii = 0; iii < numReads; iii++ ) { read = reads.get(iii); - offset = offsets.get(iii); - - // Only use data from reads with mapping quality above specified quality value and base quality greater than zero - byte[] quals = read.getBaseQualities(); - if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0 ) - { - // Skip first and last bases because they don't have a dinuc count - if( offset > 0 && offset < (read.getReadLength() - 1) ) { - updateDataFromRead(read, offset, ref); - } + + // Only use data from reads with mapping quality above specified quality value + if( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + + //readGroup = readGroupHashMap.get( read ); + //if( readGroup == null ) { // read is not in the hashmap so add it + // readGroup = read.getReadGroup().getReadGroupId(); + // readGroupHashMap.put( read, readGroup ); + //} + + offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct + + // skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases + if( offset > 0 && offset < read.getReadLength() - 1) { + + quals = read.getBaseQualities(); + // skip if base quality is zero + if( quals[offset] > 0 ) { + bases = read.getReadString().toCharArray(); + refBase = ref.getBase(); + prevBase = bases[offset-1]; + // Get the complement base strand if we are a negative strand read + if( read.getReadNegativeStrandFlag() ) { + bases = BaseUtils.simpleComplement( read.getReadString() ).toCharArray(); + refBase = BaseUtils.simpleComplement( refBase ); + prevBase = bases[offset+1]; + } + + // skip if this base or the previous one was an 'N' or etc. + if( BaseUtils.isRegularBase(prevBase) && BaseUtils.isRegularBase(bases[offset]) ) { + readGroup = read.getReadGroup().getReadGroupId(); + updateDataFromRead( read, offset, readGroup, quals, bases, refBase ); + } + } + } } } @@ -220,61 +258,41 @@ public class CovariateCounterWalker extends LocusWalker { * Loop through the list of requested covariates and pick out the value from the read, offset, and reference * Using the list of covariate values as a key, pick out the RecalDatum and increment, * adding one to the number of observations and potentially one to the number of mismatches + * Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls + * because pulling things out of the SAMRecord is an expensive operation. * @param read The read * @param offset The offset in the read for this locus - * @param ref The reference context + * @param readGroup The read group the read is in + * @param quals List of base quality scores + * @param bases The bases which make up the read + * @param refBase The reference base at this locus */ - private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) { + private void updateDataFromRead(final SAMRecord read, final int offset, final String readGroup, final byte[] quals, final char[] bases, final char refBase) { List key = new ArrayList(); - Comparable keyElement; - boolean badKey = false; - + // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { - keyElement = covariate.getValue( read, offset, ref.getBases() ); - if( keyElement != Covariate.COVARIATE_ERROR && keyElement != Covariate.COVARIATE_NULL) { - key.add( keyElement ); - } else { - badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N' - } + key.add( covariate.getValue( read, offset, readGroup, quals, bases, refBase ) ); } - - // Using the list of covariate values as a key, pick out the RecalDatum - RecalDatum datum = null; - if( !badKey ) { - datum = dataManager.data.get( key ); - if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it - datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - if( validateOldRecalibrator ) { - dataManager.data.myPut( key, datum ); - } else { - dataManager.data.put( key, datum ); - } - } - } - - // Need the bases to determine whether or not we have a mismatch - byte[] bases = read.getReadBases(); - - char base = (char)bases[offset]; - char refBase = ref.getBase(); - // Get the complement base strand if we are a negative direction read - if ( read.getReadNegativeStrandFlag() ) { - refBase = BaseUtils.simpleComplement( refBase ); - base = BaseUtils.simpleComplement( base ); - } - - if( datum != null ) { - // Add one to the number of observations and potentially one to the number of mismatches - datum.increment( base, refBase ); - countedBases++; - } else { + + // Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap + RecalDatum datum = dataManager.data.get( key ); + if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it + datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method if( validateOldRecalibrator ) { - countedBases++; // This line here to replicate behavior in the old recalibrator - // (reads with bad covariates [prev_base = 'N', for example] were properly tossed out but this count was still incremented) + dataManager.data.myPut( key, datum ); + } else { + dataManager.data.put( key, datum ); } } + + // Need the bases to determine whether or not we have a mismatch + char base = bases[offset]; + + // Add one to the number of observations and potentially one to the number of mismatches + datum.increment( base, refBase ); + countedBases++; } @@ -322,11 +340,11 @@ public class CovariateCounterWalker extends LocusWalker { * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format * @param recalTableStream The PrintStream to write out to */ - private void outputToCSV( PrintStream recalTableStream ) { + private void outputToCSV( final PrintStream recalTableStream ) { if( validateOldRecalibrator ) { - boolean collapsePos = false; - boolean collapseDinuc = false; + final boolean collapsePos = false; + final boolean collapseDinuc = false; recalTableStream.printf("# collapsed_pos %b%n", collapsePos); recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc); recalTableStream.printf("# counted_sites %d%n", countedSites); @@ -344,13 +362,15 @@ public class CovariateCounterWalker extends LocusWalker { recalTableStream.println( entry.getSecond().outputToCSV() ); } } else { - recalTableStream.printf("# Counted Sites %d%n", countedSites); - recalTableStream.printf("# Counted Bases %d%n", countedBases); - recalTableStream.printf("# Skipped Sites %d%n", skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - for( Covariate cov : requestedCovariates ) { - // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name - recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); + if( !NO_PRINT_HEADER ) { + recalTableStream.printf("# Counted Sites %d%n", countedSites); + recalTableStream.printf("# Counted Bases %d%n", countedBases); + recalTableStream.printf("# Skipped Sites %d%n", skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + for( Covariate cov : requestedCovariates ) { + // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name + recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); + } } // For each entry in the data hashmap for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java index 3d85555d5..492bbd557 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +import org.broadinstitute.sting.utils.StingException; + import net.sf.samtools.SAMRecord; /* @@ -43,30 +45,39 @@ import net.sf.samtools.SAMRecord; public class CycleCovariate implements Covariate { - public String platform; + private String platform; public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker - platform = null; + platform = "SLX"; } - public CycleCovariate(String _platform) { - platform = _platform; + public CycleCovariate(final String _platform) { + platform = _platform; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - //BUGBUG: assumes Solexa platform - Integer cycle = offset; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - (offset + 1); + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + if( platform.equalsIgnoreCase( "SLX" ) ) { + int cycle = offset; + if( read.getReadNegativeStrandFlag() ) { + cycle = bases.length - (offset + 1); + } + return cycle; + } else if( platform.equalsIgnoreCase( "454" ) ) { + return 0; //BUGBUG: not yet implemented + } else if( platform.equalsIgnoreCase( "SOLID" ) ) { + return 0; //BUGBUG: not yet implemented + } else { + throw new StingException( "Requested platform (" + platform + ") not supported in CycleCovariate." ); } - return cycle; + } - public Comparable getValue(String str) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return (int)Integer.parseInt( str ); } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 100; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 3d6909ddc..f5a0d4ea1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; /* * Copyright (c) 2009 The Broad Institute @@ -40,35 +41,31 @@ import org.broadinstitute.sting.utils.BaseUtils; public class DinucCovariate implements Covariate { + private String returnString; + public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - byte[] bases = read.getReadBases(); - char base = (char)bases[offset]; - char prevBase; // value set below - // If it is a negative strand read then we need to get the complement base and reverse the direction for our previous base + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + + char base = bases[offset]; + char prevBase = bases[offset - 1]; + // If it is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { - base = BaseUtils.simpleComplement(base); - if ( offset + 1 > bases.length - 1 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read - prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] ); - } else { - if ( offset - 1 < 0 ) { return Covariate.COVARIATE_ERROR; } // no prevBase because at the beginning of the read - prevBase = (char)bases[offset - 1]; - } - // Check if bad base, probably an 'N' - if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) { - return Covariate.COVARIATE_NULL; // CovariateCounterWalker will recognize that null means skip this particular location in the read - } else { - return String.format("%c%c", prevBase, base); - } + prevBase = bases[offset + 1]; + } + final char[] charArray = {prevBase, base}; + returnString = String.valueOf(charArray); + return returnString; + //return String.format("%c%c", prevBase, base); // This return statement is too slow } - public Comparable getValue(String str) { + public final Comparable getValue(final String str) { return str; } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 16; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java index 47ed8d7d8..8d71dd831 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java @@ -40,15 +40,17 @@ public class MappingQualityCovariate implements Covariate { public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + + return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code } - public Comparable getValue(String str) { + public final Comparable getValue(final String str) { return Integer.parseInt( str ); } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 100; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java index ec9a20aca..f34cecfbc 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java @@ -41,26 +41,28 @@ import org.broadinstitute.sting.utils.QualityUtils; public class MinimumNQSCovariate implements Covariate { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; - protected boolean USE_ORIGINAL_QUALS; + private boolean USE_ORIGINAL_QUALS; public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker USE_ORIGINAL_QUALS = false; } - public MinimumNQSCovariate(boolean originalQuals) { + public MinimumNQSCovariate(final boolean originalQuals) { USE_ORIGINAL_QUALS = originalQuals; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - byte[] quals = read.getBaseQualities(); - if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + + //BUGBUG: Original qualities + //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); + // if ( obj instanceof String ) + // quals = QualityUtils.fastqToPhred((String)obj); + // else { + // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + // } + //} // Loop over the list of qualities and find the minimum Integer minQual = (int)(quals[0]); @@ -72,11 +74,11 @@ public class MinimumNQSCovariate implements Covariate { return minQual; } - public Comparable getValue(String str) { + public final Comparable getValue(final String str) { return Integer.parseInt( str ); } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 40; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java index f46dc03c4..95d7f4ef1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java @@ -38,7 +38,7 @@ import java.util.*; public class NHashMap extends HashMap, T> { - private static final long serialVersionUID = 1L; //BUGBUG: what should I do here? + private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse private ArrayList> keyLists; public NHashMap() { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java index b26c1253e..9e0f4fcfe 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java @@ -45,29 +45,32 @@ public class QualityScoreCovariate implements Covariate { USE_ORIGINAL_QUALS = false; } - public QualityScoreCovariate(boolean originalQuals) { + public QualityScoreCovariate(final boolean originalQuals) { USE_ORIGINAL_QUALS = originalQuals; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - byte[] quals = read.getBaseQualities(); - if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - - return ((Integer)((int)quals[offset])); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + + //BUGBUG: original qualities + //if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + // Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); + // if ( obj instanceof String ) + // quals = QualityUtils.fastqToPhred((String)obj); + // else { + // throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); + // } + //} + + //Integer returnQual = (int)quals[offset]; + return (int)quals[offset]; // returning an Integer Object (as opposed to primitive int) is required so that return values from the two getValue methods hash to the same code } - public Comparable getValue(String str) { - return Integer.parseInt( str ); + public final Comparable getValue(final String str) { + return (int)Integer.parseInt( str ); } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 40; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java index 7fc8d3c3a..7dbfe4e6d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java @@ -40,15 +40,16 @@ public class ReadGroupCovariate implements Covariate{ public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - return read.getReadGroup().getReadGroupId(); + public final Comparable getValue(final SAMRecord read, final int offset, final String readGroup, + final byte[] quals, final char[] bases, final char refBase) { + return readGroup; } - public Comparable getValue(String str) { + public final Comparable getValue(final String str) { return str; } - public int estimatedNumberOfBins() { + public final int estimatedNumberOfBins() { return 300; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index adb8c1eaf..ca1ed97b5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -38,12 +38,17 @@ import java.util.*; public class RecalDataManager { public NHashMap data; // the full dataset - public NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed - public NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed - public ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed - public boolean collapsedTablesCreated; + private NHashMap dataCollapsedReadGroup; // table where everything except read group has been collapsed + private NHashMap dataCollapsedQualityScore; // table where everything except read group and quality score has been collapsed + private ArrayList> dataCollapsedByCovariate; // tables where everything except read group, quality score, and given covariate has been collapsed + private boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; + RecalDataManager() { + data = new NHashMap(); + collapsedTablesCreated = false; + } + RecalDataManager( int estimatedCapacity ) { data = new NHashMap( estimatedCapacity, 0.95f ); // second arg is the 'loading factor', // a number to monkey around with when optimizing performace of the HashMap @@ -51,7 +56,7 @@ public class RecalDataManager { } // BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker - public void createCollapsedTables( int numCovariates ) { + public final void createCollapsedTables( final int numCovariates ) { dataCollapsedReadGroup = new NHashMap(); dataCollapsedQualityScore = new NHashMap(); dataCollapsedByCovariate = new ArrayList>(); @@ -122,7 +127,7 @@ public class RecalDataManager { collapsedTablesCreated = true; } - public NHashMap getCollapsedTable( int covariate ) { + public final NHashMap getCollapsedTable( final int covariate ) { if( !collapsedTablesCreated ) { throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound."); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java index 87a5aaa8b..b9e6d866c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java @@ -38,8 +38,8 @@ import java.util.*; public class RecalDatum { - long numObservations; // number of bases seen in total - long numMismatches; // number of bases seen that didn't match the reference + private long numObservations; // number of bases seen in total + private long numMismatches; // number of bases seen that didn't match the reference //--------------------------------------------------------------------------------------------------------------- @@ -52,12 +52,12 @@ public class RecalDatum { numMismatches = 0L; } - public RecalDatum( long _numObservations, long _numMismatches ) { + public RecalDatum( final long _numObservations, final long _numMismatches ) { numObservations = _numObservations; numMismatches = _numMismatches; } - public RecalDatum( RecalDatum copy ) { + public RecalDatum( final RecalDatum copy ) { this.numObservations = copy.numObservations; this.numMismatches = copy.numMismatches; } @@ -69,22 +69,22 @@ public class RecalDatum { //--------------------------------------------------------------------------------------------------------------- - public void increment( long incObservations, long incMismatches ) { + public final void increment( final long incObservations, final long incMismatches ) { numObservations += incObservations; numMismatches += incMismatches; } - public void increment( RecalDatum other ) { + public final void increment( final RecalDatum other ) { increment( other.numObservations, other.numMismatches ); } - public void increment( List data ) { + public final void increment( final List data ) { for ( RecalDatum other : data ) { this.increment( other ); } } - public void increment( char curBase, char ref ) { + public final void increment( final char curBase, final char ref ) { increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches } @@ -94,22 +94,22 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public double empiricalQualDouble( int smoothing ) { + public final double empiricalQualDouble( final int smoothing ) { double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleObservations = (double) ( numObservations + smoothing ); double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); if (empiricalQual > QualityUtils.MAX_REASONABLE_Q_SCORE) empiricalQual = QualityUtils.MAX_REASONABLE_Q_SCORE; return empiricalQual; } - public double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero + public final double empiricalQualDouble() { return empiricalQualDouble( 0 ); } // 'default' behavior is to use smoothing value of zero - public byte empiricalQualByte( int smoothing ) { + public final byte empiricalQualByte( final int smoothing ) { double doubleMismatches = (double) ( numMismatches + smoothing ); double doubleObservations = (double) ( numObservations + smoothing ); return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); } - public byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero + public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero //--------------------------------------------------------------------------------------------------------------- // @@ -117,14 +117,14 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public String outputToCSV( ) { + public final String outputToCSV( ) { return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() ); } - public String outputToCSV( int smoothing ) { + public final String outputToCSV( final int smoothing ) { return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte( smoothing ) ); } - public Long getNumObservations() { + public final Long getNumObservations() { return numObservations; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index 1b215a2be..932819a0d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -67,15 +67,15 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; @@ -165,7 +165,7 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); - boolean badKey = false; - Comparable keyElement; for( Covariate covariate : requestedCovariates ) { - keyElement = covariate.getValue( read, iii, refBases ); - if( keyElement != Covariate.COVARIATE_ERROR ) { // COVARIATE_NULL is okay because the sequential calculation will do the best it can with the valid data it has - key.add( keyElement ); // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator - } else { - badKey = true; - } + key.add( covariate.getValue( read, iii, readGroup, originalQuals, bases, myRefBases.charAt(iii) ) ); // offset is zero based so passing iii is correct here // technically COVARIATE_ERROR is fine too, but this won't match the behavior of the old recalibrator } - if( !badKey ) { - if( MODE == RecalibrationMode.COMBINATORIAL ) { - RecalDatum datum = dataManager.data.get( key ); - if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing - recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); - } - } else if( MODE == RecalibrationMode.SEQUENTIAL ) { - recalQuals[iii] = performSequentialQualityCalculation( key ); - } else { - throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); + if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { + RecalDatum datum = dataManager.data.get( key ); + if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); } + } else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { + recalQuals[iii] = performSequentialQualityCalculation( key ); + } else { + throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING ); + } - // Do some error checking on the new quality score - if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); - } + // Do some error checking on the new quality score + if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { + throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); } } @@ -254,6 +259,7 @@ public class TableRecalibrationWalker extends ReadWalker