diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java index d48f389a9..e107aca84 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/Covariate.java @@ -2,12 +2,41 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Oct 30, 2009 + * + * The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read and corresponding reference bases */ + public interface Covariate { - public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read and etc - public Comparable getValue(String str); // used to get value from input file + public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read and reference bases + public Comparable getValue(String str); // used to get value from input file + public int estimatedNumberOfBins(); // used to estimate the amount space required in the hashmap } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index 36ad4b0b1..1ec584ebf 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -16,10 +16,47 @@ import java.util.*; import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 + * + * This walker is designed to work as the first pass in a two-pass processing step. + * It does a by-locus traversal operating only at sites that are not in dbSNP. + * We assume that all mismatches we see are therefore errors. + * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinuc) + * Since there is a large amount of data one can then calculate an empirical probability of error + * given the particular covariates seen at this site, where p(error) = num mismatches / num observations + * The output file is a CSV list of (several covariate values, num observations, num mismatches, empirical quality score) + * The first lines of the output file give the name of the covariate classes that were used for this calculation. + * + * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and are automatically added first. + * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker. */ @WalkerName("CountCovariatesRefactored") @@ -36,15 +73,18 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file") public String RECAL_FILE = "output.recal_data.csv"; - protected static RecalDataManager dataManager; - protected static ArrayList requestedCovariates; + protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps + protected static ArrayList requestedCovariates; // A list to hold the covariate objects that were requested + /** + * Parse the -cov arguments and create a list of covariates to be used here + * Based on the covariates' estimates for initial capacity allocate the data hashmap + */ public void initialize() { - dataManager = new RecalDataManager(); - // Get a list of all available covariates List> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); + int estimatedCapacity = 1; // start at one because capacity is multiplicative for each covariate // Print and exit if that's what was requested if ( LIST_ONLY ) { @@ -57,7 +97,7 @@ public class CovariateCounterWalker extends LocusWalker { System.exit( 0 ); // early exit here because user requested it } - // Initialize the requested covariates + // Initialize the requested covariates by parsing the -cov argument requestedCovariates = new ArrayList(); requestedCovariates.add( new ReadGroupCovariate() ); // Read Group Covariate is a required covariate for the recalibration calculation requestedCovariates.add( new QualityScoreCovariate() ); // Quality Score Covariate is a required covariate for the recalibration calculation @@ -67,11 +107,13 @@ public class CovariateCounterWalker extends LocusWalker { boolean foundClass = false; for( Class covClass : classes ) { - if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { + if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class foundClass = true; try { Covariate covariate = (Covariate)covClass.newInstance(); requestedCovariates.add( covariate ); + estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity + if (covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate) { throw new StingException( "ReadGroupCovariate and QualityScoreCovariate are required covariates and are therefore added for you. Please remove them from the -cov list" ); } @@ -92,27 +134,48 @@ public class CovariateCounterWalker extends LocusWalker { logger.info( "The covariates being used here: " ); logger.info( requestedCovariates ); + + dataManager = new RecalDataManager( estimatedCapacity ); } - + + + //--------------------------------------------------------------------------------------------------------------- + // + // map + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * For each read at this locus get the various covariate values and increment that location in the map based on + * whether or not the base matches the reference at this particular location + * @param tracker the reference metadata tracker + * @param ref the reference context + * @param context the alignment context + * @return returns 1, but this value isn't used in the reduce step + */ public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { rodDbSNP dbsnp = rodDbSNP.getFirstRealSNP(tracker.getTrackData("dbsnp", null)); + // Only use data from non-dbsnp sites // Assume every mismatch at a non-dbsnp site is indicitive of poor quality if( dbsnp == null ) { List reads = context.getReads(); List offsets = context.getOffsets(); - SAMRecord read; // preallocate for use in for loop below - int offset; // preallocate for use in for loop below + SAMRecord read; + int offset; + + // For each read at this locus for( int iii = 0; iii < reads.size(); iii++ ) { read = reads.get(iii); offset = offsets.get(iii); - // Only use data from reads with mapping quality above given quality value and base quality greater than zero + // Only use data from reads with mapping quality above specified quality value and base quality greater than zero byte[] quals = read.getBaseQualities(); if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0) - { - if( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they don't have a dinuc count + { + // Skip first and last bases because they don't have a dinuc count + if( offset > 0 && offset < (read.getReadLength() - 1) ) { updateDataFromRead(read, offset, ref); } } @@ -124,43 +187,69 @@ public class CovariateCounterWalker extends LocusWalker { return 1; } + /** + * Major workhorse routine for this walker. + * Loop through the list of requested covariates and pick out the value from the read, offset, and reference + * Using the list of covariate values as a key, pick out the RecalDatum and increment, + * adding one to the number of observations and potentially one to the number of mismatches + * @param read the read + * @param offset the offset in the read for this locus + * @param ref the reference context + */ private void updateDataFromRead(SAMRecord read, int offset, ReferenceContext ref) { - List> key = new ArrayList>(); - Comparable keyElement; // preallocate for use in for loop below + List key = new ArrayList(); + Comparable keyElement; boolean badKey = false; + + // Loop through the list of requested covariates and pick out the value from the read, offset, and reference for( Covariate covariate : requestedCovariates ) { keyElement = covariate.getValue( read, offset, ref.getBases() ); if( keyElement != null ) { key.add( keyElement ); } else { - badKey = true; + badKey = true; // covariate returned bad value, for example dinuc returns null because base = 'N' } } + // Using the list of covariate values as a key, pick out the RecalDatum RecalDatum datum = null; if( !badKey ) { datum = dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it - datum = new RecalDatum(); + datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method dataManager.data.put( key, datum ); } } + // Need the bases to determine whether or not we have a mismatch byte[] bases = read.getReadBases(); char base = (char)bases[offset]; char refBase = ref.getBase(); + // Get the complement base strand if we are a negative direction read if ( read.getReadNegativeStrandFlag() ) { refBase = BaseUtils.simpleComplement( refBase ); base = BaseUtils.simpleComplement( base ); } if( datum != null ) { + // Add one to the number of observations and potentially one to the number of mismatches datum.increment( base, refBase ); } } - + + + //--------------------------------------------------------------------------------------------------------------- + // + // reduce + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Initialize the reudce step by creating a PrintStream from the filename specified as an argument to the walker. + * @return returns a PrintStream created from the -rf filename + */ public PrintStream reduceInit() { try { return new PrintStream( RECAL_FILE ); @@ -169,10 +258,20 @@ public class CovariateCounterWalker extends LocusWalker { } } + /** + * The Reduce method doesn't do anything for this walker. + * @param value result of the map. + * @param recalTableStream the PrintStream + * @return returns the PrintStream used to output the CSV data + */ public PrintStream reduce( Integer value, PrintStream recalTableStream ) { return recalTableStream; // nothing to do here } + /** + * Write out the full data hashmap to disk in CSV format + * @param recalTableStream the PrintStream to write out to + */ public void onTraversalDone( PrintStream recalTableStream ) { out.print( "Writing raw recalibration data..." ); for( Covariate cov : requestedCovariates ) { @@ -184,16 +283,19 @@ public class CovariateCounterWalker extends LocusWalker { recalTableStream.close(); } + /** + * For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format + * @param recalTableStream the PrintStream to write out to + */ private void outputToCSV( PrintStream recalTableStream ) { - for( Map.Entry>, RecalDatum> entry : dataManager.data.entrySet() ) { - //for( int iii = 0; iii < entry.getKey().size(); iii++ ) { - //index = Integer.parseInt( (entry.getKey().get( iii )).toString()); - //recalTableStream.print( requestedCovariates.get( iii ).getValue( index ) + "," ); - // recalTableStream.print( entry.getKey().get(iii) ) - //} - for( Comparable comp : entry.getKey() ) { + // For each entry in the data hashmap + for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { + // For each Covariate in the key + for( Comparable comp : entry.getKey() ) { + // Output the Covariate's value recalTableStream.print( comp + "," ); } + // Output the RecalDatum entry recalTableStream.println( entry.getValue().outputToCSV() ); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java index f6acbcdd3..e5d2d20a8 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CycleCovariate.java @@ -2,16 +2,42 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Oct 30, 2009 */ + public class CycleCovariate implements Covariate { public String platform; - public CycleCovariate() { // empty constructor is required by CovariateCounterWalker + public CycleCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker platform = null; } @@ -19,8 +45,8 @@ public class CycleCovariate implements Covariate { platform = _platform; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - //BUGBUG: assumes Solexia platform + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + //BUGBUG: assumes Solexa platform Integer cycle = offset; if( read.getReadNegativeStrandFlag() ) { cycle = read.getReadLength() - (offset + 1); @@ -28,10 +54,14 @@ public class CycleCovariate implements Covariate { return cycle; } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return Integer.parseInt( str ); } + public int estimatedNumberOfBins() { + return 100; + } + public String toString() { return "Cycle"; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 991a2cf35..2613a26a1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -5,16 +5,42 @@ import org.broadinstitute.sting.utils.BaseUtils; import java.util.*; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 */ + public class DinucCovariate implements Covariate { public static ArrayList BASES; - public DinucCovariate() { // empty constructor is required by CovariateCounterWalker + public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker BASES = new ArrayList(); BASES.add("A"); BASES.add("G"); @@ -26,7 +52,7 @@ public class DinucCovariate implements Covariate { BASES.add("t"); } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { byte[] bases = read.getReadBases(); char base = (char)bases[offset]; char prevBase = (char)bases[offset - 1]; @@ -36,16 +62,20 @@ public class DinucCovariate implements Covariate { } // Check if bad base, probably an 'N' if( !BASES.contains( String.format( "%c", prevBase ) ) || !BASES.contains( String.format( "%c", base) ) ) { - return null; // CovariateCounterWalker and TableRecalibrationWalker will recognize that null means skip this particular location in the read + return null; // CovariateCounterWalker will recognize that null means skip this particular location in the read } else { return String.format("%c%c", prevBase, base); } } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return str; } + public int estimatedNumberOfBins() { + return 16; + } + public String toString() { return "Dinucleotide"; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java index f258822a5..7b79fcb79 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MappingQualityCovariate.java @@ -2,23 +2,53 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 4, 2009 */ + public class MappingQualityCovariate implements Covariate { - public MappingQualityCovariate() { // empty constructor is required by CovariateCounterWalker + public MappingQualityCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { - return (Integer)(read.getMappingQuality()); + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + return (Integer)(read.getMappingQuality()); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return Integer.parseInt( str ); } + + public int estimatedNumberOfBins() { + return 100; + } public String toString() { return "Mapping Quality Score"; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java index f18454bce..963f2490d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/MinimumNQSCovariate.java @@ -3,17 +3,43 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.QualityUtils; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 4, 2009 */ + public class MinimumNQSCovariate implements Covariate { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; protected boolean USE_ORIGINAL_QUALS; - public MinimumNQSCovariate() { // empty constructor is required by CovariateCounterWalker + public MinimumNQSCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker USE_ORIGINAL_QUALS = false; } @@ -21,7 +47,7 @@ public class MinimumNQSCovariate implements Covariate { USE_ORIGINAL_QUALS = originalQuals; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { byte[] quals = read.getBaseQualities(); if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); @@ -41,10 +67,14 @@ public class MinimumNQSCovariate implements Covariate { return minQual; } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return Integer.parseInt( str ); } + public int estimatedNumberOfBins() { + return 40; + } + public String toString() { return "Minimum Neighborhood Quality Score"; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java index 6792ad349..8dd85cbde 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java @@ -2,18 +2,93 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import java.util.*; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Oct 30, 2009 */ -public class NHashMap extends HashMap>, T> { + +public class NHashMap extends HashMap, T> { private static final long serialVersionUID = 1L; //BUGBUG: what should I do here? + private ArrayList> keyLists; - + public NHashMap() { + super(); + keyLists = new ArrayList>(); + } - public static > List makeList(T... args) { + public NHashMap( int initialCapacity, float loadingFactor ) { + super( initialCapacity, loadingFactor ); + keyLists = new ArrayList>(); + } + + /* + + // This method is here only to help facilitate direct comparison of old and refactored recalibrator. + // The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to. + // Comment out this overriding method to improve performance + public T put(List key, T value) { + + ArrayList thisList; + for( int iii = 0; iii < key.size(); iii++ ) { + thisList = keyLists.get( iii ); + if( thisList == null ) { + thisList = new ArrayList(); + } + if( !thisList.contains( key.get( iii ) ) ) { + thisList.add( key.get(iii ) ); + } + } + return super.put( key, value ); + } + + // This method is here only to help facilitate direct comparison of old and refactored recalibrator. + // The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to. + // Comment out this overriding method to improve performance + public Set, T>> entrySet() { + + for( ArrayList list : keyLists ) { + Collections.sort(list); + } + + for( ArrayList list : keyLists ) { + for( Comparable comp : list ) { + // BUGBUG: unfinished + } + } + + return null; + } + + */ + + public static List makeList(T... args) { List list = new ArrayList(); for (T arg : args) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java index 2e1097e49..b2b7692d1 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/QualityScoreCovariate.java @@ -3,17 +3,43 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.QualityUtils; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 */ + public class QualityScoreCovariate implements Covariate { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; protected boolean USE_ORIGINAL_QUALS; - public QualityScoreCovariate() { // empty constructor is required by CovariateCounterWalker + public QualityScoreCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker USE_ORIGINAL_QUALS = false; } @@ -21,7 +47,7 @@ public class QualityScoreCovariate implements Covariate { USE_ORIGINAL_QUALS = originalQuals; } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { byte[] quals = read.getBaseQualities(); if ( USE_ORIGINAL_QUALS && read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { Object obj = read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG); @@ -32,12 +58,16 @@ public class QualityScoreCovariate implements Covariate { } } - return ((Integer)((int)quals[offset])); + return ((Integer)((int)quals[offset])); // cast to Object Integer is required so that return values from the two getValue methods hash to the same code } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return Integer.parseInt( str ); } + + public int estimatedNumberOfBins() { + return 40; + } public String toString() { return "Reported Quality Score"; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java index 261eb8aa8..beaf697c5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/ReadGroupCovariate.java @@ -2,24 +2,54 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Oct 30, 2009 */ + public class ReadGroupCovariate implements Covariate{ - public ReadGroupCovariate() { // empty constructor is required by CovariateCounterWalker + public ReadGroupCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker } - public Comparable getValue(SAMRecord read, int offset, char[] refBases) { + public Comparable getValue(SAMRecord read, int offset, char[] refBases) { return read.getReadGroup().getReadGroupId(); } - public Comparable getValue(String str) { + public Comparable getValue(String str) { return str; } + public int estimatedNumberOfBins() { + return 300; + } + public String toString() { return "Read Group"; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java index 7e257a148..63af6315a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDataManager.java @@ -5,6 +5,31 @@ import org.broadinstitute.sting.utils.QualityUtils; import java.util.*; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -19,8 +44,9 @@ public class RecalDataManager { public boolean collapsedTablesCreated; public NHashMap dataSumExpectedErrors; - RecalDataManager() { - data = new NHashMap(); + RecalDataManager( int estimatedCapacity ) { + data = new NHashMap( estimatedCapacity, 0.95f ); // second arg is the 'loading factor', + // a number to monkey around with when optimizing performace of the HashMap collapsedTablesCreated = false; } @@ -37,17 +63,17 @@ public class RecalDataManager { // preallocate for use in for loops below RecalDatum thisDatum; RecalDatum collapsedDatum; - List> key; - ArrayList> newKey; + List key; + ArrayList newKey; Double sumExpectedErrors; // for every data point in the map - for( Map.Entry>,RecalDatum> entry : data.entrySet() ) { + for( Map.Entry,RecalDatum> entry : data.entrySet() ) { thisDatum = entry.getValue(); key = entry.getKey(); // create dataCollapsedReadGroup, the table where everything except read group has been collapsed - newKey = new ArrayList>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // make a new key with just the read group collapsedDatum = dataCollapsedReadGroup.get( newKey ); if( collapsedDatum == null ) { @@ -57,7 +83,7 @@ public class RecalDataManager { collapsedDatum.increment( thisDatum ); } - newKey = new ArrayList>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // make a new key with just the read group sumExpectedErrors = dataSumExpectedErrors.get( newKey ); if( sumExpectedErrors == null ) { @@ -69,7 +95,7 @@ public class RecalDataManager { dataSumExpectedErrors.put( newKey, sumExpectedErrors ); } - newKey = new ArrayList>(); + newKey = new ArrayList(); // create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed newKey.add( key.get(0) ); // make a new key with the read group ... newKey.add( key.get(1) ); // and quality score @@ -83,7 +109,7 @@ public class RecalDataManager { // create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted - newKey = new ArrayList>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // make a new key with the read group ... newKey.add( key.get(1) ); // and quality score ... newKey.add( key.get(iii + 2) ); // and the given covariate diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java index 098ed5a2e..d67eede7a 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/RecalDatum.java @@ -5,6 +5,31 @@ import org.broadinstitute.sting.utils.QualityUtils; import java.util.*; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index 0876e9f57..60d884ff9 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -13,6 +13,31 @@ import java.util.regex.Pattern; import java.io.File; import java.io.FileNotFoundException; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -56,14 +81,14 @@ public class TableRecalibrationWalker extends ReadWalker(); // Read in the data from the csv file and populate the map - out.print( "Reading in the data from input file..." ); + logger.info( "Reading in the data from input file..." ); try { for ( String line : new xReadLines(new File( RECAL_FILE )) ) { @@ -80,6 +105,8 @@ public class TableRecalibrationWalker extends ReadWalker> key = new ArrayList>(); + ArrayList key = new ArrayList(); Covariate cov; // preallocated for use in for loop below int iii; for( iii = 0; iii < requestedCovariates.size(); iii++ ) { @@ -149,53 +178,44 @@ public class TableRecalibrationWalker extends ReadWalker keyElement; // preallocate for use in for loops below for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last base because there is no dinuc - ArrayList> key = new ArrayList>(); - boolean badKey = false; + List key = new ArrayList(); for( Covariate covariate : requestedCovariates ) { - keyElement = covariate.getValue( read, iii, refBases ); - if ( keyElement != null ) { - key.add( keyElement ); - } else { - badKey = true; - } + key.add( covariate.getValue( read, iii, refBases ) ); } - if( !badKey ) { - if( MODE == RecalibrationMode.COMBINATORIAL ) { - RecalDatum datum = dataManager.data.get( key ); - if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing - recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); - } - } else if( MODE == RecalibrationMode.SEQUENTIAL ) { - recalQuals[iii] = performSequentialQualityCalculation( key ); - } else { - throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); + if( MODE == RecalibrationMode.COMBINATORIAL ) { + RecalDatum datum = dataManager.data.get( key ); + if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing + recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); } + } else if( MODE == RecalibrationMode.SEQUENTIAL ) { + recalQuals[iii] = performSequentialQualityCalculation( key ); + } else { + throw new StingException( "Specified RecalibrationMode is not supported: " + MODE ); + } - // Do some error checking on the new quality score - if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { - throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); - } + // Do some error checking on the new quality score + if ( recalQuals[iii] <= 0 || recalQuals[iii] > QualityUtils.MAX_REASONABLE_Q_SCORE ) { + throw new StingException( "Assigning bad quality score " + key + " => " + recalQuals[iii] ); } } preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low read.setBaseQualities(recalQuals); // overwrite old qualities with new recalibrated qualities - if ( read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if there is room in the read + if ( read.getAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // save the old qualities if the tag isn't already taken in the read read.setAttribute(ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals)); } return read; } - private byte performSequentialQualityCalculation( ArrayList> key ) { + private byte performSequentialQualityCalculation( List key ) { byte qualFromRead = Byte.parseByte(key.get(1).toString()); - ArrayList> newKey; + ArrayList newKey; - newKey = new ArrayList>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // read group RecalDatum globalDeltaQDatum = dataManager.getCollapsedTable(0).get( newKey ); double globalDeltaQ = 0.0; @@ -206,7 +226,7 @@ public class TableRecalibrationWalker extends ReadWalker>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // read group newKey.add( key.get(1) ); // quality score RecalDatum deltaQReportedDatum = dataManager.getCollapsedTable(1).get( newKey ); @@ -219,7 +239,7 @@ public class TableRecalibrationWalker extends ReadWalker>(); + newKey = new ArrayList(); newKey.add( key.get(0) ); // read group newKey.add( key.get(1) ); // quality score newKey.add( key.get(iii) ); // given covariate @@ -236,6 +256,8 @@ public class TableRecalibrationWalker extends ReadWalker %d + %.2f + %.2f + %.2f = %d", qualFromRead, globalDeltaQ, deltaQReported, deltaQCovariates, newQualityByte ) ); } + + //System.out.println("returning: " + newQualityByte); return newQualityByte; }