diff --git a/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java b/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java new file mode 100755 index 000000000..bf9139aa8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/filters/NoOriginalQualityScoresFilter.java @@ -0,0 +1,15 @@ +package org.broadinstitute.sting.gatk.filters; + +import net.sf.picard.filter.SamRecordFilter; +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 19, 2009 + */ +public class NoOriginalQualityScoresFilter implements SamRecordFilter { + public boolean filterOut(SAMRecord rec) { + return (rec.getAttribute("OQ") == null); + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 4633f100b..1d7255669 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -514,6 +514,9 @@ class SerialRecalMapping implements RecalMapping { double newQual = qual + globalDeltaQ + deltaQual + deltaQPos + deltaQDinuc; newQualByte = QualityUtils.boundQual((int)Math.round(newQual), QualityUtils.MAX_REASONABLE_Q_SCORE); + //System.out.println(String.format("%s %c%c %d %d => %d + %.2f + %.2f + %.2f + %.2f = %d", + // readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQual, deltaQPos, deltaQDinuc, newQualByte)); + if ( newQualByte <= 0 && newQualByte >= QualityUtils.MAX_REASONABLE_Q_SCORE ) throw new RuntimeException(String.format("Illegal base quality score calculated: %s %c%c %d %d => %d + %.2f + %.2f + %.2f = %d", readGroup, prevBase, base, cycle, qual, qual, globalDeltaQ, deltaQPos, deltaQDinuc, newQualByte)); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java index 9536f780e..2d0130e8b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/CovariateCounterWalker.java @@ -64,7 +64,7 @@ import net.sf.samtools.SAMReadGroupRecord; @By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file @WalkerName( "CountCovariatesRefactored" ) -@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality +//@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality // BUGBUG taken out to match old integration tests @Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta public class CovariateCounterWalker extends LocusWalker { @@ -80,6 +80,11 @@ public class CovariateCounterWalker extends LocusWalker { private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. For debugging purposes only.") private boolean NO_PRINT_HEADER = false; + @Argument(fullName="validate_old_recalibrator", shortName="validateOldRecalibrator", required=false, doc="Match the output of the old recalibrator exactly. For debugging purposes only.") + private boolean VALIDATE_OLD_RECALIBRATOR = false; + @Argument(fullName="use_slx_platform", shortName="useSLXPlatform", required=false, doc="Force the platform to be Illumina regardless of what it actually says. For debugging purposes only.") + private boolean USE_SLX_PLATFORM = false; + private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps private ArrayList requestedCovariates; // A list to hold the covariate objects that were requested @@ -130,7 +135,12 @@ public class CovariateCounterWalker extends LocusWalker { // Initialize the requested covariates by parsing the -cov argument requestedCovariates = new ArrayList(); int estimatedCapacity = 1; // capacity is multiplicitive so this starts at one - if( COVARIATES != null ) { + if( VALIDATE_OLD_RECALIBRATOR ) { + requestedCovariates.add( new ReadGroupCovariate() ); + requestedCovariates.add( new CycleCovariate() ); // unfortunately order is different here in order to match the old recalibrator exactly + requestedCovariates.add( new QualityScoreCovariate() ); + requestedCovariates.add( new DinucCovariate() ); + } else if( COVARIATES != null ) { if(COVARIATES[0].equalsIgnoreCase("ALL")) { // the user wants ALL covariates to be used requestedCovariates.add( new ReadGroupCovariate() ); // first add the required covariates then add the rest by looping over all implementing classes that were found requestedCovariates.add( new QualityScoreCovariate() ); @@ -251,51 +261,61 @@ public class CovariateCounterWalker extends LocusWalker { // readGroupId = read.getReadGroup().getReadGroupId(); // readGroupHashMap.put( read, readGroupId ); //} + + if( read.getMappingQuality() > 0 ) { // BUGBUG: turn this into a read filter after passing the old integration tests - offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct + offset = offsets.get(iii); // offset is zero based so quals[offset] and bases[offset] is correct - // skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases - if( offset > 0 && offset < read.getReadLength() - 1 ) { + // skip first and last base because there is no dinuc, this is mainly done for speed so we don't have to check cases + if( offset > 0 && offset < read.getReadLength() - 1 ) { - quals = read.getBaseQualities(); - // Check if we need to use the original quality scores instead - if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { - Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) - quals = QualityUtils.fastqToPhred((String)obj); - else { - throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); - } - } - - // skip if base quality is zero - if( quals[offset] > 0 ) { - bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A' - refBase = (byte)ref.getBase(); - prevBase = bases[offset-1]; - - // Get the complement base strand if we are a negative strand read - if( read.getReadNegativeStrandFlag() ) { - bases = BaseUtils.simpleComplement( bases ); // this is an expensive call - refBase = (byte)BaseUtils.simpleComplement( ref.getBase() ); - prevBase = bases[offset+1]; - } - - // skip if this base or the previous one was an 'N' or etc. - if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) { - - final SAMReadGroupRecord readGroup = read.getReadGroup(); - readGroupId = readGroup.getReadGroupId(); // this is an expensive call - platform = readGroup.getPlatform(); // this is an expensive call - - // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them - colorSpaceQuals = null; - if( platform.equalsIgnoreCase("SOLID") ) { - colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + quals = read.getBaseQualities(); + // Check if we need to use the original quality scores instead + if ( USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); + if ( obj instanceof String ) + quals = QualityUtils.fastqToPhred((String)obj); + else { + throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); } - if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet - { - updateDataFromRead( read, offset, readGroupId, platform, quals, bases, refBase ); + } + + // skip if base quality is zero + if( quals[offset] > 0 ) { + bases = read.getReadBases(); // BUGBUG: DinucCovariate is relying on this method returning the same byte for bases 'a' and 'A' + refBase = (byte)ref.getBase(); + prevBase = bases[offset-1]; + + // Get the complement base strand if we are a negative strand read + if( read.getReadNegativeStrandFlag() ) { + bases = BaseUtils.simpleComplement( bases ); // this is an expensive call + refBase = (byte)BaseUtils.simpleComplement( ref.getBase() ); + prevBase = bases[offset+1]; + } + + // skip if this base or the previous one was an 'N' or etc. + if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)bases[offset] ) ) { + + final SAMReadGroupRecord readGroup = read.getReadGroup(); + readGroupId = readGroup.getReadGroupId(); + platform = readGroup.getPlatform(); + if( USE_SLX_PLATFORM ) { + platform = "ILLUMINA"; + } + + // SOLID bams insert the reference base into the read if the color space quality is zero, so skip over them + colorSpaceQuals = null; + if( platform.equalsIgnoreCase("SOLID") ) { + colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + } + if( colorSpaceQuals == null || colorSpaceQuals[offset] > 0 ) //BUGBUG: This isn't exactly correct yet + { + updateDataFromRead( read, offset, readGroupId, platform, quals, bases, refBase ); + } + } else { + if( VALIDATE_OLD_RECALIBRATOR ) { + countedBases++; // replicating a small bug in the old recalibrator + } } } } @@ -320,6 +340,7 @@ public class CovariateCounterWalker extends LocusWalker { * @param read The read * @param offset The offset in the read for this locus * @param readGroup The read group the read is in + * @param platform The String that has the platform this read came from: Illumina, 454, or solid * @param quals List of base quality scores * @param bases The bases which make up the read * @param refBase The reference base at this locus @@ -338,7 +359,11 @@ public class CovariateCounterWalker extends LocusWalker { RecalDatum datum = dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - dataManager.data.put( key, datum ); + if( VALIDATE_OLD_RECALIBRATOR ) { + dataManager.data.myPut( key, datum ); + } else { + dataManager.data.put( key, datum ); + } } // Need the bases to determine whether or not we have a mismatch @@ -396,26 +421,46 @@ public class CovariateCounterWalker extends LocusWalker { */ private void outputToCSV( final PrintStream recalTableStream ) { - if( !NO_PRINT_HEADER ) { - recalTableStream.printf("# Counted Sites %d%n", countedSites); - recalTableStream.printf("# Counted Bases %d%n", countedBases); - recalTableStream.printf("# Skipped Sites %d%n", skippedSites); - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - for( Covariate cov : requestedCovariates ) { - // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name - recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); + if( VALIDATE_OLD_RECALIBRATOR ) { + recalTableStream.printf("# collapsed_pos false%n"); + recalTableStream.printf("# collapsed_dinuc false%n"); + recalTableStream.printf("# counted_sites %d%n", countedSites); + recalTableStream.printf("# counted_bases %d%n", countedBases); + recalTableStream.printf("# skipped_sites %d%n", skippedSites); + recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n"); + // For each entry in the data hashmap + for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { + // For each Covariate in the key + for( Comparable comp : entry.first ) { + // Output the Covariate's value + recalTableStream.print( comp + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( entry.second.outputToCSV() ); } - } - // For each entry in the data hashmap - for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { - // For each Covariate in the key - for( Comparable comp : entry.getKey() ) { - // Output the Covariate's value - if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG - recalTableStream.print( comp + "," ); + } else { + if( !NO_PRINT_HEADER ) { + recalTableStream.printf("# Counted Sites %d%n", countedSites); + recalTableStream.printf("# Counted Bases %d%n", countedBases); + recalTableStream.printf("# Skipped Sites %d%n", skippedSites); + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + for( Covariate cov : requestedCovariates ) { + // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name + recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); + } + } + // For each entry in the data hashmap + for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { + // For each Covariate in the key + for( Comparable comp : entry.getKey() ) { + // Output the Covariate's value + if( NO_PRINT_HEADER && comp instanceof String ) { continue; } // BUGBUG + recalTableStream.print( comp + "," ); + } + // Output the RecalDatum entry + recalTableStream.println( entry.getValue().outputToCSV() ); } - // Output the RecalDatum entry - recalTableStream.println( entry.getValue().outputToCSV() ); } } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java index 2d6e34abc..6ccf2a553 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/DinucCovariate.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; import net.sf.samtools.SAMRecord; -import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; /* @@ -60,10 +58,12 @@ public class DinucCovariate implements Covariate { final byte[] quals, final byte[] bases) { byte base = bases[offset]; - byte prevBase = bases[offset - 1]; + byte prevBase; // If this is a negative strand read then we need to reverse the direction for our previous base if( read.getReadNegativeStrandFlag() ) { prevBase = bases[offset + 1]; + } else { + prevBase = bases[offset - 1]; } //char[] charArray = {(char)prevBase, (char)base}; //return new String( charArray ); // This is an expensive call diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java index e44e1f1bf..2016c95ca 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/NHashMap.java @@ -1,5 +1,8 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; + import java.util.*; /* @@ -33,18 +36,84 @@ import java.util.*; * Date: Oct 30, 2009 * * A HashMap that maps a list of comparables to any object . + * There is functionality for the mappings to be given back to you in sorted order. */ public class NHashMap extends HashMap, T> { private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse + private ArrayList> keyLists; public NHashMap() { super(); + keyLists = null; } public NHashMap( int initialCapacity, float loadingFactor ) { super( initialCapacity, loadingFactor ); + keyLists = null; + } + + + // This method is here only to help facilitate direct comparison of old and refactored recalibrator. + // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. + public T myPut(List key, T value) { + + if( keyLists == null ) { + keyLists = new ArrayList>(); + for( Comparable comp : key ) { + keyLists.add( new ArrayList() ); + } + } + + ArrayList thisList; + for( int iii = 0; iii < key.size(); iii++ ) { + thisList = keyLists.get( iii ); + if( thisList == null ) { + thisList = new ArrayList(); + } + if( !thisList.contains( key.get( iii ) ) ) { + thisList.add( key.get(iii ) ); + } + } + return super.put( key, value ); + } + + // This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator. + // The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to. + @SuppressWarnings(value = "unchecked") + public ArrayList, T>> entrySetSorted4() { + + ArrayList, T>> theSet = new ArrayList, T>>(); + + for( ArrayList list : keyLists ) { + Collections.sort(list); + } + + if( keyLists.size() != 4 ) { + throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()"); + } + + ArrayList newKey = null; + for( Comparable c0 : keyLists.get(0) ) { + for( Comparable c1 : keyLists.get(1) ) { + for( Comparable c2 : keyLists.get(2) ) { + for( Comparable c3 : keyLists.get(3) ) { + newKey = new ArrayList(); + newKey.add(c0); + newKey.add(c1); + newKey.add(c2); + newKey.add(c3); + T value = this.get( newKey ); + if( value!= null ) { + theSet.add(new Pair,T>( newKey, value ) ); + } + } + } + } + } + + return theSet; } public static List makeList(T... args) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java index 3d6bb4ea1..3e8ce3966 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/Recalibration/TableRecalibrationWalker.java @@ -74,6 +74,10 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates; private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + private static Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*"); @@ -133,7 +138,7 @@ public class TableRecalibrationWalker extends ReadWalker 300 * 40 * 200 * 16) { estimatedCapacity = 300 * 40 * 200 * 16; } // Don't want to crash with out of heap space exception final boolean createCollapsedTables = true; @@ -208,7 +219,17 @@ public class TableRecalibrationWalker extends ReadWalker key = new ArrayList(); + // Get the covariate values which make up the key for( Covariate covariate : requestedCovariates ) { key.add( covariate.getValue( read, iii, readGroup, platform, originalQuals, bases ) ); // offset is zero based so passing iii is correct here } recalQuals[iii] = performSequentialQualityCalculation( key ); - - //if( MODE_STRING.equalsIgnoreCase("COMBINATORIAL") ) { // BUGBUG: This isn't supported. No need to keep the full data hashmap around so it was removed for major speed up - // //RecalDatum datum = dataManager.data.get( key ); - // //if( datum != null ) { // if we have data for this combination of covariates then recalibrate the quality score otherwise do nothing - // // recalQuals[iii] = datum.empiricalQualByte( SMOOTHING ); - // //} - // throw new StingException("The Combinatorial mode isn't supported."); - //} else if( MODE_STRING.equalsIgnoreCase("SEQUENTIAL") ) { - // - //} else { - // throw new StingException( "Specified RecalibrationMode is not supported: " + MODE_STRING ); - //} } preserveQScores( originalQuals, recalQuals ); // overwrite the work done if original quality score is too low @@ -340,11 +354,17 @@ public class TableRecalibrationWalker extends ReadWalker paramsFiles = new HashMap(); + + @Test + public void testCountCovariatesR() { + HashMap e = new HashMap(); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "7be0b7a624d22187e712131f12aae546" ); + //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" ); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "dc1a1b6e99f4a47525cc1dce7b6eb1dc" ); + + for ( Map.Entry entry : e.entrySet() ) { + String bam = entry.getKey(); + String md5 = entry.getValue(); + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R /broad/1KG/reference/human_b36_both.fasta" + + " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + + " -T CountCovariatesRefactored" + + " -I " + bam + + " -L 1:10,000,000-11,000,000" + + " --validate_old_recalibrator" + + " --use_slx_platform" + + " -recalFile %s", + 1, // just one output file + Arrays.asList(md5)); + List result = executeTest("testCountCovariatesR", spec).getFirst(); + paramsFiles.put(bam, result.get(0).getAbsolutePath()); + } + } + + @Test + public void testTableRecalibratorR() { + HashMap e = new HashMap(); + e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.SLX.SRP000031.2009_06.chr1.10_20mb.bam", "3ace4b9b8495429cc32e7008145f4996" ); + //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "" ); + //e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "" ); + + for ( Map.Entry entry : e.entrySet() ) { + String bam = entry.getKey(); + String md5 = entry.getValue(); + String paramsFile = paramsFiles.get(bam); + System.out.printf("PARAMS FOR %s is %s%n", bam, paramsFile); + if ( paramsFile != null ) { + WalkerTestSpec spec = new WalkerTestSpec( + "-R /broad/1KG/reference/human_b36_both.fasta" + + " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod" + + " -T TableRecalibrationRefactored" + + " -I " + bam + + " -L 1:10,000,000-20,000,000" + + " --validate_old_recalibrator" + + " --use_slx_platform" + + " -outputBam %s" + + " -recalFile " + paramsFile, + 1, // just one output file + Arrays.asList(md5)); + executeTest("testTableRecalibratorR", spec); + } + } + } +}