diff --git a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java index 88e0d7494..5036e3e81 100755 --- a/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java +++ b/java/src/org/broadinstitute/sting/analyzecovariates/AnalysisDataManager.java @@ -56,7 +56,7 @@ public class AnalysisDataManager { // Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) { - if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle etc. + if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle and etc. newKey = new ArrayList(); newKey.add( key.get(0) ); // Make a new key with the read group ... newKey.add( key.get(iii + 1) ); // and the given covariate diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index f5c5bf426..6869e6c93 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -87,9 +87,7 @@ public class CovariateCounterWalker extends LocusWalker { ///////////////////////////// // Debugging-only Arguments ///////////////////////////// - @Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. FOR DEBUGGING PURPOSES ONLY.") - private boolean NO_PRINT_HEADER = false; - @Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") + @Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The output table recalibration csv file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") private boolean SORTED_OUTPUT = false; ///////////////////////////// @@ -100,8 +98,10 @@ public class CovariateCounterWalker extends LocusWalker { private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file + private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base + private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site - private static final String versionString = "v2.0.12"; // Major version, minor version, and build number + private static final String versionString = "v2.1.0"; // Major version, minor version, and build number private Pair dbSNP_counts = new Pair(0L, 0L); // mismatch/base counts for dbSNP loci private Pair novel_counts = new Pair(0L, 0L); // mismatch/base counts for non-dbSNP loci private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878) @@ -124,6 +124,9 @@ public class CovariateCounterWalker extends LocusWalker { if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; } if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; } DBSNP_VALIDATION_CHECK_FREQUENCY *= PROCESS_EVERY_NTH_LOCUS; + if( !RAC.checkSolidRecalMode() ) { + throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS"); + } // Get a list of all available covariates final List> classes = PackageUtils.getClassesImplementingInterface( Covariate.class ); @@ -155,13 +158,7 @@ public class CovariateCounterWalker extends LocusWalker { // BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed. requestedCovariates = new ArrayList(); int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one - if( RAC.VALIDATE_OLD_RECALIBRATOR ) { - requestedCovariates.add( new ReadGroupCovariate() ); - requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly - requestedCovariates.add( new QualityScoreCovariate() ); - requestedCovariates.add( new DinucCovariate() ); - estimatedCapacity = 60 * 100 * 40 * 16; - } else if( COVARIATES != null ) { + if( COVARIATES != null ) { if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found requestedCovariates.add( new QualityScoreCovariate() ); @@ -275,44 +272,46 @@ public class CovariateCounterWalker extends LocusWalker { read = p.getRead(); offset = p.getOffset(); - RecalDataManager.parseSAMRecord(read, RAC); + RecalDataManager.parseSAMRecord( read, RAC ); + RecalDataManager.parseColorSpace( read ); // Skip first and last base because there is no dinuc // BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests. - if( offset > 0 ) { - if( offset < read.getReadLength() - 1 ) { - // Skip if base quality is zero - if( read.getBaseQualities()[offset] > 0 ) { + if( offset > 0 && offset < read.getReadLength() - 1) { + // Skip if base quality is zero + if( read.getBaseQualities()[offset] > 0 ) { - bases = read.getReadBases(); - refBase = (byte)ref.getBase(); - prevBase = bases[offset - 1]; + bases = read.getReadBases(); + refBase = (byte)ref.getBase(); + prevBase = bases[offset - 1]; - // DinucCovariate is responsible for getting the complement bases if needed - if( read.getReadNegativeStrandFlag() ) { - prevBase = bases[offset + 1]; - } + // DinucCovariate is responsible for getting the complement bases if needed + if( read.getReadNegativeStrandFlag() ) { + prevBase = bases[offset + 1]; + } - // Skip if this base or the previous one was an 'N' or etc. - // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. - if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { + // Skip if this base or the previous one was an 'N' or etc. + // BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read. + if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) { - // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base - // so decrease the quality of this base by forcing it to be a mismatch - if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { - if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) { - refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch - } - } + // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it + if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) { // This base finally passed all the checks for a good base, so add it to the big data hashmap updateDataFromRead( read, offset, refBase ); - } else { - if( RAC.VALIDATE_OLD_RECALIBRATOR ) { - countedBases++; // Replicating a small bug in the old recalibrator + + } else { // calculate SOLID reference insertion rate + if( ref.getBase() == (char)bases[offset] ) { + solidInsertedReferenceBases++; + } else { + otherColorSpaceInconsistency++; + } + if( RAC.SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") ) { + refBase = (byte)'X'; // This will always mismatch since we are already filtering out reads that don't pass BaseUtils.isRegularBase + updateDataFromRead( read, offset, refBase ); } } - } + } } } } @@ -352,6 +351,7 @@ public class CovariateCounterWalker extends LocusWalker { } counts.second++; } + } } @@ -397,7 +397,7 @@ public class CovariateCounterWalker extends LocusWalker { RecalDatum datum = dataManager.data.get( key ); if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method - if( RAC.VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) { + if( SORTED_OUTPUT ) { dataManager.data.sortedPut( key, datum ); } else { dataManager.data.put( key, datum ); @@ -461,59 +461,45 @@ public class CovariateCounterWalker extends LocusWalker { * @param recalTableStream The PrintStream to write out to */ private void outputToCSV( final PrintStream recalTableStream ) { + + recalTableStream.printf("# Counted Sites %d%n", countedSites); + recalTableStream.printf("# Counted Bases %d%n", countedBases); + recalTableStream.printf("# Skipped Sites %d%n", skippedSites); + if( PROCESS_EVERY_NTH_LOCUS == 1 ) { + recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); + } else { + recalTableStream.printf("# Percent Skipped %.4f%n", 100.0 * (double)skippedSites / ((double)countedSites+skippedSites)); + } + if( solidInsertedReferenceBases != 0 ) { + recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) countedBases / solidInsertedReferenceBases); + recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) countedBases / otherColorSpaceInconsistency); + } - //BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options. - if( RAC.VALIDATE_OLD_RECALIBRATOR ) { - // Output the old header as well as output the data in sorted order - recalTableStream.printf("# collapsed_pos false%n"); - recalTableStream.printf("# collapsed_dinuc false%n"); - recalTableStream.printf("# counted_sites %d%n", countedSites); - recalTableStream.printf("# counted_bases %d%n", countedBases); - recalTableStream.printf("# skipped_sites %d%n", skippedSites); - recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n"); - for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { + // Output header saying which covariates were used and in what order + for( Covariate cov : requestedCovariates ) { + recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); + } + recalTableStream.println("nObservations,nMismatches,Qempirical"); + + + if( SORTED_OUTPUT && requestedCovariates.size() == 4 ) + { + for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here for( Comparable comp : entry.first ) { recalTableStream.print( comp + "," ); } recalTableStream.println( entry.second.outputToCSV() ); } } else { - if( !NO_PRINT_HEADER ) { - recalTableStream.printf("# Counted Sites %d%n", countedSites); - recalTableStream.printf("# Counted Bases %d%n", countedBases); - recalTableStream.printf("# Skipped Sites %d%n", skippedSites); - if( PROCESS_EVERY_NTH_LOCUS == 1 ) { - recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites); - } else { - recalTableStream.printf("# Percent Skipped %.4f%n", 100.0 * (double)skippedSites / ((double)countedSites+skippedSites)); - } - for( Covariate cov : requestedCovariates ) { - // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name - recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," ); - } - recalTableStream.println("nObservations,nMismatches,Qempirical"); - } - - if( SORTED_OUTPUT && requestedCovariates.size() == 4 ) - { - for( Pair,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here - for( Comparable comp : entry.first ) { - recalTableStream.print( comp + "," ); - } - recalTableStream.println( entry.second.outputToCSV() ); - } - } else { - // For each entry in the data hashmap - for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { - // For each Covariate in the key - for( Comparable comp : entry.getKey() ) { - // Output the Covariate's value - recalTableStream.print( comp + "," ); - } - // Output the RecalDatum entry - recalTableStream.println( entry.getValue().outputToCSV() ); + // For each entry in the data hashmap + for( Map.Entry, RecalDatum> entry : dataManager.data.entrySet() ) { + // For each Covariate in the key + for( Comparable comp : entry.getKey() ) { + // Output the Covariate's value + recalTableStream.print( comp + "," ); } + // Output the RecalDatum entry + recalTableStream.println( entry.getValue().outputToCSV() ); } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java new file mode 100755 index 000000000..c2f063917 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/HomopolymerCovariate.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import net.sf.samtools.SAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Dec 4, 2009 + * + * The Homopolymer Run Covariate. This is the number of bases in the previous N that match the current base. + * For example, if N = 10: + * ATTGCCCCGTAAAAAAAAAA + * 00100123121123456789 + */ + +public class HomopolymerCovariate implements Covariate { + + int numBack = 10; + + // Initialize any member variables using the command-line arguments passed to the walkers + public void initialize( final RecalibrationArgumentCollection RAC ) { + numBack = RAC.HOMOPOLYMER_NBACK; + } + + // Used to pick out the covariate's value from attributes of the read + public final Comparable getValue( final SAMRecord read, final int offset ) { + + int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read + int startPos = 0; + int stopPos = 0; + byte[] bases = read.getReadBases(); + byte thisBase = bases[offset]; + if( !read.getReadNegativeStrandFlag() ) { // Forward direction + startPos = Math.max(offset - numBack, 0); + stopPos = offset; + } else { // Negative direction + startPos = offset + 1; + stopPos = Math.min(offset + numBack + 1, read.getReadLength()); + } + + for( int iii = startPos; iii < stopPos; iii++ ) { + if( bases[iii] == thisBase ) { numAgree++; } + } + + return numAgree; + } + + // Used to get the covariate's value from input csv file in TableRecalibrationWalker + public final Comparable getValue( final String str ) { + return Integer.parseInt( str ); + } + + // Used to estimate the amount space required for the full data HashMap + public final int estimatedNumberOfBins() { + return numBack + 1; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index f5bedae0e..86f652efb 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -57,7 +58,9 @@ public class RecalDataManager { public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which if this base is inconsistent with its color private static boolean warnUserNullReadGroup = false; + private static boolean warnUserNoColorSpace = false; RecalDataManager() { data = new NHashMap(); @@ -205,12 +208,12 @@ public class RecalDataManager { * @param read The read to adjust * @param RAC The list of shared command line arguments */ - public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC) { + public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { // Check if we need to use the original quality scores instead - if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { + if( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) { Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG); - if ( obj instanceof String ) + if( obj instanceof String ) read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) ); else { throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName())); @@ -245,13 +248,154 @@ public class RecalDataManager { } } + public static void parseColorSpace( final SAMRecord read ) { + // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base + if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) { + if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read + if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { + char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + if( read.getReadNegativeStrandFlag() ) { + readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + } + int[] inconsistency = new int[readBases.length]; + int iii; + byte prevBase = (byte) colorSpace[0]; // The sentinel + for( iii = 0; iii < readBases.length; iii++ ) { + byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] ); + inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); + prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] ); + //System.out.print((char)thisBase); + } + //System.out.println(); + read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency ); + //for( iii = 0; iii < readBases.length; iii++ ) { + // System.out.print((char)readBases[iii]); + //} + //System.out.println(); + //for( iii = 0; iii < readBases.length; iii++ ) { + // System.out.print(inconsistency[iii]); + //} + //System.out.println("\n"); + + } else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag + Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName()); + Utils.warnUser("This calculation is critically dependent on being able to know when reference bases were inserted into the SOLID read. Are you sure you want to proceed?"); + warnUserNoColorSpace = true; + } + } + } + } + + public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, String SOLID_RECAL_MODE, Random coinFlip ) { + + int[] inconsistency = null; + if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) { + char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray(); + // Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); + byte[] colorImpliedBases = readBases.clone(); + if( read.getReadNegativeStrandFlag() ) { + readBases = BaseUtils.simpleReverseComplement( read.getReadBases() ); + } + inconsistency = new int[readBases.length]; + int iii; + byte prevBase = (byte) colorSpace[0]; // The sentinel + for( iii = 0; iii < readBases.length; iii++ ) { + byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] ); + colorImpliedBases[iii] = thisBase; + inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 ); + prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] ); + //System.out.print((char)thisBase); + } + + + //System.out.println(); + //for( iii = 0; iii < readBases.length; iii++ ) { + // System.out.print((char)readBases[iii]); + //} + //System.out.println(); + //for( iii = 0; iii < readBases.length; iii++ ) { + // System.out.print(inconsistency[iii]); + //} + //System.out.println("\n"); + + if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { + for( iii = 0; iii < originalQualScores.length; iii++ ) { + if( inconsistency[iii] == 1 ) { + originalQualScores[iii] = (byte)0; + } + } + } else if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") ) { + byte[] tmpBases = read.getReadBases(); + for( iii = 0; iii < originalQualScores.length; iii++ ) { + if( inconsistency[iii] == 1 ) { + originalQualScores[iii] = (byte)0; + tmpBases[iii] = (byte)'N'; + } + } + read.setReadBases( tmpBases ); + + } else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { + if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) { // BUGBUG: Warn user if using this mode but there are no color space quality scores in the bam + byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG)); + byte[] tmpBases = read.getReadBases(); + if( read.getReadNegativeStrandFlag() ) { + colorImpliedBases = BaseUtils.simpleComplement(colorImpliedBases.clone()); + } + for( iii = 0; iii < inconsistency.length - 1; iii++ ) { + if( inconsistency[iii] == 1 ) { + if( colorSpaceQuals[iii] > colorSpaceQuals[iii+1] ) { // reference was inserted here, but there is more evidence for the color implied base + tmpBases[iii] = colorImpliedBases[iii]; + } else if( colorSpaceQuals[iii] == colorSpaceQuals[iii+1] ) { // equal evidence for the color implied base and the reference base, so flip a coin + int rand = coinFlip.nextInt( 2 ); + if( rand == 0 ) { // the color implied base won the coin flip + tmpBases[iii] = colorImpliedBases[iii]; + } + } + } + } + + read.setReadBases( tmpBases ); + } + } + + } else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag + Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName()); + Utils.warnUser("This calculation is critically dependent on being able to know when reference bases were inserted into the SOLID read. Are you sure you want to proceed?"); + warnUserNoColorSpace = true; + } + return originalQualScores; + } + + private static char getNextBaseFromColor( final char prevBase, final char color ) { + switch(color) { + case '0': // same base + return prevBase; + case '1': // transversion + return BaseUtils.transversion( prevBase ); + case '2': // transition + return BaseUtils.transition( prevBase ); + case '3': // simple complement + return BaseUtils.simpleComplement( prevBase ); + default: + throw new StingException( "Unrecognized color space in SOLID bam, color = " + color ); + } + } + /** * Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality - * @param base The nucleotide we are checking * @param read The read which contains the color space to check against + * @param offset The offset in the read at which to check * @return Returns true if the base is inconsistent with the color space */ - public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) { - return false; + public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) { + if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) { + int[] colorSpace = ((int[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG)); + return colorSpace[offset] != 0; + } else { + return false; + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index 764e1f6e9..1266441f0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -45,8 +45,6 @@ public class RecalibrationArgumentCollection { public String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false) public boolean USE_ORIGINAL_QUALS = false; - @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) - public int WINDOW_SIZE = 5; @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup; @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") @@ -55,14 +53,15 @@ public class RecalibrationArgumentCollection { public String FORCE_READ_GROUP = null; @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? EXPERIMENTAL OPTION. USE AT OWN RISK. Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS") + public String SOLID_RECAL_MODE = "DO_NOTHING"; + @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) + public int WINDOW_SIZE = 5; + @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) + public int HOMOPOLYMER_NBACK = 10; - ////////////////////////////////// - // Shared Debugging-only Arguments - ////////////////////////////////// - @Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.") - public boolean VALIDATE_OLD_RECALIBRATOR = false; - // BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't. - + public boolean checkSolidRecalMode() { + return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") || + SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ); + } } - - diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index bd4c8f50f..94ddce851 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.*; import java.util.ArrayList; import java.util.List; +import java.util.Random; import java.util.regex.Pattern; import java.io.File; import java.io.FileNotFoundException; @@ -93,8 +94,9 @@ public class TableRecalibrationWalker extends ReadWalker> classes = PackageUtils.getClassesImplementingInterface(Covariate.class); @@ -178,12 +184,6 @@ public class TableRecalibrationWalker extends ReadWalker @@ -418,7 +407,7 @@ public class TableRecalibrationWalker extends ReadWalker G or C <-> T) on the base, or the specified base if it can't be done (i.e. an ambiguous base). + * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + static public char transition(char base) { + switch (base) { + case 'A': + case 'a': return 'G'; + case 'C': + case 'c': return 'T'; + case 'G': + case 'g': return 'A'; + case 'T': + case 't': return 'C'; + default: return base; + } + } + + /** + * Perform a transversion (A <-> C or G <-> T) on the base, or the specified base if it can't be done (i.e. an ambiguous base). + * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + static public char transversion(char base) { + switch (base) { + case 'A': + case 'a': return 'C'; + case 'C': + case 'c': return 'A'; + case 'G': + case 'g': return 'T'; + case 'T': + case 't': return 'G'; + default: return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). + * + * @param base the base [AaCcGgTt] * @return the complementary base, or the input base if it's not one of the understood ones */ static public char simpleComplement(char base) { @@ -252,7 +292,7 @@ public class BaseUtils { /** * Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) * - * @param bases the byte array of bases + * @param bases the byte array of bases * @return the reverse complement of the base byte array */ static public byte[] simpleReverseComplement(byte[] bases) { diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index c04c0f533..81ee5dd8f 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -37,6 +37,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -cov CycleCovariate" + " -cov DinucCovariate" + " --sorted_output" + + " --solid_recal_mode DO_NOTHING" + " -recalFile %s", 1, // just one output file Arrays.asList(md5)); @@ -67,6 +68,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { ? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) + " -outputBam %s" + " --no_pg_tag" + + " --solid_recal_mode DO_NOTHING" + " -recalFile " + paramsFile, 1, // just one output file Arrays.asList(md5)); @@ -95,6 +97,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -cov CycleCovariate" + " -cov DinucCovariate" + " --sorted_output" + + " --solid_recal_mode DO_NOTHING" + " -recalFile %s", 1, // just one output file Arrays.asList(md5)); @@ -123,6 +126,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -cov DinucCovariate" + " --default_platform illumina" + " --sorted_output" + + " --solid_recal_mode DO_NOTHING" + " -recalFile %s", 1, // just one output file Arrays.asList(md5)); @@ -149,6 +153,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -L 1:10,100,000-10,300,000" + " -outputBam %s" + " --no_pg_tag" + + " --solid_recal_mode DO_NOTHING" + " --default_platform illumina" + " -recalFile " + paramsFile, 1, // just one output file