Added --solid_recal_mode argument to experiment with different ways of dealing with solid reference bias. Currently the default option is DO_NOTHING which means use the same behavior as the old recalibrator. Eventually the new methods in RecalDataManager will be moved over to a SolidUtils class. Added transition and transversion methods to BaseUtils that work like simpleComplement, used with the color space in my solid methods. Also, initial check-in of HomopolymerCovariate.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2276 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2009-12-07 14:26:27 +00:00
parent 2632cb6b58
commit 1d5b9883db
8 changed files with 379 additions and 134 deletions

View File

@ -56,7 +56,7 @@ public class AnalysisDataManager {
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle etc.
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle and etc.
newKey = new ArrayList<Comparable>();
newKey.add( key.get(0) ); // Make a new key with the read group ...
newKey.add( key.get(iii + 1) ); // and the given covariate

View File

@ -87,9 +87,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
/////////////////////////////
// Debugging-only Arguments
/////////////////////////////
@Argument(fullName="no_print_header", shortName="noHeader", required=false, doc="Don't print the usual header on the table recalibration file. FOR DEBUGGING PURPOSES ONLY.")
private boolean NO_PRINT_HEADER = false;
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The outputted table recalibration file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
@Argument(fullName="sorted_output", shortName="sorted", required=false, doc="The output table recalibration csv file will be in sorted order at the cost of added overhead. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
private boolean SORTED_OUTPUT = false;
/////////////////////////////
@ -100,8 +98,10 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.0.12"; // Major version, minor version, and build number
private static final String versionString = "v2.1.0"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -124,6 +124,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
DBSNP_VALIDATION_CHECK_FREQUENCY *= PROCESS_EVERY_NTH_LOCUS;
if( !RAC.checkSolidRecalMode() ) {
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS");
}
// Get a list of all available covariates
final List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface( Covariate.class );
@ -155,13 +158,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
// BUGBUG: This is a mess because there are a lot of cases (validate, all, none, and supplied covList). Clean up needed.
requestedCovariates = new ArrayList<Covariate>();
int estimatedCapacity = 1; // Capacity is multiplicitive so this starts at one
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new CycleCovariate() ); // Unfortunately order is different here in order to match the old recalibrator exactly
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new DinucCovariate() );
estimatedCapacity = 60 * 100 * 40 * 16;
} else if( COVARIATES != null ) {
if( COVARIATES != null ) {
if(COVARIATES[0].equalsIgnoreCase( "ALL" )) { // The user wants ALL covariates to be used
requestedCovariates.add( new ReadGroupCovariate() ); // First add the required covariates then add the rest by looping over all implementing classes that were found
requestedCovariates.add( new QualityScoreCovariate() );
@ -275,44 +272,46 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
read = p.getRead();
offset = p.getOffset();
RecalDataManager.parseSAMRecord(read, RAC);
RecalDataManager.parseSAMRecord( read, RAC );
RecalDataManager.parseColorSpace( read );
// Skip first and last base because there is no dinuc
// BUGBUG: Technically we only have to skip the first base on forward reads and the last base on negative strand reads. Change after passing old integration tests.
if( offset > 0 ) {
if( offset < read.getReadLength() - 1 ) {
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
if( offset > 0 && offset < read.getReadLength() - 1) {
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
bases = read.getReadBases();
refBase = (byte)ref.getBase();
prevBase = bases[offset - 1];
bases = read.getReadBases();
refBase = (byte)ref.getBase();
prevBase = bases[offset - 1];
// DinucCovariate is responsible for getting the complement bases if needed
if( read.getReadNegativeStrandFlag() ) {
prevBase = bases[offset + 1];
}
// DinucCovariate is responsible for getting the complement bases if needed
if( read.getReadNegativeStrandFlag() ) {
prevBase = bases[offset + 1];
}
// Skip if this base or the previous one was an 'N' or etc.
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// Skip if this base or the previous one was an 'N' or etc.
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
if( BaseUtils.isRegularBase( (char)prevBase ) && BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base
// so decrease the quality of this base by forcing it to be a mismatch
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
if( RecalDataManager.isInconsistentColorSpace( p.getBase(), read ) ) {
refBase = (byte)'X'; // Because we are already filter out all bases that don't pass BaseUtils.isRegularBase this will always be marked as a mismatch
}
}
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase );
} else {
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
countedBases++; // Replicating a small bug in the old recalibrator
} else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) {
solidInsertedReferenceBases++;
} else {
otherColorSpaceInconsistency++;
}
if( RAC.SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") ) {
refBase = (byte)'X'; // This will always mismatch since we are already filtering out reads that don't pass BaseUtils.isRegularBase
updateDataFromRead( read, offset, refBase );
}
}
}
}
}
}
}
@ -352,6 +351,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
}
counts.second++;
}
}
}
@ -397,7 +397,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
RecalDatum datum = dataManager.data.get( key );
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
if( RAC.VALIDATE_OLD_RECALIBRATOR || SORTED_OUTPUT ) {
if( SORTED_OUTPUT ) {
dataManager.data.sortedPut( key, datum );
} else {
dataManager.data.put( key, datum );
@ -461,59 +461,45 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
* @param recalTableStream The PrintStream to write out to
*/
private void outputToCSV( final PrintStream recalTableStream ) {
recalTableStream.printf("# Counted Sites %d%n", countedSites);
recalTableStream.printf("# Counted Bases %d%n", countedBases);
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
if( PROCESS_EVERY_NTH_LOCUS == 1 ) {
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
} else {
recalTableStream.printf("# Percent Skipped %.4f%n", 100.0 * (double)skippedSites / ((double)countedSites+skippedSites));
}
if( solidInsertedReferenceBases != 0 ) {
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) countedBases / solidInsertedReferenceBases);
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) countedBases / otherColorSpaceInconsistency);
}
//BUGBUG: This method is a mess. It will be cleaned up when I get rid of the validation and no_header debug options.
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
// Output the old header as well as output the data in sorted order
recalTableStream.printf("# collapsed_pos false%n");
recalTableStream.printf("# collapsed_dinuc false%n");
recalTableStream.printf("# counted_sites %d%n", countedSites);
recalTableStream.printf("# counted_bases %d%n", countedBases);
recalTableStream.printf("# skipped_sites %d%n", skippedSites);
recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
recalTableStream.printf("rg,pos,Qrep,dn,nBases,nMismatches,Qemp%n");
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
// Output header saying which covariates were used and in what order
for( Covariate cov : requestedCovariates ) {
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
}
recalTableStream.println("nObservations,nMismatches,Qempirical");
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
{
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here
for( Comparable comp : entry.first ) {
recalTableStream.print( comp + "," );
}
recalTableStream.println( entry.second.outputToCSV() );
}
} else {
if( !NO_PRINT_HEADER ) {
recalTableStream.printf("# Counted Sites %d%n", countedSites);
recalTableStream.printf("# Counted Bases %d%n", countedBases);
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
if( PROCESS_EVERY_NTH_LOCUS == 1 ) {
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
} else {
recalTableStream.printf("# Percent Skipped %.4f%n", 100.0 * (double)skippedSites / ((double)countedSites+skippedSites));
}
for( Covariate cov : requestedCovariates ) {
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
}
recalTableStream.println("nObservations,nMismatches,Qempirical");
}
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
{
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here
for( Comparable comp : entry.first ) {
recalTableStream.print( comp + "," );
}
recalTableStream.println( entry.second.outputToCSV() );
}
} else {
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
// For each entry in the data hashmap
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
// For each Covariate in the key
for( Comparable comp : entry.getKey() ) {
// Output the Covariate's value
recalTableStream.print( comp + "," );
}
// Output the RecalDatum entry
recalTableStream.println( entry.getValue().outputToCSV() );
}
}
}

View File

@ -0,0 +1,82 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Dec 4, 2009
*
* The Homopolymer Run Covariate. This is the number of bases in the previous N that match the current base.
* For example, if N = 10:
* ATTGCCCCGTAAAAAAAAAA
* 00100123121123456789
*/
public class HomopolymerCovariate implements Covariate {
int numBack = 10;
// Initialize any member variables using the command-line arguments passed to the walkers
public void initialize( final RecalibrationArgumentCollection RAC ) {
numBack = RAC.HOMOPOLYMER_NBACK;
}
// Used to pick out the covariate's value from attributes of the read
public final Comparable getValue( final SAMRecord read, final int offset ) {
int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read
int startPos = 0;
int stopPos = 0;
byte[] bases = read.getReadBases();
byte thisBase = bases[offset];
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
startPos = Math.max(offset - numBack, 0);
stopPos = offset;
} else { // Negative direction
startPos = offset + 1;
stopPos = Math.min(offset + numBack + 1, read.getReadLength());
}
for( int iii = startPos; iii < stopPos; iii++ ) {
if( bases[iii] == thisBase ) { numAgree++; }
}
return numAgree;
}
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
public final Comparable getValue( final String str ) {
return Integer.parseInt( str );
}
// Used to estimate the amount space required for the full data HashMap
public final int estimatedNumberOfBins() {
return numBack + 1;
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.*;
@ -57,7 +58,9 @@ public class RecalDataManager {
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which if this base is inconsistent with its color
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNoColorSpace = false;
RecalDataManager() {
data = new NHashMap<RecalDatum>();
@ -205,12 +208,12 @@ public class RecalDataManager {
* @param read The read to adjust
* @param RAC The list of shared command line arguments
*/
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC) {
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
// Check if we need to use the original quality scores instead
if ( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
if( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
if ( obj instanceof String )
if( obj instanceof String )
read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) );
else {
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
@ -245,13 +248,154 @@ public class RecalDataManager {
}
}
public static void parseColorSpace( final SAMRecord read ) {
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read
if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) {
char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray();
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
if( read.getReadNegativeStrandFlag() ) {
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
}
int[] inconsistency = new int[readBases.length];
int iii;
byte prevBase = (byte) colorSpace[0]; // The sentinel
for( iii = 0; iii < readBases.length; iii++ ) {
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] );
//System.out.print((char)thisBase);
}
//System.out.println();
read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency );
//for( iii = 0; iii < readBases.length; iii++ ) {
// System.out.print((char)readBases[iii]);
//}
//System.out.println();
//for( iii = 0; iii < readBases.length; iii++ ) {
// System.out.print(inconsistency[iii]);
//}
//System.out.println("\n");
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName());
Utils.warnUser("This calculation is critically dependent on being able to know when reference bases were inserted into the SOLID read. Are you sure you want to proceed?");
warnUserNoColorSpace = true;
}
}
}
}
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, String SOLID_RECAL_MODE, Random coinFlip ) {
int[] inconsistency = null;
if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) {
char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray();
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
byte[] colorImpliedBases = readBases.clone();
if( read.getReadNegativeStrandFlag() ) {
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
}
inconsistency = new int[readBases.length];
int iii;
byte prevBase = (byte) colorSpace[0]; // The sentinel
for( iii = 0; iii < readBases.length; iii++ ) {
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
colorImpliedBases[iii] = thisBase;
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] );
//System.out.print((char)thisBase);
}
//System.out.println();
//for( iii = 0; iii < readBases.length; iii++ ) {
// System.out.print((char)readBases[iii]);
//}
//System.out.println();
//for( iii = 0; iii < readBases.length; iii++ ) {
// System.out.print(inconsistency[iii]);
//}
//System.out.println("\n");
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) {
for( iii = 0; iii < originalQualScores.length; iii++ ) {
if( inconsistency[iii] == 1 ) {
originalQualScores[iii] = (byte)0;
}
}
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") ) {
byte[] tmpBases = read.getReadBases();
for( iii = 0; iii < originalQualScores.length; iii++ ) {
if( inconsistency[iii] == 1 ) {
originalQualScores[iii] = (byte)0;
tmpBases[iii] = (byte)'N';
}
}
read.setReadBases( tmpBases );
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) {
if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) { // BUGBUG: Warn user if using this mode but there are no color space quality scores in the bam
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
byte[] tmpBases = read.getReadBases();
if( read.getReadNegativeStrandFlag() ) {
colorImpliedBases = BaseUtils.simpleComplement(colorImpliedBases.clone());
}
for( iii = 0; iii < inconsistency.length - 1; iii++ ) {
if( inconsistency[iii] == 1 ) {
if( colorSpaceQuals[iii] > colorSpaceQuals[iii+1] ) { // reference was inserted here, but there is more evidence for the color implied base
tmpBases[iii] = colorImpliedBases[iii];
} else if( colorSpaceQuals[iii] == colorSpaceQuals[iii+1] ) { // equal evidence for the color implied base and the reference base, so flip a coin
int rand = coinFlip.nextInt( 2 );
if( rand == 0 ) { // the color implied base won the coin flip
tmpBases[iii] = colorImpliedBases[iii];
}
}
}
}
read.setReadBases( tmpBases );
}
}
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName());
Utils.warnUser("This calculation is critically dependent on being able to know when reference bases were inserted into the SOLID read. Are you sure you want to proceed?");
warnUserNoColorSpace = true;
}
return originalQualScores;
}
private static char getNextBaseFromColor( final char prevBase, final char color ) {
switch(color) {
case '0': // same base
return prevBase;
case '1': // transversion
return BaseUtils.transversion( prevBase );
case '2': // transition
return BaseUtils.transition( prevBase );
case '3': // simple complement
return BaseUtils.simpleComplement( prevBase );
default:
throw new StingException( "Unrecognized color space in SOLID bam, color = " + color );
}
}
/**
* Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
* @param base The nucleotide we are checking
* @param read The read which contains the color space to check against
* @param offset The offset in the read at which to check
* @return Returns true if the base is inconsistent with the color space
*/
public static boolean isInconsistentColorSpace( final byte base, final SAMRecord read ) {
return false;
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) != null ) {
int[] colorSpace = ((int[])read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG));
return colorSpace[offset] != 0;
} else {
return false;
}
}
}

View File

@ -45,8 +45,6 @@ public class RecalibrationArgumentCollection {
public String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "use_original_quals", shortName="OQ", doc="If provided, we will use the quals from the original qualities OQ attribute field instead of the quals in the regular QUALS field", required=false)
public boolean USE_ORIGINAL_QUALS = false;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
public int WINDOW_SIZE = 5;
@Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
public String DEFAULT_READ_GROUP = ReadGroupCovariate.defaultReadGroup;
@Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
@ -55,14 +53,15 @@ public class RecalibrationArgumentCollection {
public String FORCE_READ_GROUP = null;
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? EXPERIMENTAL OPTION. USE AT OWN RISK. Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS")
public String SOLID_RECAL_MODE = "DO_NOTHING";
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
public int WINDOW_SIZE = 5;
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
public int HOMOPOLYMER_NBACK = 10;
//////////////////////////////////
// Shared Debugging-only Arguments
//////////////////////////////////
@Argument(fullName="validate_old_recalibrator", shortName="validate", required=false, doc="!!!Depricated!!! Match the output of the old recalibrator exactly. FOR DEBUGGING PURPOSES ONLY.")
public boolean VALIDATE_OLD_RECALIBRATOR = false;
// BUGBUG: This validate option no longer does exactly what it says because now using read filter to filter out zero mapping quality reads. The old recalibrator counted those reads in the counted_sites variable but the new one doesn't.
public boolean checkSolidRecalMode() {
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );
}
}

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.regex.Pattern;
import java.io.File;
import java.io.FileNotFoundException;
@ -93,8 +94,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.0.10"; // Major version, minor version, and build number
private static final String versionString = "v2.1.0"; // Major version, minor version, and build number
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
//---------------------------------------------------------------------------------------------------------------
//
@ -112,6 +114,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
logger.info( "TableRecalibrationWalker version: " + versionString );
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
coinFlip = new Random();
if( !RAC.checkSolidRecalMode() ) {
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS");
}
// Get a list of all available covariates
List<Class<? extends Covariate>> classes = PackageUtils.getClassesImplementingInterface(Covariate.class);
@ -178,12 +184,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
} else { // Found a line of data
if( !foundAllCovariates ) {
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
requestedCovariates.add( new ReadGroupCovariate() );
requestedCovariates.add( new QualityScoreCovariate() );
requestedCovariates.add( new CycleCovariate() );
requestedCovariates.add( new DinucCovariate() );
}
foundAllCovariates = true;
// At this point all the covariates should have been found and initialized
@ -265,17 +265,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
int iii;
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
cov = requestedCovariates.get( iii );
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
if( iii == 1 ) { // Order is different in the old recalibrator unfortunately
key.add( cov.getValue( vals[2] ) );
} else if ( iii == 2 ) {
key.add( cov.getValue( vals[1] ) );
} else {
key.add( cov.getValue( vals[iii] ) );
}
} else {
key.add( cov.getValue( vals[iii] ) );
}
key.add( cov.getValue( vals[iii] ) );
}
// Create a new datum using the number of observations, number of mismatches, and reported quality score
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
@ -301,10 +291,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
// This is done in order to speed up the code
RecalDataManager.parseSAMRecord(read, RAC);
RecalDataManager.parseSAMRecord( read, RAC );
byte[] originalQuals = read.getBaseQualities();
byte[] recalQuals = originalQuals.clone();
String platform = read.getReadGroup().getPlatform();
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip );
}
int startPos = 1;
int stopPos = read.getReadLength();
@ -328,7 +323,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities
if ( read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, QualityUtils.phredToFastq(originalQuals));
}
@ -377,17 +372,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
// The shift in quality due to each covariate by itself in turn
double deltaQCovariates = 0.0;
Double deltaQCovariateEmpirical;
double deltaQPos = 0.0;
double deltaQDinuc = 0.0;
for( int iii = 2; iii < key.size(); iii++ ) {
collapsedTableKey.add( key.get(iii) ); // The given covariate
deltaQCovariateEmpirical = dataManager.getCollapsedDoubleTable(iii).get( collapsedTableKey );
if( deltaQCovariateEmpirical != null ) {
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
if( RAC.VALIDATE_OLD_RECALIBRATOR ) {
if(iii==2) { deltaQPos = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); } // BUGBUG: Only here to validate against the old recalibrator
if(iii==3) { deltaQDinuc = deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); }
}
}
collapsedTableKey.remove( 2 ); // This new covariate is always added in at position 2 in the collapsedTableKey list
// The collapsedTableKey should be: < ReadGroup, Reported Quality Score, This Covariate >
@ -418,7 +407,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
private void preserveQScores( final byte[] originalQuals, byte[] recalQuals ) {
for( int iii = 0; iii < recalQuals.length; iii++ ) {
if ( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
recalQuals[iii] = originalQuals[iii];
}
}
@ -431,7 +420,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//---------------------------------------------------------------------------------------------------------------
/**
* Start the reduce with a new handle to the output bam file
* Start the reduce with a handle to the output bam file
* @return A FileWriter pointing to a new bam file
*/
public SAMFileWriter reduceInit() {

View File

@ -229,10 +229,50 @@ public class BaseUtils {
return base2;
}
/**
* Return the complement of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
*
* @param base the base [AaCcGgTt]
/**
* Perform a transition (A <-> G or C <-> T) on the base, or the specified base if it can't be done (i.e. an ambiguous base).
*
* @param base the base [AaCcGgTt]
* @return the transition of the base, or the input base if it's not one of the understood ones
*/
static public char transition(char base) {
switch (base) {
case 'A':
case 'a': return 'G';
case 'C':
case 'c': return 'T';
case 'G':
case 'g': return 'A';
case 'T':
case 't': return 'C';
default: return base;
}
}
/**
* Perform a transversion (A <-> C or G <-> T) on the base, or the specified base if it can't be done (i.e. an ambiguous base).
*
* @param base the base [AaCcGgTt]
* @return the transversion of the base, or the input base if it's not one of the understood ones
*/
static public char transversion(char base) {
switch (base) {
case 'A':
case 'a': return 'C';
case 'C':
case 'c': return 'A';
case 'G':
case 'g': return 'T';
case 'T':
case 't': return 'G';
default: return base;
}
}
/**
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
*
* @param base the base [AaCcGgTt]
* @return the complementary base, or the input base if it's not one of the understood ones
*/
static public char simpleComplement(char base) {
@ -252,7 +292,7 @@ public class BaseUtils {
/**
* Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form)
*
* @param bases the byte array of bases
* @param bases the byte array of bases
* @return the reverse complement of the base byte array
*/
static public byte[] simpleReverseComplement(byte[] bases) {

View File

@ -37,6 +37,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -cov CycleCovariate" +
" -cov DinucCovariate" +
" --sorted_output" +
" --solid_recal_mode DO_NOTHING" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
@ -67,6 +68,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) +
" -outputBam %s" +
" --no_pg_tag" +
" --solid_recal_mode DO_NOTHING" +
" -recalFile " + paramsFile,
1, // just one output file
Arrays.asList(md5));
@ -95,6 +97,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -cov CycleCovariate" +
" -cov DinucCovariate" +
" --sorted_output" +
" --solid_recal_mode DO_NOTHING" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
@ -123,6 +126,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -cov DinucCovariate" +
" --default_platform illumina" +
" --sorted_output" +
" --solid_recal_mode DO_NOTHING" +
" -recalFile %s",
1, // just one output file
Arrays.asList(md5));
@ -149,6 +153,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -L 1:10,100,000-10,300,000" +
" -outputBam %s" +
" --no_pg_tag" +
" --solid_recal_mode DO_NOTHING" +
" --default_platform illumina" +
" -recalFile " + paramsFile,
1, // just one output file