Updating the two solid_recal_mode options to also change the previous base since solid aligner prefers single color mismatch alignments over true SNP alignments. COUNT_AS_MISMATCH mode has been removed completely. The default mode is now SET_Q_ZERO.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2394 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
07f1859290
commit
6fbf77be95
|
|
@ -103,7 +103,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.1.1"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.1.2"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -127,7 +127,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= PROCESS_EVERY_NTH_LOCUS;
|
||||
if( !RAC.checkSolidRecalMode() ) {
|
||||
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS");
|
||||
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS");
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
|
|
@ -300,10 +300,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
} else {
|
||||
otherColorSpaceInconsistency++;
|
||||
}
|
||||
if( RAC.SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") ) {
|
||||
refBase = (byte)'X'; // This will always mismatch since we are already filtering out reads that don't pass BaseUtils.isRegularBase
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -313,7 +309,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
skippedSites++;
|
||||
if( isSNP ) {
|
||||
updateMismatchCounts(dbSNP_counts, context, ref.getBase());// For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
updateMismatchCounts(dbSNP_counts, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ import net.sf.samtools.SAMReadGroupRecord;
|
|||
* Date: Nov 6, 2009
|
||||
*
|
||||
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
|
||||
* It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
|
||||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
|
|
@ -58,7 +59,7 @@ public class RecalDataManager {
|
|||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
|
||||
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
|
||||
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which if this base is inconsistent with its color
|
||||
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
|
||||
private static boolean warnUserNullReadGroup = false;
|
||||
private static boolean warnUserNoColorSpace = false;
|
||||
|
||||
|
|
@ -88,21 +89,27 @@ public class RecalDataManager {
|
|||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum ) {
|
||||
public final void addToAllTables( final List<? extends Comparable> key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
int qualityScore = Integer.parseInt( key.get(1).toString() );
|
||||
ArrayList<Comparable> newKey;
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
//BUGBUG: Bases with Reported Quality less than -pQ argument shouldn't be included in this collapsed table
|
||||
ArrayList<Comparable> newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
RecalDatum collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with just the read group
|
||||
collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
newKey = new ArrayList<Comparable>();
|
||||
|
|
@ -248,6 +255,10 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
|
||||
* @param read The SAMRecord to parse
|
||||
*/
|
||||
public static void parseColorSpace( final SAMRecord read ) {
|
||||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
|
||||
|
|
@ -265,19 +276,9 @@ public class RecalDataManager {
|
|||
for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] );
|
||||
//System.out.print((char)thisBase);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
//System.out.println();
|
||||
read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency );
|
||||
//for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
// System.out.print((char)readBases[iii]);
|
||||
//}
|
||||
//System.out.println();
|
||||
//for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
// System.out.print(inconsistency[iii]);
|
||||
//}
|
||||
//System.out.println("\n");
|
||||
|
||||
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
|
||||
Utils.warnUser("Unable to find color space information in SOLID read. First observed at read with name = " + read.getReadName());
|
||||
|
|
@ -288,77 +289,48 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, String SOLID_RECAL_MODE, Random coinFlip ) {
|
||||
/**
|
||||
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
|
||||
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
|
||||
* @param read The SAMRecord to parse
|
||||
* @param originalQualScores The array of original quality scores to modify during the correction
|
||||
* @param SOLID_RECAL_MODE Which mode of solid recalibration to apply
|
||||
* @param coinFlip A random number generator
|
||||
* @param refBases The reference for this read
|
||||
* @return A new array of quality scores that have been ref bias corrected
|
||||
*/
|
||||
public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
|
||||
|
||||
int[] inconsistency = null;
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) {
|
||||
char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray();
|
||||
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
byte[] colorImpliedBases = readBases.clone();
|
||||
char[] refBasesDirRead = refBases;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement( refBases );
|
||||
}
|
||||
inconsistency = new int[readBases.length];
|
||||
int iii;
|
||||
int[] inconsistency = new int[readBases.length];
|
||||
byte prevBase = (byte) colorSpace[0]; // The sentinel
|
||||
for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
for( int iii = 0; iii < readBases.length; iii++ ) {
|
||||
byte thisBase = (byte)getNextBaseFromColor( (char)prevBase, colorSpace[iii + 1] );
|
||||
colorImpliedBases[iii] = thisBase;
|
||||
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = ( thisBase == readBases[iii] ? thisBase : readBases[iii] );
|
||||
//System.out.print((char)thisBase);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
||||
boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
|
||||
|
||||
//System.out.println();
|
||||
//for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
// System.out.print((char)readBases[iii]);
|
||||
//}
|
||||
//System.out.println();
|
||||
//for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
// System.out.print(inconsistency[iii]);
|
||||
//}
|
||||
//System.out.println("\n");
|
||||
|
||||
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) {
|
||||
for( iii = 0; iii < originalQualScores.length; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
originalQualScores[iii] = (byte)0;
|
||||
}
|
||||
}
|
||||
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
|
||||
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { // Set inconsistent bases and the one before it to Q0
|
||||
boolean setBaseN = false;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
|
||||
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") ) {
|
||||
byte[] tmpBases = read.getReadBases();
|
||||
for( iii = 0; iii < originalQualScores.length; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
originalQualScores[iii] = (byte)0;
|
||||
tmpBases[iii] = (byte)'N';
|
||||
}
|
||||
}
|
||||
read.setReadBases( tmpBases );
|
||||
|
||||
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) { // BUGBUG: Warn user if using this mode but there are no color space quality scores in the bam
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
byte[] tmpBases = read.getReadBases();
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
colorImpliedBases = BaseUtils.simpleComplement(colorImpliedBases.clone());
|
||||
}
|
||||
for( iii = 0; iii < inconsistency.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( colorSpaceQuals[iii] > colorSpaceQuals[iii+1] ) { // reference was inserted here, but there is more evidence for the color implied base
|
||||
tmpBases[iii] = colorImpliedBases[iii];
|
||||
} else if( colorSpaceQuals[iii] == colorSpaceQuals[iii+1] ) { // equal evidence for the color implied base and the reference base, so flip a coin
|
||||
int rand = coinFlip.nextInt( 2 );
|
||||
if( rand == 0 ) { // the color implied base won the coin flip
|
||||
tmpBases[iii] = colorImpliedBases[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
read.setReadBases( tmpBases );
|
||||
}
|
||||
boolean setBaseN = true;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
|
||||
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBases, isMappedToReference, coinFlip);
|
||||
}
|
||||
|
||||
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
|
||||
|
|
@ -369,6 +341,107 @@ public class RecalDataManager {
|
|||
return originalQualScores;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param isMappedToRef Is this read mapped fully to a reference
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores,
|
||||
final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) {
|
||||
|
||||
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { // BUGBUG: This relies on the fact that in SOLID bams the first and last base are set to Q0 and aren't recalibrated anyway
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) {
|
||||
originalQualScores[iii] = (byte)0;
|
||||
if( setBaseN ) { readBases[iii] = (byte)'N'; }
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if( !isMappedToRef || (char)readBases[iii-1] == refBases[iii-1] ) {
|
||||
originalQualScores[iii-1] = (byte)0;
|
||||
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
|
||||
}
|
||||
if( !isMappedToRef || (char)readBases[iii+1] == refBases[iii+1] ) {
|
||||
originalQualScores[iii+1] = (byte)0;
|
||||
if( setBaseN ) { readBases[iii+1] = (byte)'N'; }
|
||||
}
|
||||
}
|
||||
}
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param isMappedToRef Is this read mapped fully to a reference
|
||||
* @param coinFlip A random number generator
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases,
|
||||
final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
|
||||
for( int iii = 1; iii < inconsistency.length - 2; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
for( int jjj = iii - 1; jjj <= iii + 1; jjj++ ) { // Correct this base and the one before it along the direction of the read
|
||||
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
|
||||
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
int rand = coinFlip.nextInt( 2 );
|
||||
if( rand == 0 ) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if(numLow == 0) { numLow = 1; }
|
||||
int numHigh = Math.round(numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
int rand = coinFlip.nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) {
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
if( minQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
} else { // No color space quality tag in file
|
||||
throw new StingException("REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the base and the color calculate the next base in the sequence
|
||||
* @param prevBase The base
|
||||
* @param color The color
|
||||
* @return The next base in the sequence
|
||||
*/
|
||||
private static char getNextBaseFromColor( final char prevBase, final char color ) {
|
||||
switch(color) {
|
||||
case '0': // same base
|
||||
|
|
|
|||
|
|
@ -53,8 +53,8 @@ public class RecalibrationArgumentCollection {
|
|||
public String FORCE_READ_GROUP = null;
|
||||
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? EXPERIMENTAL OPTION. USE AT OWN RISK. Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS")
|
||||
public String SOLID_RECAL_MODE = "DO_NOTHING";
|
||||
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
|
||||
public String SOLID_RECAL_MODE = "SET_Q_ZERO";
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
public int WINDOW_SIZE = 5;
|
||||
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
|
||||
|
|
@ -62,6 +62,6 @@ public class RecalibrationArgumentCollection {
|
|||
|
||||
public boolean checkSolidRecalMode() {
|
||||
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
|
||||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("COUNT_AS_MISMATCH") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );
|
||||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,8 +58,7 @@ import java.lang.reflect.Field;
|
|||
*/
|
||||
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE }) // This walker requires -I input.bam, it also requires -R reference.fasta, but by not saying @requires REFERENCE_BASES I'm telling the
|
||||
// GATK to not actually spend time giving me the refBase array since I don't use it
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
/////////////////////////////
|
||||
|
|
@ -94,9 +93,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.1.1"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.1.2"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -114,9 +114,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
logger.info( "TableRecalibrationWalker version: " + versionString );
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
coinFlip = new Random();
|
||||
coinFlip = new Random(RANDOM_SEED);
|
||||
if( !RAC.checkSolidRecalMode() ) {
|
||||
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, COUNT_AS_MISMATCH, or REMOVE_REF_BIAS");
|
||||
throw new StingException( "Unrecognized --solid_recal_mode argument. Implemented options: DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS");
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
|
|
@ -282,7 +282,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ) );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum );
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -300,17 +300,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*/
|
||||
public SAMRecord map( char[] refBases, SAMRecord read ) {
|
||||
|
||||
// WARNING: refBases is always null because this walker doesn't have @Requires({DataSource.REFERENCE_BASES})
|
||||
// This is done in order to speed up the code
|
||||
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
String platform = read.getReadGroup().getPlatform();
|
||||
if( platform.equalsIgnoreCase("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip );
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
||||
}
|
||||
|
||||
int startPos = 1;
|
||||
|
|
|
|||
|
|
@ -321,6 +321,38 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a char array of bases
|
||||
*
|
||||
* @param bases the char array of bases
|
||||
* @return the reverse complement of the char byte array
|
||||
*/
|
||||
static public char[] simpleReverseComplement(char[] bases) {
|
||||
char[] rcbases = new char[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[bases.length - 1 - i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Complement a char array of bases
|
||||
*
|
||||
* @param bases the char array of bases
|
||||
* @return the complement of the base char array
|
||||
*/
|
||||
static public char[] simpleComplement(char[] bases) {
|
||||
char[] rcbases = new char[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = simpleComplement(bases[i]);
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
|
|
@ -358,6 +390,22 @@ public class BaseUtils {
|
|||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse an int array of bases
|
||||
*
|
||||
* @param bases the int array of bases
|
||||
* @return the reverse of the base int array
|
||||
*/
|
||||
static public int[] reverse(int[] bases) {
|
||||
int[] rcbases = new int[bases.length];
|
||||
|
||||
for (int i = 0; i < bases.length; i++) {
|
||||
rcbases[i] = bases[bases.length - i - 1];
|
||||
}
|
||||
|
||||
return rcbases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverse (NOT reverse-complement!!) a string
|
||||
*
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
public void testCountCovariates1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "16f87c9644b27c3c3fe7a963eed45d2d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "add8196af03d5a674aa3de702995cbbf");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5a4873064e602e1f945f3881c6b4a3e5");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b039f405580ad844cd1dfd4d8ba25913" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "c730bff43ffb89dcb0774926286cac6f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "dbbb9c49f2e0b1f117b88eb386fb6d0f" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -37,7 +37,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov CycleCovariate" +
|
||||
" -cov DinucCovariate" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
|
|
@ -50,9 +50,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "d7f3e0db5ed9fefc917144a0f937d50d" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fb82046d5acfb07625d5459526d5f6c8");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "1f1770363f381d06e8db9fead03ad7bb");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "3fe6ee95cfa6155f8cc8e3e0a6330ad2" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "e4eae955f7d408b2af34c9224616f564" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "732267e9cd02a72b8c85238f47487509" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -68,7 +68,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
? " -L 1:10,800,000-10,810,000" : " -L 1:10,100,000-10,300,000" ) +
|
||||
" -outputBam %s" +
|
||||
" --no_pg_tag" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile " + paramsFile,
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
|
|
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesVCF() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "d0bdd1934f003406e93f935b63ab6edd");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "bc0e62dc366f2f6ca70847a832b4aad4");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -97,7 +97,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov CycleCovariate" +
|
||||
" -cov DinucCovariate" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
|
|
@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "fadfa35415a694781ed2b4d1b1d9535b" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "9d83a1653e076e028fecf4c2575f56e9" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -126,7 +126,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -cov DinucCovariate" +
|
||||
" --default_platform illumina" +
|
||||
" --sorted_output" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
Arrays.asList(md5));
|
||||
|
|
@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "4879036295b567514f637d21030bf9c8" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "8853add9cfd0a8171ca917b1c6452e0a" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -153,7 +153,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,100,000-10,300,000" +
|
||||
" -outputBam %s" +
|
||||
" --no_pg_tag" +
|
||||
" --solid_recal_mode DO_NOTHING" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" --default_platform illumina" +
|
||||
" -recalFile " + paramsFile,
|
||||
1, // just one output file
|
||||
|
|
|
|||
Loading…
Reference in New Issue