Added method to AlignmentUtils which takes a read's cigar and the refBases char array given to a ReadWalker and returns the aligned reference char array. Bug fix in solid_recal_modes to use this aligned reference array. Recalibrator version number is no longer separate for each of the two walkers.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2589 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-01-15 15:36:59 +00:00
parent 2a116bb5d6
commit 70df30fc1b
5 changed files with 51 additions and 21 deletions

View File

@ -102,7 +102,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
private static final String versionString = "v2.2.5"; // Major version, minor version, and build number
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
@ -121,7 +120,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
*/
public void initialize() {
logger.info( "CovariateCounterWalker version: " + versionString );
logger.info( "Recalibrator version: " + RecalDataManager.versionString );
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
DBSNP_VALIDATION_CHECK_FREQUENCY *= PROCESS_EVERY_NTH_LOCUS;

View File

@ -57,6 +57,8 @@ public class RecalDataManager {
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNoColorSpace = false;
public static final String versionString = "v2.2.10"; // Major version, minor version, and build number
RecalDataManager() {
data = new NestedHashMap();
dataCollapsedReadGroup = null;
@ -261,7 +263,6 @@ public class RecalDataManager {
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
if( read.getReadNegativeStrandFlag() ) {
@ -310,10 +311,10 @@ public class RecalDataManager {
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
byte[] readBases = read.getReadBases();
final byte[] colorImpliedBases = readBases.clone();
char[] refBasesDirRead = refBases;
char[] refBasesDirRead = AlignmentUtils.alignmentToCharArray( read.getCigar(), read.getReadString().toCharArray(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
if( read.getReadNegativeStrandFlag() ) {
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
refBasesDirRead = BaseUtils.simpleReverseComplement( refBases );
refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() );
}
final int[] inconsistency = new int[readBases.length];
byte prevBase = (byte) colorSpace[0]; // The sentinel
@ -324,17 +325,15 @@ public class RecalDataManager {
prevBase = readBases[iii];
}
final boolean isMappedToReference = (refBases != null && refBases.length == inconsistency.length);
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ) { // Set inconsistent bases and the one before it to Q0
final boolean setBaseN = false;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") ) {
final boolean setBaseN = true;
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip);
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, coinFlip);
}
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
@ -353,23 +352,22 @@ public class RecalDataManager {
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param originalQualScores The array of original quality scores to set to zero if needed
* @param refBases The reference which has been RC'd if necessary
* @param isMappedToRef Is this read mapped fully to a reference
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
* @return The byte array of original quality scores some of which might have been set to zero
*/
private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores,
final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) {
final char[] refBases, final boolean setBaseN ) {
final boolean negStrand = read.getReadNegativeStrandFlag();
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) {
if( inconsistency[iii] == 1 ) {
if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) {
if( (char)readBases[iii] == refBases[iii] ) {
if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; }
else { originalQualScores[iii] = (byte)0; }
if( setBaseN ) { readBases[iii] = (byte)'N'; }
}
// Set the prev base to Q0 as well
if( !isMappedToRef || (char)readBases[iii-1] == refBases[iii-1] ) {
if( (char)readBases[iii-1] == refBases[iii-1] ) {
if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; }
else { originalQualScores[iii-1] = (byte)0; }
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
@ -390,11 +388,10 @@ public class RecalDataManager {
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
* @param refBases The reference which has been RC'd if necessary
* @param isMappedToRef Is this read mapped fully to a reference
* @param coinFlip A random number generator
*/
private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases,
final char[] refBases, final boolean isMappedToRef, final Random coinFlip) {
final char[] refBases, final Random coinFlip) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
if( attr != null ) {
@ -409,7 +406,7 @@ public class RecalDataManager {
if( inconsistency[iii] == 1 ) {
for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read
if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
if( !isMappedToRef || (char)readBases[jjj] == refBases[jjj] ) {
if( (char)readBases[jjj] == refBases[jjj] ) {
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
final int rand = coinFlip.nextInt( 2 );
if( rand == 0 ) { // The color implied base won the coin flip

View File

@ -61,10 +61,10 @@ public class RecalibrationArgumentCollection {
public int HOMOPOLYMER_NBACK = 7;
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
public boolean EXCEPTION_IF_NO_TILE = false;
public final boolean checkSolidRecalMode() {
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO_BASE_N") || SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") );
}
}

View File

@ -95,7 +95,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final String versionString = "v2.2.9"; // Major version, minor version, and build number
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
private static final long RANDOM_SEED = 1032861495;
@ -112,7 +111,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
*/
public void initialize() {
logger.info( "TableRecalibrationWalker version: " + versionString );
logger.info( "Recalibrator version: " + RecalDataManager.versionString );
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
coinFlip = new Random(RANDOM_SEED);
@ -225,7 +224,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
if( !NO_PG_TAG ) {
final SAMProgramRecord programRecord = new SAMProgramRecord( "GATK TableRecalibration" );
programRecord.setProgramVersion( versionString );
programRecord.setProgramVersion( RecalDataManager.versionString );
String commandLineString = "Covariates used = [";
for( Covariate cov : requestedCovariates ) {
commandLineString += cov.getClass().getSimpleName() + ", ";

View File

@ -353,6 +353,41 @@ public class AlignmentUtils {
return refLine.toString();
}
public static char[] alignmentToCharArray( final Cigar cigar, final char[] read, final char[] ref ) {
final char[] alignment = new char[read.length];
int refPos = 0;
int alignPos = 0;
for ( int iii = 0 ; iii < cigar.numCigarElements() ; iii++ ) {
final CigarElement ce = cigar.getCigarElement(iii);
switch( ce.getOperator() ) {
case I:
case S:
for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
alignment[alignPos++] = '+';
}
break;
case D:
case N:
refPos++;
break;
case M:
for ( int jjj = 0 ; jjj < ce.getLength(); jjj++ ) {
alignment[alignPos] = ref[refPos];
alignPos++;
refPos++;
}
break;
default:
throw new StingException( "Unsupported cigar operator: " + ce.getOperator() );
}
}
return alignment;
}
/**
* Due to (unfortunate) multiple ways to indicate that read is unmapped allowed by SAM format
* specification, one may need this convenience shortcut. Checks both 'read unmapped' flag and