Added a new option in the recalibrator to be used by people who have SOLiD data in which only a few of the reads have no-calls in the color space. These reads will be skipped over and left in the bam file untouched.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2857 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-02-19 15:25:23 +00:00
parent b1a4e6d840
commit 7f19ff1fa1
5 changed files with 67 additions and 18 deletions

View File

@ -263,28 +263,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
offset = p.getOffset();
RecalDataManager.parseSAMRecord( read, RAC );
RecalDataManager.parseColorSpace( read );
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
// Skip over reads with no calls in the color space if the user requested it
if( !RAC.IGNORE_NOCALL_COLORSPACE || !RecalDataManager.checkNoCallColorSpace( read ) ) {
RecalDataManager.parseColorSpace( read );
bases = read.getReadBases();
refBase = (byte)ref.getBase();
// Skip if base quality is zero
if( read.getBaseQualities()[offset] > 0 ) {
// Skip if this base is an 'N' or etc.
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
bases = read.getReadBases();
refBase = (byte)ref.getBase();
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
// Skip if this base is an 'N' or etc.
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase );
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
} else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) {
solidInsertedReferenceBases++;
} else {
otherColorSpaceInconsistency++;
// This base finally passed all the checks for a good base, so add it to the big data hashmap
updateDataFromRead( read, offset, refBase );
} else { // calculate SOLID reference insertion rate
if( ref.getBase() == (char)bases[offset] ) {
solidInsertedReferenceBases++;
} else {
otherColorSpaceInconsistency++;
}
}
}
}

View File

@ -170,7 +170,8 @@ public class RecalDataManager {
final Object val = data.get(comp);
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
if( data.keySet().size() == 1) {
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done in a previous sequential step
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
// in a previous step of the sequential calculation model
}
} else { // Another layer in the nested hash map
checkForSingletons( (Map) val );
@ -358,6 +359,32 @@ public class RecalDataManager {
return originalQualScores;
}
public static boolean checkNoCallColorSpace( final SAMRecord read ) {
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
if( attr != null ) {
char[] colorSpace;
if( attr instanceof String ) {
colorSpace = ((String)attr).substring(1).toCharArray(); // trim off the Sentinel
} else {
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
}
for( char color : colorSpace ) {
if( color != '0' && color != '1' && color != '2' && color != '3' ) {
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
}
}
} else {
throw new StingException("Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
}
}
return false; // There aren't any color no calls in this SOLiD read
}
/**
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
* @param read The SAMRecord to recalibrate
@ -516,7 +543,7 @@ public class RecalDataManager {
// }
//}
} else {
} else { // No inconsistency array, so nothing is inconsistent
return false;
}
}

View File

@ -61,6 +61,9 @@ public class RecalibrationArgumentCollection {
public int HOMOPOLYMER_NBACK = 7;
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
public boolean EXCEPTION_IF_NO_TILE = false;
@Argument(fullName = "ignore_nocall_colorspace", shortName="ignore_nocall_colorspace", doc="If provided, the recalibrator will skip over reads with no calls in the color space instead of halting with an exception", required=false)
public boolean IGNORE_NOCALL_COLORSPACE = false;
public final boolean checkSolidRecalMode() {
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||

View File

@ -98,6 +98,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
private static final long RANDOM_SEED = 1032861495;
private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams
private long numReadsWithMalformedColorSpace = 0;
//---------------------------------------------------------------------------------------------------------------
//
@ -300,6 +301,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
final String platform = read.getReadGroup().getPlatform();
if( platform.toUpperCase().contains("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
if( RAC.IGNORE_NOCALL_COLORSPACE ) {
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
if( badColor ) {
numReadsWithMalformedColorSpace++;
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
}
}
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
}
@ -452,5 +460,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* @param output The SAMFileWriter that outputs the bam file
*/
public void onTraversalDone(SAMFileWriter output) {
if( numReadsWithMalformedColorSpace != 0 ) {
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
"These reads remain in the output bam file but haven't been corrected for reference bias. Use at your own risk.");
}
}
}

View File

@ -182,6 +182,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
" -I " + bam +
" -L 1:10,000,000-10,200,000" +
" -standard" +
" --ignore_nocall_colorspace" +
" --solid_recal_mode SET_Q_ZERO" +
" -recalFile %s",
1, // just one output file