Added a new option in the recalibrator to be used by people who have SOLiD data in which only a few of the reads have no-calls in the color space. These reads will be skipped over and left in the bam file untouched.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2857 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b1a4e6d840
commit
7f19ff1fa1
|
|
@ -263,28 +263,32 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
offset = p.getOffset();
|
||||
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
RecalDataManager.parseColorSpace( read );
|
||||
|
||||
// Skip if base quality is zero
|
||||
if( read.getBaseQualities()[offset] > 0 ) {
|
||||
// Skip over reads with no calls in the color space if the user requested it
|
||||
if( !RAC.IGNORE_NOCALL_COLORSPACE || !RecalDataManager.checkNoCallColorSpace( read ) ) {
|
||||
RecalDataManager.parseColorSpace( read );
|
||||
|
||||
bases = read.getReadBases();
|
||||
refBase = (byte)ref.getBase();
|
||||
// Skip if base quality is zero
|
||||
if( read.getBaseQualities()[offset] > 0 ) {
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
bases = read.getReadBases();
|
||||
refBase = (byte)ref.getBase();
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || !RecalDataManager.isInconsistentColorSpace( read, offset ) ) {
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( ref.getBase() == (char)bases[offset] ) {
|
||||
solidInsertedReferenceBases++;
|
||||
} else {
|
||||
otherColorSpaceInconsistency++;
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( read, offset, refBase );
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( ref.getBase() == (char)bases[offset] ) {
|
||||
solidInsertedReferenceBases++;
|
||||
} else {
|
||||
otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -170,7 +170,8 @@ public class RecalDataManager {
|
|||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
if( data.keySet().size() == 1) {
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done in a previous sequential step
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
|
||||
// in a previous step of the sequential calculation model
|
||||
}
|
||||
} else { // Another layer in the nested hash map
|
||||
checkForSingletons( (Map) val );
|
||||
|
|
@ -358,6 +359,32 @@ public class RecalDataManager {
|
|||
return originalQualScores;
|
||||
}
|
||||
|
||||
public static boolean checkNoCallColorSpace( final SAMRecord read ) {
|
||||
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
char[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).substring(1).toCharArray(); // trim off the Sentinel
|
||||
} else {
|
||||
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( char color : colorSpace ) {
|
||||
if( color != '0' && color != '1' && color != '2' && color != '3' ) {
|
||||
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
throw new StingException("Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
return false; // There aren't any color no calls in this SOLiD read
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
* @param read The SAMRecord to recalibrate
|
||||
|
|
@ -516,7 +543,7 @@ public class RecalDataManager {
|
|||
// }
|
||||
//}
|
||||
|
||||
} else {
|
||||
} else { // No inconsistency array, so nothing is inconsistent
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,6 +61,9 @@ public class RecalibrationArgumentCollection {
|
|||
public int HOMOPOLYMER_NBACK = 7;
|
||||
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
|
||||
public boolean EXCEPTION_IF_NO_TILE = false;
|
||||
@Argument(fullName = "ignore_nocall_colorspace", shortName="ignore_nocall_colorspace", doc="If provided, the recalibrator will skip over reads with no calls in the color space instead of halting with an exception", required=false)
|
||||
public boolean IGNORE_NOCALL_COLORSPACE = false;
|
||||
|
||||
|
||||
public final boolean checkSolidRecalMode() {
|
||||
return ( SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") || SOLID_RECAL_MODE.equalsIgnoreCase("SET_Q_ZERO") ||
|
||||
|
|
|
|||
|
|
@ -98,6 +98,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
private final Random coinFlip = new Random( RANDOM_SEED ); // Random number generator is used to remove reference bias in solid bams
|
||||
private long numReadsWithMalformedColorSpace = 0;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -300,6 +301,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
final String platform = read.getReadGroup().getPlatform();
|
||||
if( platform.toUpperCase().contains("SOLID") && !RAC.SOLID_RECAL_MODE.equalsIgnoreCase("DO_NOTHING") ) {
|
||||
if( RAC.IGNORE_NOCALL_COLORSPACE ) {
|
||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
|
||||
if( badColor ) {
|
||||
numReadsWithMalformedColorSpace++;
|
||||
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
|
||||
}
|
||||
}
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, coinFlip, refBases );
|
||||
}
|
||||
|
||||
|
|
@ -452,5 +460,11 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* @param output The SAMFileWriter that outputs the bam file
|
||||
*/
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if( numReadsWithMalformedColorSpace != 0 ) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. Use at your own risk.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -182,6 +182,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -I " + bam +
|
||||
" -L 1:10,000,000-10,200,000" +
|
||||
" -standard" +
|
||||
" --ignore_nocall_colorspace" +
|
||||
" --solid_recal_mode SET_Q_ZERO" +
|
||||
" -recalFile %s",
|
||||
1, // just one output file
|
||||
|
|
|
|||
Loading…
Reference in New Issue