ReadClipper.clipRead's claim that it doesn't modify the original read was false. Ultimately, GATKSAMRecord.clone (as documented) creates a soft copy of the read - so modifying e.g. the bases of the cloned read means that you modify the bases of the original read too. Because of this, when the BQSRv2 Context covariate was writing Ns over the low quality tails of the reads they got propagated out to the output BAM file (very bad). I've updated the ReadClipper docs and cleaned up the code (no reason to use a clone of the read anymore given that we are already modifying the original). For now, the simplest thing is to have the Context covariate store the original bases, overwrite low quality Ns, compute covariates, and rewrite the original bases; we can update later if needed.

This commit is contained in:
Eric Banks 2012-07-13 18:50:27 -04:00
parent d7bf74fb7e
commit 75543a3f22
2 changed files with 28 additions and 27 deletions

View File

@ -82,6 +82,8 @@ public class ContextCovariate implements StandardCovariate {
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
// store the original bases and then write Ns over low quality ones
final byte[] originalBases = read.getReadBases();
final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context
final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag();
@ -99,6 +101,9 @@ public class ContextCovariate implements StandardCovariate {
final int indelKey = indelKeys.get(i);
values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, (negativeStrand ? readLength - i - 1 : i));
}
// put the original bases back in
read.setReadBases(originalBases);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration

View File

@ -110,37 +110,32 @@ public class ReadClipper {
}
/**
* Creates a new read that's been clipped according to ops and the chosen algorithm.
* The original read is unmodified.
* Clips a read according to ops and the chosen algorithm.
*
* @param algorithm What mode of clipping do you want to apply for the stacked operations.
* @return a new read with the clipping applied.
* @return the read with the clipping applied.
*/
public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
if (ops == null)
return getRead();
else {
try {
GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone();
for (ClippingOp op : getOps()) {
//check if the clipped read can still be clipped in the range requested
if (op.start < clippedRead.getReadLength()) {
ClippingOp fixedOperation = op;
if (op.stop >= clippedRead.getReadLength())
fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1);
clippedRead = fixedOperation.apply(algorithm, clippedRead);
}
}
wasClipped = true;
ops.clear();
if ( clippedRead.isEmpty() )
return GATKSAMRecord.emptyRead(clippedRead);
return clippedRead;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e); // this should never happen
GATKSAMRecord clippedRead = read;
for (ClippingOp op : getOps()) {
final int readLength = clippedRead.getReadLength();
//check if the clipped read can still be clipped in the range requested
if (op.start < readLength) {
ClippingOp fixedOperation = op;
if (op.stop >= readLength)
fixedOperation = new ClippingOp(op.start, readLength - 1);
clippedRead = fixedOperation.apply(algorithm, clippedRead);
}
}
wasClipped = true;
ops.clear();
if ( clippedRead.isEmpty() )
return GATKSAMRecord.emptyRead(clippedRead);
return clippedRead;
}
@ -241,20 +236,21 @@ public class ReadClipper {
if (read.isEmpty())
return read;
byte [] quals = read.getBaseQualities();
final byte [] quals = read.getBaseQualities();
final int readLength = read.getReadLength();
int leftClipIndex = 0;
int rightClipIndex = read.getReadLength() - 1;
int rightClipIndex = readLength - 1;
// check how far we can clip both sides
while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++;
while (leftClipIndex < readLength && quals[leftClipIndex] <= lowQual) leftClipIndex++;
// if the entire read should be clipped, then return an empty read.
if (leftClipIndex > rightClipIndex)
return GATKSAMRecord.emptyRead(read);
if (rightClipIndex < read.getReadLength() - 1) {
this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
if (rightClipIndex < readLength - 1) {
this.addOp(new ClippingOp(rightClipIndex + 1, readLength - 1));
}
if (leftClipIndex > 0 ) {
this.addOp(new ClippingOp(0, leftClipIndex - 1));