Insidious bug: clipped sequences (S cigar elements) were a) processed incorrectly; b) sometimes caused IntervalCleaner to crash, if such a sequence occurred at the boundary of the interval. The following inconsistency occurs: the LocusWindow traversal instantiates the interval reference stretch up to the rightmost read.getAlignmentEnd(), but this does not include clipped bases; IntervalCleaner then takes all read bases (as a string) and does not check whether some of them were clipped. Inside the interval this would cause mismatches to be counted on clipped bases; at the boundary of the interval the clipped bases would stick out beyond the passed reference stretch and an index-out-of-bounds exception would be thrown. THIS IS A PARTIAL, TEMPORARY FIX of the problem: mismatchQualitySum() is fixed, in that it no longer counts mismatches on clipped bases; however, we do not yet attempt to realign only the meaningful, unclipped part of the read; instead, all reads that have clipped bases are assigned to the original reference and we do not attempt to realign them at all (we'd need to be careful to preserve the cigar if we wanted to do this)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@933 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-06-08 05:20:29 +00:00
parent 3a8219a469
commit 9f35a5aa32
2 changed files with 122 additions and 92 deletions

View File

@ -120,6 +120,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
TraversalStatistics.nRecords++; TraversalStatistics.nRecords++;
SAMRecord read = readIter.next(); SAMRecord read = readIter.next();
// apparently, unmapped reads can occur anywhere in the file! // apparently, unmapped reads can occur anywhere in the file!
if ( read.getReadUnmappedFlag() ) { if ( read.getReadUnmappedFlag() ) {
walker.nonIntervalReadAction(read); walker.nonIntervalReadAction(read);
@ -225,6 +226,8 @@ public class TraverseByLocusWindows extends TraversalEngine {
rightmostIndex = interval.getStop(); rightmostIndex = interval.getStop();
while (readIter.hasNext() && !done) { while (readIter.hasNext() && !done) {
TraversalStatistics.nRecords++; TraversalStatistics.nRecords++;
SAMRecord read = readIter.next(); SAMRecord read = readIter.next();
reads.add(read); reads.add(read);
if ( read.getAlignmentStart() < leftmostIndex ) if ( read.getAlignmentStart() < leftmostIndex )

View File

@ -167,12 +167,25 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
case D: case D:
refIndex += ce.getLength(); refIndex += ce.getLength();
break; break;
case S: // soft clip
refIndex+=ce.getLength(); // (?? - do we have to??);
readIndex+=ce.getLength();
break;
default: throw new StingException("Cigar element "+ce.getOperator() +" currently can not be processed");
} }
} }
return sum; return sum;
} }
/**
 * Reports whether the read's CIGAR contains at least one soft-clip (S) element.
 *
 * Every CIGAR element is examined rather than only the first and last one:
 * a soft clip may be preceded or followed by a hard clip (e.g. 5H3S30M), in which
 * case the outermost element is H and checking only the two ends would miss the clip.
 * A CIGAR with no elements is reported as not clipped instead of failing on an
 * out-of-range element index.
 *
 * @param read the read whose CIGAR is inspected
 * @return true if any CIGAR element of the read is a soft clip, false otherwise
 */
private boolean readIsClipped(SAMRecord read) {
    final Cigar c = read.getCigar();
    final int n = c.numCigarElements();
    for ( int i = 0; i < n; i++ ) {
        if ( c.getCigarElement(i).getOperator() == CigarOperator.S ) return true;
    }
    return false;
}
private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) { private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {
long leftmostIndex = interval.getStart(); long leftmostIndex = interval.getStart();
@ -183,6 +196,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// decide which reads potentially need to be cleaned // decide which reads potentially need to be cleaned
for ( SAMRecord read : reads ) { for ( SAMRecord read : reads ) {
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
if ( numBlocks == 2 ) if ( numBlocks == 2 )
@ -191,6 +206,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
AlignedRead aRead = new AlignedRead(read); AlignedRead aRead = new AlignedRead(read);
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex); int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
// we currently can not deal with clipped reads correctly
if ( readIsClipped(read) ) { refReads.add(read); continue; }
// if this doesn't match perfectly to the reference, let's try to clean it // if this doesn't match perfectly to the reference, let's try to clean it
if ( mismatchScore > 0 ) { if ( mismatchScore > 0 ) {
altReads.add(aRead); altReads.add(aRead);
@ -214,7 +232,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// for each alternative consensus to test, align it to the reference and create an alternative consensus // for each alternative consensus to test, align it to the reference and create an alternative consensus
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) { for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
if ( altAlignmentsToTest.get(index) ) { if ( ! altAlignmentsToTest.get(index) ) continue;
// do a pairwise alignment against the reference // do a pairwise alignment against the reference
AlignedRead aRead = altReads.get(index); AlignedRead aRead = altReads.get(index);
@ -267,7 +285,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
continue; continue;
sb.append(reference.substring(refIdx)); sb.append(reference.substring(refIdx));
String altConsensus = sb.toString(); String altConsensus = sb.toString(); // alternative consensus sequence we just built from the cuurent read
// for each imperfect match to the reference, score it against this alternative // for each imperfect match to the reference, score it against this alternative
Consensus consensus = new Consensus(altConsensus, c, indexOnRef); Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
@ -287,17 +305,18 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first); logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
consensus.mismatchSum += myScore; consensus.mismatchSum += myScore;
if ( myScore == 0 ) if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later // we already know that this is its consensus, so don't bother testing it later
altAlignmentsToTest.set(j, false); altAlignmentsToTest.set(j, false);
} }
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum); logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) { if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus; bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum); logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
} }
} }
}
// if the best alternate consensus has a smaller sum of quality score mismatches (more than // if the best alternate consensus has a smaller sum of quality score mismatches (more than
// the LOD threshold), and it didn't just move around the mismatching columns, then clean! // the LOD threshold), and it didn't just move around the mismatching columns, then clean!
@ -369,6 +388,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
} }
} }
// END IF ( improvement >= LOD_THRESHOLD )
} else if ( statsOutput != null ) { } else if ( statsOutput != null ) {
try { try {
statsOutput.write(interval.toString()); statsOutput.write(interval.toString());
@ -497,7 +519,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex; int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
String readStr = read.getReadString(); String readStr = read.getReadString();
String qualStr = read.getBaseQualityString(); String qualStr = read.getBaseQualityString();
for (int j=0; j < readStr.length(); j++, refIdx++ ) { for (int j=0; j < readStr.length(); j++, refIdx++ ) {
// if ( refIdx < 0 || refIdx >= reference.length() ) {
// System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() );
// System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() );
// }
totalBases[refIdx] += (int)qualStr.charAt(j) - 33; totalBases[refIdx] += (int)qualStr.charAt(j) - 33;
if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) ) if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) )
originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33; originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33;