Insidious bug: clipped sequences (S cigar elements) where a) processed incorrectly; b) sometimes caused IntervalCleaner to crash, if such sequence occured at the boundary of the interval. The following inconsistency occurs: LocusWindow traversal instantiates interval reference stretch up to rightmost read.getAlignmentEnd(), but this does not include clipped bases; then IntervalCleaner takes all read bases (as a string) and does not check if some of them were clipped. Inside the interval this would cause counting mismatches on clipped bases, at the boundary of the interval the clipped bases would stick outside the passed reference stretch and index-out-of-bound exception would be thrown. THIS IS A PARTIAL, TEMPORARY FIX of the problem: mismatchQualitySum() is fixed, in that it does not count mismatches on clipped bases anymore; however, we do not attempt yet to realign only meaningful, unclipped part of the read; instead all reads that have clipped bases are assigned to the original reference and we do not attempt to realign them at all (we'd need to be careful to preserve the cigar if we wanted to do this)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@933 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-06-08 05:20:29 +00:00
parent 3a8219a469
commit 9f35a5aa32
2 changed files with 122 additions and 92 deletions

View File

@ -120,6 +120,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
TraversalStatistics.nRecords++; TraversalStatistics.nRecords++;
SAMRecord read = readIter.next(); SAMRecord read = readIter.next();
// apparently, unmapped reads can occur anywhere in the file! // apparently, unmapped reads can occur anywhere in the file!
if ( read.getReadUnmappedFlag() ) { if ( read.getReadUnmappedFlag() ) {
walker.nonIntervalReadAction(read); walker.nonIntervalReadAction(read);
@ -225,6 +226,8 @@ public class TraverseByLocusWindows extends TraversalEngine {
rightmostIndex = interval.getStop(); rightmostIndex = interval.getStop();
while (readIter.hasNext() && !done) { while (readIter.hasNext() && !done) {
TraversalStatistics.nRecords++; TraversalStatistics.nRecords++;
SAMRecord read = readIter.next(); SAMRecord read = readIter.next();
reads.add(read); reads.add(read);
if ( read.getAlignmentStart() < leftmostIndex ) if ( read.getAlignmentStart() < leftmostIndex )
@ -303,4 +306,4 @@ public class TraverseByLocusWindows extends TraversalEngine {
//printProgress("intervals", interval.getLocation()); //printProgress("intervals", interval.getLocation());
return sum; return sum;
} }
} }

View File

@ -153,26 +153,39 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
for (int i = 0 ; i < c.numCigarElements() ; i++) { for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i); CigarElement ce = c.getCigarElement(i);
switch ( ce.getOperator() ) { switch ( ce.getOperator() ) {
case M: case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) { for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
if ( refIndex >= refSeq.length() ) if ( refIndex >= refSeq.length() )
sum += MAX_QUAL; sum += MAX_QUAL;
else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) ) else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
sum += (int)quals.charAt(readIndex) - 33; sum += (int)quals.charAt(readIndex) - 33;
} }
break; break;
case I: case I:
readIndex += ce.getLength(); readIndex += ce.getLength();
break; break;
case D: case D:
refIndex += ce.getLength(); refIndex += ce.getLength();
break; break;
case S: // soft clip
refIndex+=ce.getLength(); // (?? - do we have to??);
readIndex+=ce.getLength();
break;
default: throw new StingException("Cigar element "+ce.getOperator() +" currently can not be processed");
} }
} }
return sum; return sum;
} }
private boolean readIsClipped(SAMRecord read) {
final Cigar c = read.getCigar();
final int n = c.numCigarElements();
if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S ||
c.getCigarElement(0).getOperator() == CigarOperator.S) return true;
return false;
}
private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) { private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {
long leftmostIndex = interval.getStart(); long leftmostIndex = interval.getStart();
@ -183,6 +196,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// decide which reads potentially need to be cleaned // decide which reads potentially need to be cleaned
for ( SAMRecord read : reads ) { for ( SAMRecord read : reads ) {
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
if ( numBlocks == 2 ) if ( numBlocks == 2 )
@ -191,6 +206,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
AlignedRead aRead = new AlignedRead(read); AlignedRead aRead = new AlignedRead(read);
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex); int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
// we currently can not deal with clipped reads correctly
if ( readIsClipped(read) ) { refReads.add(read); continue; }
// if this doesn't match perfectly to the reference, let's try to clean it // if this doesn't match perfectly to the reference, let's try to clean it
if ( mismatchScore > 0 ) { if ( mismatchScore > 0 ) {
altReads.add(aRead); altReads.add(aRead);
@ -214,88 +232,89 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// for each alternative consensus to test, align it to the reference and create an alternative consensus // for each alternative consensus to test, align it to the reference and create an alternative consensus
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) { for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
if ( altAlignmentsToTest.get(index) ) { if ( ! altAlignmentsToTest.get(index) ) continue;
// do a pairwise alignment against the reference // do a pairwise alignment against the reference
AlignedRead aRead = altReads.get(index); AlignedRead aRead = altReads.get(index);
int indexOnRef; int indexOnRef;
Cigar c; Cigar c;
if ( aRead.isRealignable() ) { if ( aRead.isRealignable() ) {
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString()); SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
indexOnRef = swConsensus.getAlignmentStart2wrt1(); indexOnRef = swConsensus.getAlignmentStart2wrt1();
c = swConsensus.getCigar(); c = swConsensus.getCigar();
} else { } else {
indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex; indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex;
c = aRead.getCigar(); c = aRead.getCigar();
} }
if ( indexOnRef < 0 ) if ( indexOnRef < 0 )
continue; continue;
// create the new consensus // create the new consensus
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
sb.append(reference.substring(0, indexOnRef)); sb.append(reference.substring(0, indexOnRef));
logger.debug("CIGAR = " + cigarToString(c)); logger.debug("CIGAR = " + cigarToString(c));
int indelCount = 0; int indelCount = 0;
int altIdx = 0; int altIdx = 0;
int refIdx = indexOnRef; int refIdx = indexOnRef;
boolean ok_flag = true; boolean ok_flag = true;
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) { for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i); CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator() ) { switch( ce.getOperator() ) {
case D: case D:
indelCount++; indelCount++;
refIdx += ce.getLength(); refIdx += ce.getLength();
break; break;
case M: case M:
if ( reference.length() < refIdx+ce.getLength() ) if ( reference.length() < refIdx+ce.getLength() )
ok_flag = false; ok_flag = false;
else
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
refIdx += ce.getLength();
altIdx += ce.getLength();
break;
case I:
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
altIdx += ce.getLength();
indelCount++;
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
continue;
sb.append(reference.substring(refIdx));
String altConsensus = sb.toString();
// for each imperfect match to the reference, score it against this alternative
Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
for ( int j = 0; j < altReads.size(); j++ ) {
AlignedRead toTest = altReads.get(j);
if ( !toTest.isRealignable() )
continue;
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.second;
if ( myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus
else else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first)); sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
refIdx += ce.getLength();
altIdx += ce.getLength();
break;
case I:
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
altIdx += ce.getLength();
indelCount++;
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
continue;
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first); sb.append(reference.substring(refIdx));
consensus.mismatchSum += myScore; String altConsensus = sb.toString(); // alternative consensus sequence we just built from the cuurent read
if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later // for each imperfect match to the reference, score it against this alternative
altAlignmentsToTest.set(j, false); Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
} for ( int j = 0; j < altReads.size(); j++ ) {
AlignedRead toTest = altReads.get(j);
if ( !toTest.isRealignable() )
continue;
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.second;
if ( myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus
else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
consensus.mismatchSum += myScore;
if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later
altAlignmentsToTest.set(j, false);
}
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum); logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
}
} }
} }
@ -369,7 +388,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex)); aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
} }
} }
} else if ( statsOutput != null ) {
// END IF ( improvemenr >= LOD_THRESHOLD )
} else if ( statsOutput != null ) {
try { try {
statsOutput.write(interval.toString()); statsOutput.write(interval.toString());
statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
@ -497,7 +519,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex; int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
String readStr = read.getReadString(); String readStr = read.getReadString();
String qualStr = read.getBaseQualityString(); String qualStr = read.getBaseQualityString();
for (int j=0; j < readStr.length(); j++, refIdx++ ) { for (int j=0; j < readStr.length(); j++, refIdx++ ) {
// if ( refIdx < 0 || refIdx >= reference.length() ) {
// System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() );
// System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() );
// }
totalBases[refIdx] += (int)qualStr.charAt(j) - 33; totalBases[refIdx] += (int)qualStr.charAt(j) - 33;
if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) ) if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) )
originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33; originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33;