Insidious bug: clipped sequences (S cigar elements) where a) processed incorrectly; b) sometimes caused IntervalCleaner to crash, if such sequence occured at the boundary of the interval. The following inconsistency occurs: LocusWindow traversal instantiates interval reference stretch up to rightmost read.getAlignmentEnd(), but this does not include clipped bases; then IntervalCleaner takes all read bases (as a string) and does not check if some of them were clipped. Inside the interval this would cause counting mismatches on clipped bases, at the boundary of the interval the clipped bases would stick outside the passed reference stretch and index-out-of-bound exception would be thrown. THIS IS A PARTIAL, TEMPORARY FIX of the problem: mismatchQualitySum() is fixed, in that it does not count mismatches on clipped bases anymore; however, we do not attempt yet to realign only meaningful, unclipped part of the read; instead all reads that have clipped bases are assigned to the original reference and we do not attempt to realign them at all (we'd need to be careful to preserve the cigar if we wanted to do this)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@933 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-06-08 05:20:29 +00:00
parent 3a8219a469
commit 9f35a5aa32
2 changed files with 122 additions and 92 deletions

View File

@ -120,6 +120,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
TraversalStatistics.nRecords++;
SAMRecord read = readIter.next();
// apparently, unmapped reads can occur anywhere in the file!
if ( read.getReadUnmappedFlag() ) {
walker.nonIntervalReadAction(read);
@ -225,6 +226,8 @@ public class TraverseByLocusWindows extends TraversalEngine {
rightmostIndex = interval.getStop();
while (readIter.hasNext() && !done) {
TraversalStatistics.nRecords++;
SAMRecord read = readIter.next();
reads.add(read);
if ( read.getAlignmentStart() < leftmostIndex )
@ -303,4 +306,4 @@ public class TraverseByLocusWindows extends TraversalEngine {
//printProgress("intervals", interval.getLocation());
return sum;
}
}
}

View File

@ -153,26 +153,39 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
for (int i = 0 ; i < c.numCigarElements() ; i++) {
CigarElement ce = c.getCigarElement(i);
switch ( ce.getOperator() ) {
case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
if ( refIndex >= refSeq.length() )
sum += MAX_QUAL;
else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
sum += (int)quals.charAt(readIndex) - 33;
}
break;
case I:
readIndex += ce.getLength();
break;
case D:
refIndex += ce.getLength();
break;
case M:
for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
if ( refIndex >= refSeq.length() )
sum += MAX_QUAL;
else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
sum += (int)quals.charAt(readIndex) - 33;
}
break;
case I:
readIndex += ce.getLength();
break;
case D:
refIndex += ce.getLength();
break;
case S: // soft clip
refIndex+=ce.getLength(); // (?? - do we have to??);
readIndex+=ce.getLength();
break;
default: throw new StingException("Cigar element "+ce.getOperator() +" currently can not be processed");
}
}
return sum;
}
private boolean readIsClipped(SAMRecord read) {
final Cigar c = read.getCigar();
final int n = c.numCigarElements();
if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S ||
c.getCigarElement(0).getOperator() == CigarOperator.S) return true;
return false;
}
private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {
long leftmostIndex = interval.getStart();
@ -183,6 +196,8 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// decide which reads potentially need to be cleaned
for ( SAMRecord read : reads ) {
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
if ( numBlocks == 2 )
@ -191,6 +206,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
AlignedRead aRead = new AlignedRead(read);
int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);
// we currently can not deal with clipped reads correctly
if ( readIsClipped(read) ) { refReads.add(read); continue; }
// if this doesn't match perfectly to the reference, let's try to clean it
if ( mismatchScore > 0 ) {
altReads.add(aRead);
@ -214,88 +232,89 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// for each alternative consensus to test, align it to the reference and create an alternative consensus
for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
if ( altAlignmentsToTest.get(index) ) {
if ( ! altAlignmentsToTest.get(index) ) continue;
// do a pairwise alignment against the reference
AlignedRead aRead = altReads.get(index);
int indexOnRef;
Cigar c;
if ( aRead.isRealignable() ) {
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
indexOnRef = swConsensus.getAlignmentStart2wrt1();
c = swConsensus.getCigar();
} else {
indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex;
c = aRead.getCigar();
}
if ( indexOnRef < 0 )
continue;
// do a pairwise alignment against the reference
AlignedRead aRead = altReads.get(index);
int indexOnRef;
Cigar c;
if ( aRead.isRealignable() ) {
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
indexOnRef = swConsensus.getAlignmentStart2wrt1();
c = swConsensus.getCigar();
} else {
indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex;
c = aRead.getCigar();
}
if ( indexOnRef < 0 )
continue;
// create the new consensus
StringBuffer sb = new StringBuffer();
sb.append(reference.substring(0, indexOnRef));
logger.debug("CIGAR = " + cigarToString(c));
// create the new consensus
StringBuffer sb = new StringBuffer();
sb.append(reference.substring(0, indexOnRef));
logger.debug("CIGAR = " + cigarToString(c));
int indelCount = 0;
int altIdx = 0;
int refIdx = indexOnRef;
boolean ok_flag = true;
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator() ) {
case D:
indelCount++;
refIdx += ce.getLength();
break;
case M:
if ( reference.length() < refIdx+ce.getLength() )
ok_flag = false;
else
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
refIdx += ce.getLength();
altIdx += ce.getLength();
break;
case I:
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
altIdx += ce.getLength();
indelCount++;
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
continue;
sb.append(reference.substring(refIdx));
String altConsensus = sb.toString();
// for each imperfect match to the reference, score it against this alternative
Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
for ( int j = 0; j < altReads.size(); j++ ) {
AlignedRead toTest = altReads.get(j);
if ( !toTest.isRealignable() )
continue;
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.second;
if ( myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus
int indelCount = 0;
int altIdx = 0;
int refIdx = indexOnRef;
boolean ok_flag = true;
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
CigarElement ce = c.getCigarElement(i);
switch( ce.getOperator() ) {
case D:
indelCount++;
refIdx += ce.getLength();
break;
case M:
if ( reference.length() < refIdx+ce.getLength() )
ok_flag = false;
else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
refIdx += ce.getLength();
altIdx += ce.getLength();
break;
case I:
sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
altIdx += ce.getLength();
indelCount++;
break;
}
}
// make sure that there is at most only a single indel and it aligns appropriately!
if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
continue;
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
consensus.mismatchSum += myScore;
if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later
altAlignmentsToTest.set(j, false);
}
sb.append(reference.substring(refIdx));
String altConsensus = sb.toString(); // alternative consensus sequence we just built from the cuurent read
// for each imperfect match to the reference, score it against this alternative
Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
for ( int j = 0; j < altReads.size(); j++ ) {
AlignedRead toTest = altReads.get(j);
if ( !toTest.isRealignable() )
continue;
Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
// the mismatch score is the min of its alignment vs. the reference and vs. the alternate
int myScore = altAlignment.second;
if ( myScore >= toTest.getMismatchScoreToReference() )
myScore = toTest.getMismatchScoreToReference();
// keep track of reads that align better to the alternate consensus
else
consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
logger.debug(aRead.getReadString() + " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
consensus.mismatchSum += myScore;
if ( myScore == 0 )
// we already know that this is its consensus, so don't bother testing it later
altAlignmentsToTest.set(j, false);
}
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
bestConsensus = consensus;
logger.debug(aRead.getReadString() + " " + consensus.mismatchSum);
}
}
}
@ -369,7 +388,10 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
}
}
} else if ( statsOutput != null ) {
// END IF ( improvemenr >= LOD_THRESHOLD )
} else if ( statsOutput != null ) {
try {
statsOutput.write(interval.toString());
statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
@ -497,7 +519,12 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
String readStr = read.getReadString();
String qualStr = read.getBaseQualityString();
for (int j=0; j < readStr.length(); j++, refIdx++ ) {
// if ( refIdx < 0 || refIdx >= reference.length() ) {
// System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() );
// System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() );
// }
totalBases[refIdx] += (int)qualStr.charAt(j) - 33;
if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) )
originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33;