Insidious bug: clipped sequences (S cigar elements) were a) processed incorrectly; b) sometimes caused IntervalCleaner to crash, if such a sequence occurred at the boundary of the interval. The following inconsistency occurs: the LocusWindow traversal instantiates the interval reference stretch up to the rightmost read.getAlignmentEnd(), but this does not include clipped bases; IntervalCleaner then takes all read bases (as a string) and does not check whether some of them were clipped. Inside the interval this would cause mismatches to be counted on clipped bases; at the boundary of the interval the clipped bases would stick outside the passed reference stretch and an index-out-of-bounds exception would be thrown. THIS IS A PARTIAL, TEMPORARY FIX of the problem: mismatchQualitySum() is fixed, in that it no longer counts mismatches on clipped bases; however, we do not yet attempt to realign only the meaningful, unclipped part of the read. Instead, all reads that have clipped bases are assigned to the original reference and we do not attempt to realign them at all (we would need to be careful to preserve the cigar if we wanted to do this).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@933 348d0f76-0448-11de-a6fe-93d51630548a
2009-06-08 05:20:29 +00:00 · 2009-06-08 05:20:29 +00:00 · 9f35a5aa32
parent 3a8219a469
commit 9f35a5aa32
2 changed files with 122 additions and 92 deletions
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLocusWindows.java
@ -120,6 +120,7 @@ public class TraverseByLocusWindows extends TraversalEngine {
            TraversalStatistics.nRecords++;
            SAMRecord read = readIter.next();

+
            // apparently, unmapped reads can occur anywhere in the file!
            if ( read.getReadUnmappedFlag() ) {
                walker.nonIntervalReadAction(read);
@ -225,6 +226,8 @@ public class TraverseByLocusWindows extends TraversalEngine {
                rightmostIndex = interval.getStop();
        while (readIter.hasNext() && !done) {
            TraversalStatistics.nRecords++;
+
+
            SAMRecord read = readIter.next();
            reads.add(read);
            if ( read.getAlignmentStart() < leftmostIndex )
@ -303,4 +306,4 @@ public class TraverseByLocusWindows extends TraversalEngine {
        //printProgress("intervals", interval.getLocation());
        return sum;
    }
-}
+}
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java
@ -153,26 +153,39 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
        for (int i = 0 ; i < c.numCigarElements() ; i++) {
            CigarElement ce = c.getCigarElement(i);
            switch ( ce.getOperator() ) {
-                case M:
-                    for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
-                        if ( refIndex >= refSeq.length() )
-                            sum += MAX_QUAL;
-                        else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
-                            sum += (int)quals.charAt(readIndex) - 33;
-                    }
-                    break;
-                case I:
-                    readIndex += ce.getLength();
-                    break;
-                case D:
-                    refIndex += ce.getLength();
-                    break;
+            case M:
+                for (int j = 0 ; j < ce.getLength() ; j++, refIndex++, readIndex++ ) {
+                    if ( refIndex >= refSeq.length() )
+                        sum += MAX_QUAL;
+                    else if ( Character.toUpperCase(readSeq.charAt(readIndex)) != Character.toUpperCase(refSeq.charAt(refIndex)) )
+                        sum += (int)quals.charAt(readIndex) - 33;
+                }
+                break;
+            case I:
+                readIndex += ce.getLength();
+                break;
+            case D:
+                refIndex += ce.getLength();
+                break;
+            case S: // soft clip
+                refIndex+=ce.getLength(); // (?? - do we have to??); 
+                readIndex+=ce.getLength(); 
+                break;
+            default: throw new StingException("Cigar element "+ce.getOperator() +" currently can not be processed");
            }

        }
        return sum;
    }

+    private boolean readIsClipped(SAMRecord read) {
+        final Cigar c = read.getCigar();
+        final int n = c.numCigarElements();
+        if ( c.getCigarElement(n-1).getOperator() == CigarOperator.S ||
+             c.getCigarElement(0).getOperator() == CigarOperator.S) return true;
+        return false;
+    }
+
    private void clean(List<SAMRecord> reads, String reference, GenomeLoc interval) {

        long leftmostIndex = interval.getStart();
@ -183,6 +196,8 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>

        // decide which reads potentially need to be cleaned
        for ( SAMRecord read : reads ) {
+
+
            // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
            int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
            if ( numBlocks == 2 )
@ -191,6 +206,9 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
            AlignedRead aRead = new AlignedRead(read);
            int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex);

+            // we currently can not deal with clipped reads correctly
+            if ( readIsClipped(read) ) { refReads.add(read); continue; }
+
            // if this doesn't match perfectly to the reference, let's try to clean it
            if ( mismatchScore > 0 ) {
                altReads.add(aRead);
@ -214,88 +232,89 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>

        // for each alternative consensus to test, align it to the reference and create an alternative consensus
        for ( int index = 0; index < altAlignmentsToTest.size(); index++ ) {
-            if ( altAlignmentsToTest.get(index) ) {
+            if ( ! altAlignmentsToTest.get(index) ) continue;

-                // do a pairwise alignment against the reference
-                AlignedRead aRead = altReads.get(index);
-                int indexOnRef;
-                Cigar c;
-                if ( aRead.isRealignable() ) {
-                    SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
-                    indexOnRef = swConsensus.getAlignmentStart2wrt1();
-                    c = swConsensus.getCigar();
-                } else {
-                    indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex;
-                    c = aRead.getCigar();
-                }
-                if ( indexOnRef < 0 )
-                     continue;
+            // do a pairwise alignment against the reference
+            AlignedRead aRead = altReads.get(index);
+            int indexOnRef;
+            Cigar c;
+            if ( aRead.isRealignable() ) {
+                SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadString());
+                indexOnRef = swConsensus.getAlignmentStart2wrt1();
+                c = swConsensus.getCigar();
+            } else {
+                indexOnRef = aRead.getAlignmentStart() - (int)leftmostIndex;
+                c = aRead.getCigar();
+            }
+            if ( indexOnRef < 0 )
+                continue;

-                // create the new consensus
-                StringBuffer sb = new StringBuffer();
-                sb.append(reference.substring(0, indexOnRef));
-                logger.debug("CIGAR = " + cigarToString(c));
+            // create the new consensus
+            StringBuffer sb = new StringBuffer();
+            sb.append(reference.substring(0, indexOnRef));
+            logger.debug("CIGAR = " + cigarToString(c));

-                int indelCount = 0;
-                int altIdx = 0;
-                int refIdx = indexOnRef;
-                boolean ok_flag = true;
-                for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
-                    CigarElement ce = c.getCigarElement(i);
-                    switch( ce.getOperator() ) {
-                        case D:
-                            indelCount++;
-                            refIdx += ce.getLength();
-                            break;
-                        case M:
-                            if ( reference.length() < refIdx+ce.getLength() )
-                                ok_flag = false;
-                            else
-                                sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
-                            refIdx += ce.getLength();
-                            altIdx += ce.getLength();
-                            break;
-                        case I:
-                            sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
-                            altIdx += ce.getLength();
-                            indelCount++;
-                            break;
-                    }
-                }
-                // make sure that there is at most only a single indel and it aligns appropriately!
-                if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
-                    continue;
-
-                sb.append(reference.substring(refIdx));
-                String altConsensus =  sb.toString();
-
-                // for each imperfect match to the reference, score it against this alternative
-                Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
-                for ( int j = 0; j < altReads.size(); j++ ) {
-                    AlignedRead toTest = altReads.get(j);
-                    if ( !toTest.isRealignable() )
-                        continue;
-                    Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
-
-                    // the mismatch score is the min of its alignment vs. the reference and vs. the alternate
-                    int myScore = altAlignment.second;
-                    if ( myScore >= toTest.getMismatchScoreToReference() )
-                        myScore = toTest.getMismatchScoreToReference();
-                    // keep track of reads that align better to the alternate consensus
+            int indelCount = 0;
+            int altIdx = 0;
+            int refIdx = indexOnRef;
+            boolean ok_flag = true;
+            for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
+                CigarElement ce = c.getCigarElement(i);
+                switch( ce.getOperator() ) {
+                case D:
+                    indelCount++;
+                    refIdx += ce.getLength();
+                    break;
+                case M:
+                    if ( reference.length() < refIdx+ce.getLength() )
+                        ok_flag = false;
                    else
-                        consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
+                        sb.append(reference.substring(refIdx, refIdx+ce.getLength()));
+                    refIdx += ce.getLength();
+                    altIdx += ce.getLength();
+                    break;
+                case I:
+                    sb.append(aRead.getReadString().substring(altIdx, altIdx+ce.getLength()));
+                    altIdx += ce.getLength();
+                    indelCount++;
+                    break;
+                }
+            }
+            // make sure that there is at most only a single indel and it aligns appropriately!
+            if ( !ok_flag || indelCount > 1 || reference.length() < refIdx )
+                continue;

-                    logger.debug(aRead.getReadString() +  " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
-                    consensus.mismatchSum += myScore;
-                    if ( myScore == 0 )
-                        // we already know that this is its consensus, so don't bother testing it later
-                        altAlignmentsToTest.set(j, false);
-                }
+            sb.append(reference.substring(refIdx));
+            String altConsensus =  sb.toString(); // alternative consensus sequence we just built from the cuurent read
+
+            // for each imperfect match to the reference, score it against this alternative
+            Consensus consensus = new Consensus(altConsensus, c, indexOnRef);
+            for ( int j = 0; j < altReads.size(); j++ ) {
+                AlignedRead toTest = altReads.get(j);
+                if ( !toTest.isRealignable() )
+                    continue;
+                Pair<Integer, Integer> altAlignment = findBestOffset(altConsensus, toTest);
+
+                // the mismatch score is the min of its alignment vs. the reference and vs. the alternate
+                int myScore = altAlignment.second;
+                if ( myScore >= toTest.getMismatchScoreToReference() )
+                    myScore = toTest.getMismatchScoreToReference();
+                // keep track of reads that align better to the alternate consensus
+                else
+                    consensus.readIndexes.add(new Pair<Integer, Integer>(j, altAlignment.first));
+
+                logger.debug(aRead.getReadString() +  " vs. " + toTest.getReadString() + " => " + myScore + " - " + altAlignment.first);
+                consensus.mismatchSum += myScore;
+
+                if ( myScore == 0 )
+                    // we already know that this is its consensus, so don't bother testing it later
+                    altAlignmentsToTest.set(j, false);
+            }
+
+            logger.debug(aRead.getReadString() +  " " + consensus.mismatchSum);
+            if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
+                bestConsensus = consensus;
                logger.debug(aRead.getReadString() +  " " + consensus.mismatchSum);
-                if ( bestConsensus == null || bestConsensus.mismatchSum > consensus.mismatchSum) {
-                    bestConsensus = consensus;
-                    logger.debug(aRead.getReadString() +  " " + consensus.mismatchSum);
-                }
            }
        }

@ -369,7 +388,10 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
                    aRead.getRead().setAttribute("NM", AlignmentUtils.numMismatches(aRead.getRead(), reference, aRead.getRead().getAlignmentStart()-(int)leftmostIndex));
                }
            }
-        } else if ( statsOutput != null ) {
+
+            // END IF ( improvement >= LOD_THRESHOLD )
+
+        } else if ( statsOutput != null ) { 
            try {
                statsOutput.write(interval.toString());
                statsOutput.write("\tFAIL\t"); // if improvement < LOD_THRESHOLD
@ -497,7 +519,12 @@ public class  IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
            int refIdx = read.getOriginalAlignmentStart() - (int)leftmostIndex;
            String readStr = read.getReadString();
            String qualStr = read.getBaseQualityString();
+
            for (int j=0; j < readStr.length(); j++, refIdx++ ) {
+                //                if ( refIdx < 0 || refIdx >= reference.length() ) {
+                //                    System.out.println( "Read: "+read.getRead().getReadName() + "; length = " + readStr.length() );
+                //                    System.out.println( "Ref left: "+ leftmostIndex +"; ref length=" + reference.length() + "; read alignment start: "+read.getOriginalAlignmentStart() ); 
+                //                }
                totalBases[refIdx] += (int)qualStr.charAt(j) - 33;
                if ( Character.toUpperCase(readStr.charAt(j)) != Character.toUpperCase(reference.charAt(refIdx)) )
                    originalMismatchBases[refIdx] += (int)qualStr.charAt(j) - 33;