Pulling out the left-alignment code for indels so that other walkers can use it.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3251 348d0f76-0448-11de-a6fe-93d51630548a
2010-04-23 16:12:34 +00:00 · 2010-04-23 16:12:34 +00:00 · 42bcca1010
parent 9e28e4eb42
commit 42bcca1010
2 changed files with 163 additions and 163 deletions
--- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@ -470,7 +470,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
            // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
            int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
            if ( numBlocks == 2 ) {
-                Cigar newCigar = indelRealignment(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
+                Cigar newCigar = AlignmentUtils.leftAlignIndel(read.getCigar(), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
                aRead.setCigar(newCigar);
            }

@ -600,7 +600,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
        final double improvement = (bestConsensus == null ? -1 : ((double)(totalRawMismatchSum - bestConsensus.mismatchSum))/10.0);
        if ( improvement >= LOD_THRESHOLD && bestConsensus.mismatchSum <= totalAlignerMismatchSum ) {

-            bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);
+            bestConsensus.cigar = AlignmentUtils.leftAlignIndel(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference);

           // start cleaning the appropriate reads
            for ( Pair<Integer, Integer> indexPair : bestConsensus.readIndexes ) {
@ -1005,167 +1005,6 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
        return reduces;
    }

-    /** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
-     * starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
-     * The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
-     * <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
-     * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
-     * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
-     * always the positions where the cigar starts on the ref and on the read, respectively.
-     *
-     * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
-     * specifies that (any) one AT  is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
-     * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
-     * is not repeated), the original cigar is returned.
-     * @param cigar structure of the original alignment
-     * @param refSeq reference sequence the read is aligned to
-     * @param readSeq read sequence
-     * @param refIndex 0-based alignment start position on ref
-     * @param readIndex 0-based alignment start position on read
-     * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
-     */
-    private Cigar indelRealignment(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
-        if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
-
-        final CigarElement ce1 = cigar.getCigarElement(0);
-        final CigarElement ce2 = cigar.getCigarElement(1);
-
-        // we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there
-        // is no place on the read to move that insertion further left; so we are done:
-        if ( ce1.getOperator() != CigarOperator.M ) return cigar;
-
-        int difference = 0; // we can move indel 'difference' bases left
-        final int indel_length = ce2.getLength();
-
-        int period = 0; // period of the inserted/deleted sequence
-        int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion)
-        int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion)
-
-        byte[] indelString = new byte[ce2.getLength()];  // inserted or deleted sequence
-
-        if ( ce2.getOperator() == CigarOperator.D )
-            System.arraycopy(refSeq, indelIndexOnRef, indelString, 0, ce2.getLength());
-        else if ( ce2.getOperator() == CigarOperator.I )
-            System.arraycopy(readSeq, indelIndexOnRead, indelString, 0, ce2.getLength());
-        else
-            // we can get here if there is soft clipping done at the beginning of the read
-            // for now, we'll just punt the issue and not try to realign these
-            return cigar;
-
-        // now we have to check all WHOLE periods of the indel sequence:
-        //  for instance, if
-        //   REF:   AGCTATATATAGCC
-        //   READ:   GCTAT***TAGCC
-        // the deleted sequence ATA does have period of 2, but deletion obviously can not be
-        // shifted left by 2 bases (length 3 does not contain whole number of periods of 2);
-        // however if 4 bases are deleted:
-        //   REF:   AGCTATATATAGCC
-        //   READ:   GCTA****TAGCC
-        // the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
-        //  Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
-        // period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
-        // any other periods.
-
-        // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
-        // repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
-        // position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat
-        // on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach.
-
-        while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength
-
-                period = BaseUtils.sequencePeriod(indelString, period+1);
-
-                if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work
-
-                int newIndex = indelIndexOnRef;
-
-                while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted
-
-                    // lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref
-                    // are the same as the currently checked period of the inserted sequence:
-
-                    boolean match = true;
-
-                    for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
-                        byte indelChr = indelString[indelPos];
-                        if ( refSeq[testRefPos] != indelChr || !BaseUtils.isRegularBase((char)indelChr) ) {
-                            match = false;
-                            break;
-                        }
-                    }
-                    if ( match ) {
-                        newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more...
-                    }
-                    else {
-                        break; // oops, no match, can not push indel farther left
-                    }
-                }
-
-                final int newDifference = indelIndexOnRef - newIndex;
-                if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left
-
-                if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already
-                                          // got maximum possible shift after checking period=1 above.
-        }
-
-        //        if ( ce2.getLength() >= 2 )
-        //            System.out.println("-----------------------------------\n  FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
-
-
-        if ( difference > 0 ) {
-
-            // The following if() statement: this should've never happened, unless the alignment is really screwed up.
-            // A real life example:
-            //
-            //   ref:    TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT...
-            //  read:       GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT...
-            //
-            //  i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could
-            // have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the
-            // reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read
-            // (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong"
-            // with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are
-            // built and represented that are broken here. While it is unclear how the alignment shown above could be generated
-            // in the first place, we are not in the business of fixing incorrect alignments in this method; all we are
-            // trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar
-            // and bail out.
-            if ( ce1.getLength()-difference < 0 ) return cigar;
-
-            Cigar newCigar = new Cigar();
-            // do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the
-            // insertion all the way to the read start):
-            if ( ce1.getLength() - difference > 0 )
-                newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
-            newCigar.add(ce2);  // add the indel, now it's left shifted since we decreased the number of preceding matching bases
-
-            if ( cigar.numCigarElements() > 2 ) {
-                // if we got something following the indel element:
-
-                if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M  ) {
-                    // if indel was followed by matching bases (that's the most common situation),
-                    // increase the length of the matching section after the indel by the amount of left shift
-                    // (matching bases that were on the left are now *after* the indel; we have also checked at the beginning
-                    // that the first cigar element was also M):
-                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
-                } else {
-                    // if the element after the indel was not M, we have to add just the matching bases that were on the left
-                    // and now appear after the indel after we performed the shift. Then add the original element that followed the indel.
-                    newCigar.add(new CigarElement(difference, CigarOperator.M));
-                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator()));
-                }
-                // now add remaining (unchanged) cigar elements, if any:
-                for ( int i = 3 ; i < cigar.numCigarElements() ; i++ )  {
-                    newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator()));
-                }
-            }
-
-            //logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar));
-            cigar = newCigar;
-
-        }
-        return cigar;
-    }
-
    private class AlignedRead {
        private final SAMRecord read;
        private Cigar newCigar = null;
--- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
+++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java
@ -379,4 +379,165 @@ public class AlignmentUtils {

        return BaseUtils.reverse(read.getBaseQualities());
    }
+
+    /** Takes the alignment of the read sequence <code>readSeq</code> to the reference sequence <code>refSeq</code>
+     * starting at 0-based position <code>refIndex</code> on the <code>refSeq</code> and specified by its <code>cigar</code>.
+     * The last argument <code>readIndex</code> specifies 0-based position on the read where the alignment described by the
+     * <code>cigar</code> starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0.
+     * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case
+     * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are
+     * always the positions where the cigar starts on the ref and on the read, respectively.
+     *
+     * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar
+     * specifies that (any) one AT  is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT
+     * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence
+     * is not repeated), the original cigar is returned.
+     * @param cigar structure of the original alignment
+     * @param refSeq reference sequence the read is aligned to
+     * @param readSeq read sequence
+     * @param refIndex 0-based alignment start position on ref
+     * @param readIndex 0-based alignment start position on read
+     * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any)
+     */
+    public static Cigar leftAlignIndel(Cigar cigar, final byte[] refSeq, final byte[] readSeq, final int refIndex, final int readIndex) {
+        if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do
+
+        final CigarElement ce1 = cigar.getCigarElement(0);
+        final CigarElement ce2 = cigar.getCigarElement(1);
+
+        // we currently can not handle clipped reads; alternatively, if the alignment starts from insertion, there
+        // is no place on the read to move that insertion further left; so we are done:
+        if ( ce1.getOperator() != CigarOperator.M ) return cigar;
+
+        int difference = 0; // we can move indel 'difference' bases left
+        final int indel_length = ce2.getLength();
+
+        int period = 0; // period of the inserted/deleted sequence
+        int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion)
+        int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion)
+
+        byte[] indelString = new byte[ce2.getLength()];  // inserted or deleted sequence
+
+        if ( ce2.getOperator() == CigarOperator.D )
+            System.arraycopy(refSeq, indelIndexOnRef, indelString, 0, ce2.getLength());
+        else if ( ce2.getOperator() == CigarOperator.I )
+            System.arraycopy(readSeq, indelIndexOnRead, indelString, 0, ce2.getLength());
+        else
+            // we can get here if there is soft clipping done at the beginning of the read
+            // for now, we'll just punt the issue and not try to realign these
+            return cigar;
+
+        // now we have to check all WHOLE periods of the indel sequence:
+        //  for instance, if
+        //   REF:   AGCTATATATAGCC
+        //   READ:   GCTAT***TAGCC
+        // the deleted sequence ATA does have period of 2, but deletion obviously can not be
+        // shifted left by 2 bases (length 3 does not contain whole number of periods of 2);
+        // however if 4 bases are deleted:
+        //   REF:   AGCTATATATAGCC
+        //   READ:   GCTA****TAGCC
+        // the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
+        //  Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
+        // period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
+        // any other periods.
+
+        // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
+        // repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
+        // position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat
+        // on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach.
+
+        while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength
+
+                period = BaseUtils.sequencePeriod(indelString, period+1);
+
+                if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work
+
+                int newIndex = indelIndexOnRef;
+
+                while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted
+
+                    // lets check if bases [newIndex-period,newIndex) immediately preceding the indel on the ref
+                    // are the same as the currently checked period of the inserted sequence:
+
+                    boolean match = true;
+
+                    for ( int testRefPos = newIndex - period, indelPos = 0 ; testRefPos < newIndex; testRefPos++, indelPos++) {
+                        byte indelChr = indelString[indelPos];
+                        if ( refSeq[testRefPos] != indelChr || !BaseUtils.isRegularBase((char)indelChr) ) {
+                            match = false;
+                            break;
+                        }
+                    }
+                    if ( match ) {
+                        newIndex -= period; // yes, they are the same, we can move indel farther left by at least period bases, go check if we can do more...
+                    }
+                    else {
+                        break; // oops, no match, can not push indel farther left
+                    }
+                }
+
+                final int newDifference = indelIndexOnRef - newIndex;
+                if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left
+
+                if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already
+                                          // got maximum possible shift after checking period=1 above.
+        }
+
+        //        if ( ce2.getLength() >= 2 )
+        //            System.out.println("-----------------------------------\n  FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
+
+
+        if ( difference > 0 ) {
+
+            // The following if() statement: this should've never happened, unless the alignment is really screwed up.
+            // A real life example:
+            //
+            //   ref:    TTTTTTTTTTTTTTTTTT******TTTTTACTTATAGAAGAAAT...
+            //  read:       GTCTTTTTTTTTTTTTTTTTTTTTTTACTTATAGAAGAAAT...
+            //
+            //  i.e. the alignment claims 6 T's to be inserted. The alignment is clearly malformed/non-conforming since we could
+            // have just 3 T's inserted (so that the beginning of the read maps right onto the beginning of the
+            // reference fragment shown): that would leave us with same 2 mismatches at the beginning of the read
+            // (G and C) but lower gap penalty. Note that this has nothing to do with the alignment being "right" or "wrong"
+            // with respect to where on the DNA the read actually came from. It is the assumptions of *how* the alignments are
+            // built and represented that are broken here. While it is unclear how the alignment shown above could be generated
+            // in the first place, we are not in the business of fixing incorrect alignments in this method; all we are
+            // trying to do is to left-adjust correct ones. So if something like that happens, we refuse to change the cigar
+            // and bail out.
+            if ( ce1.getLength()-difference < 0 ) return cigar;
+
+            Cigar newCigar = new Cigar();
+            // do not add leading M cigar element if its length is zero (i.e. if we managed to left-shift the
+            // insertion all the way to the read start):
+            if ( ce1.getLength() - difference > 0 )
+                newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
+            newCigar.add(ce2);  // add the indel, now it's left shifted since we decreased the number of preceding matching bases
+
+            if ( cigar.numCigarElements() > 2 ) {
+                // if we got something following the indel element:
+
+                if ( cigar.getCigarElement(2).getOperator() == CigarOperator.M  ) {
+                    // if indel was followed by matching bases (that's the most common situation),
+                    // increase the length of the matching section after the indel by the amount of left shift
+                    // (matching bases that were on the left are now *after* the indel; we have also checked at the beginning
+                    // that the first cigar element was also M):
+                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
+                } else {
+                    // if the element after the indel was not M, we have to add just the matching bases that were on the left
+                    // and now appear after the indel after we performed the shift. Then add the original element that followed the indel.
+                    newCigar.add(new CigarElement(difference, CigarOperator.M));
+                    newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength(),cigar.getCigarElement(2).getOperator()));
+                }
+                // now add remaining (unchanged) cigar elements, if any:
+                for ( int i = 3 ; i < cigar.numCigarElements() ; i++ )  {
+                    newCigar.add(new CigarElement(cigar.getCigarElement(i).getLength(),cigar.getCigarElement(i).getOperator()));
+                }
+            }
+
+            //logger.debug("Realigning indel: " + AlignmentUtils.cigarToString(cigar) + " to " + AlignmentUtils.cigarToString(newCigar));
+            cigar = newCigar;
+
+        }
+        return cigar;
+    }
 }