From b48508a2263aacc9d1dd6b192e6442ef5c9519eb Mon Sep 17 00:00:00 2001 From: asivache Date: Sun, 7 Jun 2009 17:42:19 +0000 Subject: [PATCH] indelRealignment() signature changed. The only difference about consensus sequences is that they are passed along with alignment cigars that start inside the sequence, while for 'conventional' reads cigar always starts at position 0 on the read. Logically, indelRealignment() should not know what 'consensus' is. Instead, now it receives an additional int parameter, start of the cigar on the 'read' sequence git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@929 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IntervalCleanerWalker.java | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java index 16d64d93b..9a50f5194 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IntervalCleanerWalker.java @@ -185,7 +185,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker for ( SAMRecord read : reads ) { // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence if ( AlignmentUtils.getNumAlignmentBlocks(read) == 2 ) - read.setCigar(indelRealignment(read.getCigar(), reference, read.getReadString(), read.getAlignmentStart()-(int)leftmostIndex, false)); + read.setCigar(indelRealignment(read.getCigar(), reference, read.getReadString(), read.getAlignmentStart()-(int)leftmostIndex, 0)); AlignedRead aRead = new AlignedRead(read); int mismatchScore = mismatchQualitySum(aRead, reference, read.getAlignmentStart()-(int)leftmostIndex); @@ -287,7 +287,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker double improvement = (bestConsensus == null ? -1 : ((double)(totalMismatchSum - bestConsensus.mismatchSum))/10.0); if ( improvement >= LOD_THRESHOLD ) { - bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, true); + bestConsensus.cigar = indelRealignment(bestConsensus.cigar, reference, bestConsensus.str, bestConsensus.positionOnReference, bestConsensus.positionOnReference); // clean the appropriate reads for ( Pair indexPair : bestConsensus.readIndexes ) { @@ -527,7 +527,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker /** Takes the alignment of the read sequence readSeq to the reference sequence refSeq * starting at 0-based position refIndex on the refSeq and specified by its cigar. - * If the alignment has an indel, then attempts to move this indel left across a stretch of repetitive bases. For instance, if the original cigar + * The last argument readIndex specifies 0-based position on the read where the alignment described by the + * cigar starts. Usually cigars specify alignments of the whole read to the ref, so that readIndex is normally 0. + * Use non-zero readIndex only when the alignment cigar represents alignment of a part of the read. The refIndex in this case + * should be the position where the alignment of that part of the read starts at. In other words, both refIndex and readIndex are + * always the positions where the cigar starts on the ref and on the read, respectively. + * + * If the alignment has an indel, then this method attempts moving this indel left across a stretch of repetitive bases. For instance, if the original cigar * specifies that (any) one AT is deleted from a repeat sequence TATATATA, the output cigar will always mark the leftmost AT * as deleted. If there is no indel in the original cigar, or the indel position is determined unambiguously (i.e. inserted/deleted sequence * is not repeated), the original cigar is returned. @@ -537,7 +543,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker * @param refIndex 0-based alignment start position * @return a cigar, in which indel is guaranteed to be placed at the leftmost possible position across a repeat (if any) */ - private Cigar indelRealignment(Cigar cigar, String refSeq, String readSeq, int refIndex, boolean readIsConsensusSequence) { + private Cigar indelRealignment(Cigar cigar, String refSeq, String readSeq, int refIndex, int readIndex) { if ( cigar.numCigarElements() < 2 ) return cigar; // no indels, nothing to do CigarElement ce1 = cigar.getCigarElement(0); @@ -548,13 +554,13 @@ public class IntervalCleanerWalker extends LocusWindowWalker String indelString = null; // inserted or deleted sequence int period = 0; // period of the inserted/deleted sequence - int indexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion) - + int indelIndexOnRef = refIndex+ce1.getLength() ; // position of the indel on the REF (first deleted base or first base after insertion) + int indelIndexOnRead = readIndex+ce1.getLength(); // position of the indel on the READ (first insterted base, of first base after deletion) + if ( ce2.getOperator() == CigarOperator.D ) - indelString = refSeq.substring(indexOnRef, indexOnRef+ce2.getLength()).toUpperCase(); // deleted bases + indelString = refSeq.substring(indelIndexOnRef, indelIndexOnRef+ce2.getLength()).toUpperCase(); // deleted bases else if ( ce2.getOperator() == CigarOperator.I ) { - int start = ce1.getLength() + (readIsConsensusSequence ? refIndex : 0); - indelString = readSeq.substring(start, start+ce2.getLength()).toUpperCase(); // get the inserted bases + indelString = readSeq.substring(indelIndexOnRead, indelIndexOnRead+ce2.getLength()).toUpperCase(); // get the inserted bases } // now we have to check all WHOLE periods of the indel sequence: @@ -573,16 +579,17 @@ public class IntervalCleanerWalker extends LocusWindowWalker // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence // repeats itsels on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion - // position, we will move insertion left, to the position right after CA. + // position, we will move insertion left, to the position right after CA. This way, while moving the indel across the repeat + // on the ref, we can theoretically move it across a non-repeat on the read if the latter has a mismtach. while ( period < indel_length ) { // we will always get at least trivial period = indelStringLength period = BaseUtils.sequencePeriod(indelString, period+1); - if ( indel_length % period != 0 ) continue; // period is not a multiple of indel sequence length + if ( indel_length % period != 0 ) continue; // if indel sequence length is not a multiple of the period, it's not gonna work - int newIndex = indexOnRef; + int newIndex = indelIndexOnRef; while ( newIndex >= period ) { // let's see if there is a repeat, i.e. if we could also say that same bases at lower position are deleted @@ -602,7 +609,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker else break; // oops, no match, can not push indel farther left } - final int newDifference = indexOnRef - newIndex; + final int newDifference = indelIndexOnRef - newIndex; if ( newDifference > difference ) difference = newDifference; // deletion should be moved 'difference' bases left if ( period == 1 ) break; // we do not have to check all periods of homonucleotide sequences, we already @@ -620,7 +627,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M)); // System.out.println(" FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex)); // if ( ce2.getLength() >=2 ) - // System.out.println(" REALIGNED TO:\n"+AlignmentUtils.alignmentToString(newCigar,readSeq,refSeq,refIndex,(readIsConsensusSequence?refIndex:0))+"\n"); + // System.out.println(" REALIGNED TO:\n"+AlignmentUtils.alignmentToString(newCigar,readSeq,refSeq,refIndex,readIndex)+"\n"); logger.debug("Realigning indel: " + cigarToString(cigar) + " to " + cigarToString(newCigar)); cigar = newCigar;