Mostly synchronizing with the main branch. Based on anecdotal evidence (too few examples in the data), realignment (shifting an indel left across a repeat) works correctly on non-homonucleotide repeats.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@928 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-06-07 16:39:16 +00:00
parent c6634e3121
commit 9eb38c0222
1 changed files with 56 additions and 48 deletions

View File

@ -1,4 +1,3 @@
package org.broadinstitute.sting.playground.gatk.walkers.indels; package org.broadinstitute.sting.playground.gatk.walkers.indels;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
@ -10,7 +9,6 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.playground.indels.*; import org.broadinstitute.sting.playground.indels.*;
import net.sf.samtools.*; import net.sf.samtools.*;
import java.util.*; import java.util.*;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
@ -558,6 +556,7 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
int start = ce1.getLength() + (readIsConsensusSequence ? refIndex : 0); int start = ce1.getLength() + (readIsConsensusSequence ? refIndex : 0);
indelString = readSeq.substring(start, start+ce2.getLength()).toUpperCase(); // get the inserted bases indelString = readSeq.substring(start, start+ce2.getLength()).toUpperCase(); // get the inserted bases
} }
// now we have to check all WHOLE periods of the indel sequence: // now we have to check all WHOLE periods of the indel sequence:
// for instance, if // for instance, if
// REF: AGCTATATATAGCC // REF: AGCTATATATAGCC
@ -568,9 +567,9 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// REF: AGCTATATATAGCC // REF: AGCTATATATAGCC
// READ: GCTA****TAGCC // READ: GCTA****TAGCC
// the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases! // the length 4 is a multiple of the period of 2, and indeed deletion site can be moved left by 2 bases!
// We will always have to check the length of the indel sequence itself (trivial period), unless the smallest // Also, we will always have to check the length of the indel sequence itself (trivial period). If the smallest
// period is 1 (which means that indel sequence is a homo-nucleotide sequence so we can just step left // period is 1 (which means that indel sequence is a homo-nucleotide sequence), we obviously do not have to check
// one base at a time as long as we get a match) // any other periods.
// NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence // NOTE: we treat both insertions and deletions in the same way below: we always check if the indel sequence
// repeats itself on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion // repeats itself on the REF (never on the read!), even for insertions: if we see TA inserted and REF has, e.g., CATATA prior to the insertion
@ -610,13 +609,22 @@ public class IntervalCleanerWalker extends LocusWindowWalker<Integer, Integer>
// got maximum possible shift after checking period=1 above. // got maximum possible shift after checking period=1 above.
} }
// if ( ce2.getLength() >= 2 )
// System.out.println("-----------------------------------\n FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex, (readIsConsensusSequence?refIndex:0)));
if ( difference > 0 ) { if ( difference > 0 ) {
Cigar newCigar = new Cigar(); Cigar newCigar = new Cigar();
newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M)); newCigar.add(new CigarElement(ce1.getLength()-difference, CigarOperator.M));
newCigar.add(ce2); newCigar.add(ce2);
newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M)); newCigar.add(new CigarElement(cigar.getCigarElement(2).getLength()+difference, CigarOperator.M));
// System.out.println(" FROM:\n"+AlignmentUtils.alignmentToString(cigar,readSeq,refSeq,refIndex));
// if ( ce2.getLength() >=2 )
// System.out.println(" REALIGNED TO:\n"+AlignmentUtils.alignmentToString(newCigar,readSeq,refSeq,refIndex,(readIsConsensusSequence?refIndex:0))+"\n");
logger.debug("Realigning indel: " + cigarToString(cigar) + " to " + cigarToString(newCigar)); logger.debug("Realigning indel: " + cigarToString(cigar) + " to " + cigarToString(newCigar));
cigar = newCigar; cigar = newCigar;
} }
return cigar; return cigar;
} }