From d72d332239e49941535e9ae47afa108fa62a7067 Mon Sep 17 00:00:00 2001 From: asivache Date: Wed, 16 Dec 2009 23:29:07 +0000 Subject: [PATCH] 1) changed to search specifically for D and I cigar elements (and to process properly/ignore H,S,P elements) and print out only intervals that encompass actual indels. There's still one interval per read (at most) generated, which is the smallest intervals that covers ALL indels (D or I elements) present in the read; 2) if an interval (thus the original read itself and indels in it) sticks beyond the end of the chromosome, the read is ignored and this interval is NOT printed into the output; instead, a warning is printed to STDOUT (should we send it to logger.warn() instead? git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2385 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/indels/IndelIntervalWalker.java | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java index 79bfd6d73..08cf4b420 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelIntervalWalker.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.*; import net.sf.samtools.SAMRecord; import net.sf.samtools.AlignmentBlock; +import net.sf.samtools.CigarElement; import java.util.List; @@ -27,18 +28,55 @@ public class IndelIntervalWalker extends ReadWalker 1 && // indel + read.getCigarLength() > 1 && // indel (allow454 || !Utils.is454Read(read)) ); } public Interval map(char[] ref, SAMRecord read) { - List blocks = read.getAlignmentBlocks(); - long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1; - long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1; +// List blocks = read.getAlignmentBlocks(); +// long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1; +// long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1; + long indelLeftEdge = -1; + long indelRightEdge = -1; + int lengthOnRef = 0; // length of the event on the reference (this preset value is right for insertion) + int pos = read.getAlignmentStart(); + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + switch( ce.getOperator() ) { + case S : + case H : break; // ignore clipping completely: alignment start is set to the first NON-clipped base + case P : // skip padding and matching bases, advance position on the ref: + case M : pos += ce.getLength(); break; + case D : + lengthOnRef = ce.getLength(); // deletion occupies non-zero number of bases on the ref + case I : + // for insertion we do not have to set lengthOnRef as it was already initialized to 0 + if ( indelLeftEdge == -1 ) { // it's the first indel in the current read + indelLeftEdge = pos - 1; // set to the last (matching) base before the indel + } + pos += lengthOnRef; + indelRightEdge = pos; // whether it is the first indel in the read or not, we set right edge immediately after it + break; + default : + throw new StingException("Unrecognized cigar element '"+ce.getOperator()+"' in read "+read.getReadName()); + } + } + + // at this point indelLeftEdge == -1 or [indelLeftEdge,indelRightEdge] is the interval encompassing ALL indels in the current read: + if ( indelLeftEdge == -1 ) return null; + + // we've seen some bam files coming from other centers that have reads (and indels!!) hanging over the contig ends; + // we do not want to deal with this stuff (what is it, anyway??) + if ( indelRightEdge > GenomeLocParser.getContigInfo(read.getReferenceName()).getSequenceLength() ) { + System.out.println("WARNING: read " + read.getReadName()+" contains indel(s) spanning past the contig end (read ignored)."); + return null; + } + + if ( indelLeftEdge == 49563377 ) System.out.println("read: " +read.getReadName()); GenomeLoc indelLoc = GenomeLocParser.createGenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge); GenomeLoc refLoc = GenomeLocParser.createGenomeLoc(read); + return new Interval(refLoc, indelLoc); } @@ -53,6 +91,7 @@ public class IndelIntervalWalker extends ReadWalker " + value); // if the intervals don't overlap, print out the leftmost one and start a new one