diff --git a/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java b/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java index 676396751..e4cbc7bff 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java +++ b/java/src/org/broadinstitute/sting/playground/tools/RemapAlignments.java @@ -214,12 +214,17 @@ public class RemapAlignments extends CommandLineProgram { int minNM = 1000000; int cnt = 0; // count of best alignments for ( SAMRecord r : reads ) { - int nm = (Integer)r.getAttribute("NM"); + Object attr = r.getAttribute("NM"); + if ( attr == null ) { + return; // can not recompute qualities! + } + int nm = (Integer)attr; if ( nm < minNM ) { minNM = nm; cnt = 1; } else if ( nm == minNM ) cnt++; } + // now reset counts and quals: for ( SAMRecord r : reads ) { diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java b/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java index 46ef74be5..058d68730 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenomicMap.java @@ -207,22 +207,26 @@ public class GenomicMap implements Iterable segments = getContigMapping(r.getReferenceName()); if ( segments == null ) throw new StingException("Can not remap a record: unknown custom contig name "+r.getReferenceName()); - + + // scroll the list of intervals until we find the interval that the alignment start falls into: Pair, Integer> p = seekForward(segments,customStart); Iterator iter = p.first; - GenomeLoc gl = iter.next(); // initialization: get first interval + GenomeLoc gl = iter.next(); // initialization: get interval that contains customStart + // p.second is 1-based position of alignment start relative to interval gl; + // hence refPos is 1-based position of the alignment start on the master ref (since gl.getStart() is 1-based too) int refPos = (int)(p.second+gl.getStart()-1); String oldRefName = r.getReferenceName(); int oldStart = r.getAlignmentStart(); int oldEnd = r.getAlignmentEnd(); - + r.setAlignmentStart(refPos); r.setHeader(h); // have to substitute here, or setReferenceIndex will not work correctly below @@ -233,11 +237,11 @@ public class GenomicMap implements Iterable= currStop ) { - // deletion wrt the custom reference touches the end of current segment or extends past it - int currDeletionLength = (int)(currStop - refPos + 1); // that many bases are deleted from the current segment - deletionLength += currDeletionLength; - len -= currDeletionLength; // we still have 'len' deleted bases to record + case M: +/////////// + if ( delayedGap > 0 ) { + // we get here if previous M or D element ended exactly at the interval boundary; we need + // to add the stretch of N's only if that element turned out to be not the last one, so we do it now + newCigar.add(new CigarElement(delayedGap, CigarOperator.N)); + delayedGap = 0; + } + while ( refPos + len - 1 > currStop ) { // current D or M cigar element extends beyond the end of current segment - if ( k != N-1 || len > 0 ) { - // we did not finish with current deletion and/or there are more cigar - // elements to come, so we need to add pseudo-deletion to jump to the next segment: - if ( ! iter.hasNext() ) { - String message = "Record "+r.getReadName()+" extends beyond its custom contig."+ - "\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+ - r.getCigarString()+"; contig length="+contigLength(segments); - if ( discardCrossContig ) { - // System.out.println("WARNING: ALIGNMENT DISCARDED: "+message); - return null; - } else throw new StingException(message); - } + // we have that many bases in the current cigar element till the end of the current segment: + int currLength = (int)(currStop-refPos+1); - gl = iter.next(); // advance to next segment + newCigar.add(new CigarElement( currLength,ce.getOperator()) ); // record deletion/match till the end of the current segment + len -= currLength; // we still have 'len' bases remaining in the current cigar element - refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref - - if ( gl.getContigIndex() != r.getReferenceIndex() ) - throw new StingException("Contig "+oldRefName+" has segments on different master contigs: currently unsupported"); - - if ( refPos < currStop + 1 ) - throw new StingException("Contig "+oldRefName+" has segments that are out of order or strictly adjacent: currently unsupported"); - - // add "deletion" w/respect to the master ref over the region between adjacent segments: - deletionLength += (int)(refPos-currStop-1); + // NOTE: since we entered the loop, we were guaranteed that len > currLength, so now len > 0 - currStop = gl.getStop(); - } - } - - deletionLength += len; // remaining deleted bases on the current segment; we are guaranteed that len < segment length - refPos += len; // advance ref position to next base after deletion - newCigar.add( new CigarElement(deletionLength, CigarOperator.D)); - break; - - case M: // we have a span of 'len' matching bases w/respect to custom contig. - // These bases can extend over the boundaries of the segments contig is built of: - - while ( refPos + len - 1 > currStop ) { // matching bases extend beyond current segment + // check if we have next segment to extend remaining matching bases to; if we don't, something's awfully wrong: + if ( ! iter.hasNext() ) { + String message = "Record "+r.getReadName()+" extends beyond its custom contig."+ + "\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+ + r.getCigarString()+"; contig length="+contigLength(segments); + if ( discardCrossContig ) { + // System.out.println("WARNING: ALIGNMENT DISCARDED: "+message); + return null; + } else throw new StingException(message); + } - // we have that many matching bases till the end of the current segment: - int currMatchLength = (int)(currStop-refPos+1); - if ( currMatchLength > 0 ) { - // will be non-positive if previous M or D cigar element ended exactly at the end of current segment; - // in that case we will not add an empty M element but proceed to adding pseudo-"deletion" - newCigar.add(new CigarElement(currMatchLength,CigarOperator.M)); // record match till the end of the current segment - len -= currMatchLength; // we still have 'len' matching bases to record - } + gl = iter.next(); // advance to next segment - // check if we have next segment to extend remaining matching bases to; if we don't, something's awfully wrong: - if ( ! iter.hasNext() ) { - String message = "Record "+r.getReadName()+" extends beyond its custom contig."+ - "\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+ - r.getCigarString()+"; contig length="+contigLength(segments); - if ( discardCrossContig ) { - // System.out.println("WARNING: ALIGNMENT DISCARDED: "+message); - return null; - } else throw new StingException(message); - } - - gl = iter.next(); // advance to next segment - - refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref - - if ( gl.getContigIndex() != r.getReferenceIndex() ) - throw new StingException("Contig "+oldRefName+ - " has segments on different master contigs: currently unsupported"); - - if ( refPos < currStop + 1 ) - throw new StingException("Contig "+oldRefName+ - " has segments that are out of order or strictly adjacent: currently unsupported"); - if ( refPos > currStop + 1 ) { - // add "deletion" w/respect to the master ref over the region between adjacent segments - // (and do not add anything if segments are strictly adjacent, i.e. refPos == currStop+1): - newCigar.add(new CigarElement((int)(refPos-currStop-1),CigarOperator.D)); - } - currStop = gl.getStop(); - // now we can continue with recording remaining matching bases over the current segment - } - // we get here when remaining matching bases fit completely inside the current segment: - newCigar.add(new CigarElement(len,CigarOperator.M)); - refPos+=len; - - // NOTE: if matching bases touched the end of current segment, but did not extend beyond it, - // we did not record the pseudo-"deletion" after the current segment yet, but refPos in that - // case is set to next base after the segment end; we will detect this later. - break; + refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref + + if ( gl.getContigIndex() != r.getReferenceIndex() ) + throw new StingException("Contig "+oldRefName+ + " has segments on different master contigs: currently unsupported"); + + if ( refPos < currStop + 1 ) + throw new StingException("Contig "+oldRefName+ + " has segments that are out of order or strictly adjacent: currently unsupported"); + if ( len > 0 && refPos > currStop + 1 ) { + // add "panning" N's w/respect to the master ref over the region between adjacent segments + // (and do not add anything if segments are strictly adjacent, i.e. refPos == currStop+1): + newCigar.add(new CigarElement((int)(refPos-currStop-1),CigarOperator.N)); + } else { + // we jumped onto the next interval, but the current cigar element ended exactly + // at the end of the previous interval. We will need to end later, only if more M/D elements follow: + delayedGap = (int)(refPos-currStop-1); + } + currStop = gl.getStop(); + // now we can continue with recording remaining matching bases over the current segment + } + // we get here when remaining matching bases fit completely inside the current segment: + if ( len > 0 ) newCigar.add(new CigarElement(len,ce.getOperator())); + refPos+=len; + + break; +//////////// } } @@ -361,7 +324,7 @@ public class GenomicMap implements Iterable nameIterator() { return map.keySet().iterator(); } public Set nameSet() { return map.keySet(); } - /** Returns an iterator into the specified collection of segments that points to the segment that contains + /** Returns an iterator into the specified collection of segments that points right before the segment that contains * specified position, and the offset of the position inside that segment. This helper method assumes that * there is a "custom" contig built of intervals on the "master" reference; the first argument specifies * the mapping (i.e. an ordered collection of master reference intervals the custom contig is built of), and the second argument