Now GenomicMap maps (and RemapAlignment outputs) regions between intervals on the master reference as 'N' cigar elements, not 'D'. 'D' is now used only for bona fide deletions.

Also: do not die if alignment record does not have NM tags (but mapping quality will not be recomputed after remapping/reducing for the lack of required data)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1411 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-08-11 21:10:17 +00:00
parent 2c3f56cb8d
commit e4acd14675
2 changed files with 71 additions and 103 deletions

View File

@ -214,12 +214,17 @@ public class RemapAlignments extends CommandLineProgram {
int minNM = 1000000;
int cnt = 0; // count of best alignments
for ( SAMRecord r : reads ) {
int nm = (Integer)r.getAttribute("NM");
Object attr = r.getAttribute("NM");
if ( attr == null ) {
return; // can not recompute qualities!
}
int nm = (Integer)attr;
if ( nm < minNM ) {
minNM = nm;
cnt = 1;
} else if ( nm == minNM ) cnt++;
}
// now reset counts and quals:
for ( SAMRecord r : reads ) {

View File

@ -207,22 +207,26 @@ public class GenomicMap implements Iterable<Map.Entry<String, Collection<GenomeL
if ( r.getReadUnmappedFlag() ) return r; // nothing to do if read is unmapped
int customStart = r.getAlignmentStart();
// get mapping from read's contig onto a "global" contig (as a list of intervals on the latter):
Collection<GenomeLoc> segments = getContigMapping(r.getReferenceName());
if ( segments == null ) throw new StingException("Can not remap a record: unknown custom contig name "+r.getReferenceName());
// scroll the list of intervals until we find the interval that the alignment start falls into:
Pair<? extends Iterator<GenomeLoc>, Integer> p = seekForward(segments,customStart);
Iterator<GenomeLoc> iter = p.first;
GenomeLoc gl = iter.next(); // initialization: get first interval
GenomeLoc gl = iter.next(); // initialization: get interval that contains customStart
// p.second is 1-based position of alignment start relative to interval gl;
// hence refPos is 1-based position of the alignment start on the master ref (since gl.getStart() is 1-based too)
int refPos = (int)(p.second+gl.getStart()-1);
String oldRefName = r.getReferenceName();
int oldStart = r.getAlignmentStart();
int oldEnd = r.getAlignmentEnd();
r.setAlignmentStart(refPos);
r.setHeader(h); // have to substitute here, or setReferenceIndex will not work correctly below
@ -233,11 +237,11 @@ public class GenomicMap implements Iterable<Map.Entry<String, Collection<GenomeL
Cigar newCigar = new Cigar();
int N = oldCigar.numCigarElements() ;
long currStop = gl.getStop();// end of the current segment of the custom contig on the master reference
long currStop = gl.getStop();// end of the current segment of the custom contig on the master reference, 1-based inclusive
int delayedGap = 0 ; // length of the 'N' gap between the segments (intervals) on the master ref, to be added only if followed by another cigar element
for ( int k = 0; k < N ; k++ ) {
CigarElement ce = oldCigar.getCigarElement(k);
int len = ce.getLength();
int len = ce.getLength(); // length of the cigar element
switch( ce.getOperator() ) {
case S: // soft clip
case H: // or hard clip - these are not included in getAlignmentStart, so pass them through
@ -245,108 +249,67 @@ public class GenomicMap implements Iterable<Map.Entry<String, Collection<GenomeL
throw new StingException("Don't know what to do with S or N cigar element that is not at the either end of the cigar. Cigar: "+
r.getCigarString());
case I: // insertions are passed through as well
newCigar.add(ce);
newCigar.add(new CigarElement(len,ce.getOperator()));
break;
case D:
// deletion would be easy except for one quirk: deletion can span over the boundary of adjacent segments on the custom
// contig; since that boundary itself will be encoded as a "deletion", we need to merge such overlapping real and
// pseudo-deletions into a single deletion element:
int deletionLength = 0;
while ( refPos + len - 1 >= currStop ) {
// deletion wrt the custom reference touches the end of current segment or extends past it
int currDeletionLength = (int)(currStop - refPos + 1); // that many bases are deleted from the current segment
deletionLength += currDeletionLength;
len -= currDeletionLength; // we still have 'len' deleted bases to record
case M:
///////////
if ( delayedGap > 0 ) {
// we get here if previous M or D element ended exactly at the interval boundary; we need
// to add the stretch of N's only if that element turned out to be not the last one, so we do it now
newCigar.add(new CigarElement(delayedGap, CigarOperator.N));
delayedGap = 0;
}
while ( refPos + len - 1 > currStop ) { // current D or M cigar element extends beyond the end of current segment
if ( k != N-1 || len > 0 ) {
// we did not finish with current deletion and/or there are more cigar
// elements to come, so we need to add pseudo-deletion to jump to the next segment:
if ( ! iter.hasNext() ) {
String message = "Record "+r.getReadName()+" extends beyond its custom contig."+
"\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+
r.getCigarString()+"; contig length="+contigLength(segments);
if ( discardCrossContig ) {
// System.out.println("WARNING: ALIGNMENT DISCARDED: "+message);
return null;
} else throw new StingException(message);
}
// we have that many bases in the current cigar element till the end of the current segment:
int currLength = (int)(currStop-refPos+1);
gl = iter.next(); // advance to next segment
newCigar.add(new CigarElement( currLength,ce.getOperator()) ); // record deletion/match till the end of the current segment
len -= currLength; // we still have 'len' bases remaining in the current cigar element
refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref
if ( gl.getContigIndex() != r.getReferenceIndex() )
throw new StingException("Contig "+oldRefName+" has segments on different master contigs: currently unsupported");
if ( refPos < currStop + 1 )
throw new StingException("Contig "+oldRefName+" has segments that are out of order or strictly adjacent: currently unsupported");
// add "deletion" w/respect to the master ref over the region between adjacent segments:
deletionLength += (int)(refPos-currStop-1);
// NOTE: since we entered the loop, we were guaranteed that len > currLength, so now len > 0
currStop = gl.getStop();
}
}
deletionLength += len; // remaining deleted bases on the current segment; we are guaranteed that len < segment length
refPos += len; // advance ref position to next base after deletion
newCigar.add( new CigarElement(deletionLength, CigarOperator.D));
break;
case M: // we have a span of 'len' matching bases w/respect to custom contig.
// These bases can extend over the boundaries of the segments contig is built of:
while ( refPos + len - 1 > currStop ) { // matching bases extend beyond current segment
// check if we have next segment to extend remaining matching bases to; if we don't, something's awfully wrong:
if ( ! iter.hasNext() ) {
String message = "Record "+r.getReadName()+" extends beyond its custom contig."+
"\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+
r.getCigarString()+"; contig length="+contigLength(segments);
if ( discardCrossContig ) {
// System.out.println("WARNING: ALIGNMENT DISCARDED: "+message);
return null;
} else throw new StingException(message);
}
// we have that many matching bases till the end of the current segment:
int currMatchLength = (int)(currStop-refPos+1);
if ( currMatchLength > 0 ) {
// will be non-positive if previous M or D cigar element ended exactly at the end of current segment;
// in that case we will not add an empty M element but proceed to adding pseudo-"deletion"
newCigar.add(new CigarElement(currMatchLength,CigarOperator.M)); // record match till the end of the current segment
len -= currMatchLength; // we still have 'len' matching bases to record
}
gl = iter.next(); // advance to next segment
// check if we have next segment to extend remaining matching bases to; if we don't, something's awfully wrong:
if ( ! iter.hasNext() ) {
String message = "Record "+r.getReadName()+" extends beyond its custom contig."+
"\nRead aligns to: "+oldRefName+":"+oldStart+"-"+oldEnd+"; cigar="+
r.getCigarString()+"; contig length="+contigLength(segments);
if ( discardCrossContig ) {
// System.out.println("WARNING: ALIGNMENT DISCARDED: "+message);
return null;
} else throw new StingException(message);
}
gl = iter.next(); // advance to next segment
refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref
if ( gl.getContigIndex() != r.getReferenceIndex() )
throw new StingException("Contig "+oldRefName+
" has segments on different master contigs: currently unsupported");
if ( refPos < currStop + 1 )
throw new StingException("Contig "+oldRefName+
" has segments that are out of order or strictly adjacent: currently unsupported");
if ( refPos > currStop + 1 ) {
// add "deletion" w/respect to the master ref over the region between adjacent segments
// (and do not add anything if segments are strictly adjacent, i.e. refPos == currStop+1):
newCigar.add(new CigarElement((int)(refPos-currStop-1),CigarOperator.D));
}
currStop = gl.getStop();
// now we can continue with recording remaining matching bases over the current segment
}
// we get here when remaining matching bases fit completely inside the current segment:
newCigar.add(new CigarElement(len,CigarOperator.M));
refPos+=len;
// NOTE: if matching bases touched the end of current segment, but did not extend beyond it,
// we did not record the pseudo-"deletion" after the current segment yet, but refPos in that
// case is set to next base after the segment end; we will detect this later.
break;
refPos = (int)gl.getStart(); // we jump to the start of next segment on the master ref
if ( gl.getContigIndex() != r.getReferenceIndex() )
throw new StingException("Contig "+oldRefName+
" has segments on different master contigs: currently unsupported");
if ( refPos < currStop + 1 )
throw new StingException("Contig "+oldRefName+
" has segments that are out of order or strictly adjacent: currently unsupported");
if ( len > 0 && refPos > currStop + 1 ) {
// add "panning" N's w/respect to the master ref over the region between adjacent segments
// (and do not add anything if segments are strictly adjacent, i.e. refPos == currStop+1):
newCigar.add(new CigarElement((int)(refPos-currStop-1),CigarOperator.N));
} else {
// we jumped onto the next interval, but the current cigar element ended exactly
// at the end of the previous interval. We will need to end later, only if more M/D elements follow:
delayedGap = (int)(refPos-currStop-1);
}
currStop = gl.getStop();
// now we can continue with recording remaining matching bases over the current segment
}
// we get here when remaining matching bases fit completely inside the current segment:
if ( len > 0 ) newCigar.add(new CigarElement(len,ce.getOperator()));
refPos+=len;
break;
////////////
}
}
@ -361,7 +324,7 @@ public class GenomicMap implements Iterable<Map.Entry<String, Collection<GenomeL
public Iterator<String> nameIterator() { return map.keySet().iterator(); }
public Set<String> nameSet() { return map.keySet(); }
/** Returns an iterator into the specified collection of segments that points to the segment that contains
/** Returns an iterator into the specified collection of segments that points right before the segment that contains
* specified position, and the offset of the position inside that segment. This helper method assumes that
* there is a "custom" contig built of intervals on the "master" reference; the first argument specifies
* the mapping (i.e. an ordered collection of master reference intervals the custom contig is built of), and the second argument