Use reads that span multiple intervals

* RR will now compress reads that span across multiple intervals correctly and output them in the correct order.
* Fixed bug in getReadCoordinateForReferenceCoordinate where if the requested reference coordinate fell inside a deletion in the read the read would be clipped up to one element past the deletion.
This commit is contained in:
Mauricio Carneiro 2011-09-27 16:49:29 -04:00
parent 84bd355690
commit 3b6e43b7c4
4 changed files with 150 additions and 23 deletions

View File

@ -249,7 +249,7 @@ public class ClippingOp {
@Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"})
private SAMRecord hardClip (SAMRecord read, int start, int stop) {
if (start == 0 && stop == read.getReadLength() -1)
if (start == 0 && stop == read.getReadLength() - 1)
return new SAMRecord(read.getHeader());
// If the read is unmapped there is no Cigar string and neither should we create a new cigar string

View File

@ -4,6 +4,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -68,25 +69,15 @@ public class ReadClipper {
}
private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop ) {
// stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read));
if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
}
//This tries to fix the bug where the deletion is counted a read base and as a result, the hardCLipper runs into
//an endless loop when hard clipping the cigar string because the read coordinates are not covered by the read
// stop -= numDeletions(read);
// if ( start > stop )
// start -= numDeletions(read);
//System.out.println("Clipping start/stop: " + start + "/" + stop);
this.addOp(new ClippingOp(start, stop));
SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
this.ops = null;

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import java.util.Comparator;
public class AlignmentStartWithNoTiesComparator implements Comparator<SAMRecord> {
@Requires("c1 >= 0 && c2 >= 0")
@Ensures("result == 0 || result == 1 || result == -1")
private int compareContigs(int c1, int c2) {
if (c1 == c2)
return 0;
else if (c1 > c2)
return 1;
return -1;
}
@Requires("r1 != null && r2 != null")
@Ensures("result == 0 || result == 1 || result == -1")
public int compare(SAMRecord r1, SAMRecord r2) {
int result;
if (r1 == r2)
result = 0;
else if (r1.getReadUnmappedFlag())
result = 1;
else if (r2.getReadUnmappedFlag())
result = -1;
else {
final int cmpContig = compareContigs(r1.getReferenceIndex(), r2.getReferenceIndex());
if (cmpContig != 0)
result = cmpContig;
else {
if (r1.getAlignmentStart() < r2.getAlignmentStart()) result = -1;
else result = 1;
}
}
return result;
}
}

View File

@ -780,11 +780,56 @@ public class ReadUtils {
}
public enum ClippingTail {
LEFT_TAIL,
RIGHT_TAIL
}
/**
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(SAMRecord, int) in case it falls in
* a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns
* the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the
* deletion.
*
* @param read
* @param refCoord
* @param tail
* @return the read coordinate corresponding to the requested reference coordinate for clipping.
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst();
if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
readCoord++;
return readCoord;
}
/**
* Returns the read coordinate corresponding to the requested reference coordinate.
*
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion.
*
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(SAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs.
*
* @param read
* @param refCoord
* @return the read coordinate corresponding to the requested reference coordinate. (see warning!)
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
int readBases = 0;
int refBases = 0;
boolean fallsInsideDeletion = false;
if (refCoord < read.getAlignmentStart()) {
readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord);
@ -806,26 +851,56 @@ public class ReadUtils {
int shift = 0;
if (cigarElement.getOperator().consumesReferenceBases()) {
if (refBases + cigarElement.getLength() < goal) {
if (refBases + cigarElement.getLength() < goal)
shift = cigarElement.getLength();
}
else {
else
shift = goal - refBases;
}
refBases += shift;
}
goalReached = refBases == goal;
if (cigarElement.getOperator().consumesReadBases()) {
readBases += goalReached ? shift : cigarElement.getLength();
}
if (!goalReached && cigarElement.getOperator().consumesReadBases())
readBases += cigarElement.getLength();
if (goalReached) {
// Is this base's reference position within this cigar element? Or did we use it all?
boolean endsWithinCigar = shift < cigarElement.getLength();
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
CigarElement nextCigarElement;
// if we end inside the current cigar element, we just have to check if it is a deletion
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;
// if we end outside the current cigar element, we need to check if the next element is a deletion.
else {
nextCigarElement = cigarElementIterator.next();
fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
}
// If we reached our goal outside a deletion, add the shift
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
readBases += shift;
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
readBases += shift - 1;
}
}
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
}
return readBases;
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
}
public static SAMRecord unclipSoftClippedBases(SAMRecord rec) {
@ -871,4 +946,19 @@ public class ReadUtils {
return rec;
}
/**
* Compares two SAMRecords only the basis on alignment start. Note that
* comparisons are performed ONLY on the basis of alignment start; any
* two SAM records with the same alignment start will be considered equal.
*
* Unmapped alignments will all be considered equal.
*/
@Requires({"read1 != null", "read2 != null"})
@Ensures("result == 0 || result == 1 || result == -1")
public static int compareSAMRecords(SAMRecord read1, SAMRecord read2) {
AlignmentStartComparator comp = new AlignmentStartComparator();
return comp.compare(read1, read2);
}
}