Use reads that span multiple intervals

* RR will now compress reads that span across multiple intervals correctly and output them in the correct order.
* Fixed bug in getReadCoordinateForReferenceCoordinate where if the requested reference coordinate fell inside a deletion in the read the read would be clipped up to one element past the deletion.
This commit is contained in:
Mauricio Carneiro 2011-09-27 16:49:29 -04:00
parent 84bd355690
commit 3b6e43b7c4
4 changed files with 150 additions and 23 deletions

View File

@ -4,6 +4,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.CigarElement; import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -68,25 +69,15 @@ public class ReadClipper {
} }
private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (start < 0 || stop > read.getReadLength() - 1) if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop ) { if ( start > stop )
// stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read));
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
}
//This tries to fix the bug where the deletion is counted a read base and as a result, the hardCLipper runs into
//an endless loop when hard clipping the cigar string because the read coordinates are not covered by the read
// stop -= numDeletions(read);
// if ( start > stop )
// start -= numDeletions(read);
//System.out.println("Clipping start/stop: " + start + "/" + stop);
this.addOp(new ClippingOp(start, stop)); this.addOp(new ClippingOp(start, stop));
SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
this.ops = null; this.ops = null;

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import java.util.Comparator;
public class AlignmentStartWithNoTiesComparator implements Comparator<SAMRecord> {
@Requires("c1 >= 0 && c2 >= 0")
@Ensures("result == 0 || result == 1 || result == -1")
private int compareContigs(int c1, int c2) {
if (c1 == c2)
return 0;
else if (c1 > c2)
return 1;
return -1;
}
@Requires("r1 != null && r2 != null")
@Ensures("result == 0 || result == 1 || result == -1")
public int compare(SAMRecord r1, SAMRecord r2) {
int result;
if (r1 == r2)
result = 0;
else if (r1.getReadUnmappedFlag())
result = 1;
else if (r2.getReadUnmappedFlag())
result = -1;
else {
final int cmpContig = compareContigs(r1.getReferenceIndex(), r2.getReferenceIndex());
if (cmpContig != 0)
result = cmpContig;
else {
if (r1.getAlignmentStart() < r2.getAlignmentStart()) result = -1;
else result = 1;
}
}
return result;
}
}

View File

@ -780,11 +780,56 @@ public class ReadUtils {
} }
public enum ClippingTail {
LEFT_TAIL,
RIGHT_TAIL
}
/**
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(SAMRecord, int) in case it falls in
* a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns
* the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the
* deletion.
*
* @param read
* @param refCoord
* @param tail
* @return the read coordinate corresponding to the requested reference coordinate for clipping.
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"}) @Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst();
if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
readCoord++;
return readCoord;
}
/**
* Returns the read coordinate corresponding to the requested reference coordinate.
*
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion.
*
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(SAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs.
*
* @param read
* @param refCoord
* @return the read coordinate corresponding to the requested reference coordinate. (see warning!)
*/
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static Pair<Integer, Boolean> getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
int readBases = 0; int readBases = 0;
int refBases = 0; int refBases = 0;
boolean fallsInsideDeletion = false;
if (refCoord < read.getAlignmentStart()) { if (refCoord < read.getAlignmentStart()) {
readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord); readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord);
@ -806,18 +851,48 @@ public class ReadUtils {
int shift = 0; int shift = 0;
if (cigarElement.getOperator().consumesReferenceBases()) { if (cigarElement.getOperator().consumesReferenceBases()) {
if (refBases + cigarElement.getLength() < goal) { if (refBases + cigarElement.getLength() < goal)
shift = cigarElement.getLength(); shift = cigarElement.getLength();
} else
else {
shift = goal - refBases; shift = goal - refBases;
}
refBases += shift; refBases += shift;
} }
goalReached = refBases == goal; goalReached = refBases == goal;
if (cigarElement.getOperator().consumesReadBases()) { if (!goalReached && cigarElement.getOperator().consumesReadBases())
readBases += goalReached ? shift : cigarElement.getLength(); readBases += cigarElement.getLength();
if (goalReached) {
// Is this base's reference position within this cigar element? Or did we use it all?
boolean endsWithinCigar = shift < cigarElement.getLength();
// If it isn't, we need to check the next one. There should *ALWAYS* be a next one
// since we checked if the goal coordinate is within the read length, so this is just a sanity check.
if (!endsWithinCigar && !cigarElementIterator.hasNext())
throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio");
CigarElement nextCigarElement;
// if we end inside the current cigar element, we just have to check if it is a deletion
if (endsWithinCigar)
fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION;
// if we end outside the current cigar element, we need to check if the next element is a deletion.
else {
nextCigarElement = cigarElementIterator.next();
fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION;
}
// If we reached our goal outside a deletion, add the shift
if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases())
readBases += shift;
// If we reached our goal inside a deletion, but the deletion is the next cigar element then we need
// to add the shift of the current cigar element but go back to it's last element to return the last
// base before the deletion (see warning in function contracts)
else if (fallsInsideDeletion && !endsWithinCigar)
readBases += shift - 1;
} }
} }
@ -825,7 +900,7 @@ public class ReadUtils {
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
} }
return readBases; return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
} }
public static SAMRecord unclipSoftClippedBases(SAMRecord rec) { public static SAMRecord unclipSoftClippedBases(SAMRecord rec) {
@ -871,4 +946,19 @@ public class ReadUtils {
return rec; return rec;
} }
/**
* Compares two SAMRecords only the basis on alignment start. Note that
* comparisons are performed ONLY on the basis of alignment start; any
* two SAM records with the same alignment start will be considered equal.
*
* Unmapped alignments will all be considered equal.
*/
@Requires({"read1 != null", "read2 != null"})
@Ensures("result == 0 || result == 1 || result == -1")
public static int compareSAMRecords(SAMRecord read1, SAMRecord read2) {
AlignmentStartComparator comp = new AlignmentStartComparator();
return comp.compare(read1, read2);
}
} }