Fixes to ReadClipper and added Reference Coordinate clipping.

* Added reference coordinate based hard clipping functions. This allows you to set a hard cut on where you need the read to be trimmed despite indels.
* soft clipping was messing up cigar string if there was already a hard clip at the beginning of the read. Fixed.
* hard clipping now works with previously hard clipped reads.
This commit is contained in:
Mauricio Carneiro 2011-08-14 14:34:29 -04:00
parent 291d8c7596
commit 8a51732049
3 changed files with 83 additions and 25 deletions

View File

@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
@ -17,22 +18,14 @@ import java.util.Vector;
* according to the wishes of the supplid ClippingAlgorithm enum.
*/
public class ClippingOp {
public final ClippingType type;
public final int start, stop; // inclusive
public final Object extraInfo;
public ClippingOp(int start, int stop) {
this(null, start, stop, null);
}
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
// todo -- remove type and extra info
this.type = type;
this.start = start;
this.stop = stop;
this.extraInfo = extraInfo;
}
public int getLength() {
return stop - start + 1;
}
@ -119,15 +112,6 @@ public class ClippingOp {
return clippedRead;
}
/**
* What is the type of a ClippingOp?
*/
public enum ClippingType {
LOW_Q_SCORES,
WITHIN_CLIP_RANGE,
MATCHES_CLIP_SEQ
}
/**
* Given a cigar string, get the number of bases hard or soft clipped at the start
*/
@ -196,7 +180,7 @@ public class ClippingOp {
Vector<CigarElement> newElements = new Vector<CigarElement>();
for (CigarElement curElem : __cigar.getCigarElements()) {
if (!curElem.getOperator().consumesReadBases()) {
if (curLength > __startClipEnd && curLength < __endClipBegin) {
if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) {
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
}
continue;

View File

@ -1,6 +1,10 @@
package org.broadinstitute.sting.utils.clipreads;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.jets3t.service.multi.ThreadedStorageService;
import java.util.ArrayList;
import java.util.List;
@ -43,6 +47,23 @@ public class ReadClipper {
return read;
}
public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
this.addOp(new ClippingOp(start, stop));
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public SAMRecord hardClipByReadCoordinates(int start, int stop) {
this.addOp(new ClippingOp(start, stop));
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left);
this.ops = null; // reset the operations
return hardClipByReferenceCoordinates(right, read.getUnclippedEnd());
}
/**
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.

View File

@ -432,16 +432,34 @@ public class ReadUtils {
keepEnd = rec.getReadLength() - l - 1;
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
break;
case H:
// TODO -- must be handled specially
throw new ReviewedStingException("BUG: tell mark he forgot to implement this");
default:
newCigarElements.add(ce);
break;
}
}
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
// Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S
// this will happen if you soft clip a read that has been hard clipped before
// like: 5H20S => 5H20H
List<CigarElement> mergedCigarElements = new LinkedList<CigarElement>();
Iterator<CigarElement> cigarElementIterator = newCigarElements.iterator();
CigarOperator currentOperator = null;
int currentOperatorLength = 0;
while (cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
if (currentOperator != cigarElement.getOperator()) {
if (currentOperator != null)
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
currentOperator = cigarElement.getOperator();
currentOperatorLength = cigarElement.getLength();
}
else
currentOperatorLength += cigarElement.getLength();
}
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements);
}
/**
@ -460,8 +478,7 @@ public class ReadUtils {
"keepEnd < rec.getReadLength()",
"rec.getReadUnmappedFlag() || newCigarElements != null"})
@Ensures("result != null")
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
List<CigarElement> newCigarElements) {
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List<CigarElement> newCigarElements) {
int newLength = keepEnd - keepStart + 1;
if ( newLength != rec.getReadLength() ) {
try {
@ -633,7 +650,43 @@ public class ReadUtils {
return ReadAndIntervalOverlap.RIGHT_OVERLAP;
}
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
int readBases = 0;
int refBases = 0;
int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based!
boolean goalReached = false;
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
goal -= cigarElement.getLength();
}
if (cigarElement.getOperator().consumesReferenceBases()) {
if (refBases + cigarElement.getLength() < goal) {
shift = cigarElement.getLength();
}
else {
shift = goal - refBases;
}
refBases += shift;
}
goalReached = refBases == goal;
if (cigarElement.getOperator().consumesReadBases()) {
readBases += goalReached ? shift : cigarElement.getLength();
}
}
if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
return readBases;
}
}