Fixes to ReadClipper and added Reference Coordinate clipping.
* Added reference coordinate based hard clipping functions. This allows you to set a hard cut on where you need the read to be trimmed despite indels. * soft clipping was messing up cigar string if there was already a hard clip at the beginning of the read. Fixed. * hard clipping now works with previously hard clipped reads.
This commit is contained in:
parent
291d8c7596
commit
8a51732049
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
|
|
@ -17,22 +18,14 @@ import java.util.Vector;
|
|||
* according to the wishes of the supplid ClippingAlgorithm enum.
|
||||
*/
|
||||
public class ClippingOp {
|
||||
public final ClippingType type;
|
||||
public final int start, stop; // inclusive
|
||||
public final Object extraInfo;
|
||||
|
||||
public ClippingOp(int start, int stop) {
|
||||
this(null, start, stop, null);
|
||||
}
|
||||
|
||||
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
|
||||
// todo -- remove type and extra info
|
||||
this.type = type;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.extraInfo = extraInfo;
|
||||
}
|
||||
|
||||
|
||||
public int getLength() {
|
||||
return stop - start + 1;
|
||||
}
|
||||
|
|
@ -119,15 +112,6 @@ public class ClippingOp {
|
|||
return clippedRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* What is the type of a ClippingOp?
|
||||
*/
|
||||
public enum ClippingType {
|
||||
LOW_Q_SCORES,
|
||||
WITHIN_CLIP_RANGE,
|
||||
MATCHES_CLIP_SEQ
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar string, get the number of bases hard or soft clipped at the start
|
||||
*/
|
||||
|
|
@ -196,7 +180,7 @@ public class ClippingOp {
|
|||
Vector<CigarElement> newElements = new Vector<CigarElement>();
|
||||
for (CigarElement curElem : __cigar.getCigarElements()) {
|
||||
if (!curElem.getOperator().consumesReadBases()) {
|
||||
if (curLength > __startClipEnd && curLength < __endClipBegin) {
|
||||
if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) {
|
||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||
}
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
package org.broadinstitute.sting.utils.clipreads;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.jets3t.service.multi.ThreadedStorageService;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
|
@ -43,6 +47,23 @@ public class ReadClipper {
|
|||
return read;
|
||||
}
|
||||
|
||||
public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
|
||||
int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
|
||||
int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
|
||||
this.addOp(new ClippingOp(start, stop));
|
||||
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
|
||||
public SAMRecord hardClipByReadCoordinates(int start, int stop) {
|
||||
this.addOp(new ClippingOp(start, stop));
|
||||
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
|
||||
public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
|
||||
this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left);
|
||||
this.ops = null; // reset the operations
|
||||
return hardClipByReferenceCoordinates(right, read.getUnclippedEnd());
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
||||
|
|
|
|||
|
|
@ -432,16 +432,34 @@ public class ReadUtils {
|
|||
keepEnd = rec.getReadLength() - l - 1;
|
||||
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
||||
break;
|
||||
case H:
|
||||
// TODO -- must be handled specially
|
||||
throw new ReviewedStingException("BUG: tell mark he forgot to implement this");
|
||||
|
||||
default:
|
||||
newCigarElements.add(ce);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
|
||||
// Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S
|
||||
// this will happen if you soft clip a read that has been hard clipped before
|
||||
// like: 5H20S => 5H20H
|
||||
List<CigarElement> mergedCigarElements = new LinkedList<CigarElement>();
|
||||
Iterator<CigarElement> cigarElementIterator = newCigarElements.iterator();
|
||||
CigarOperator currentOperator = null;
|
||||
int currentOperatorLength = 0;
|
||||
while (cigarElementIterator.hasNext()) {
|
||||
CigarElement cigarElement = cigarElementIterator.next();
|
||||
if (currentOperator != cigarElement.getOperator()) {
|
||||
if (currentOperator != null)
|
||||
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
|
||||
currentOperator = cigarElement.getOperator();
|
||||
currentOperatorLength = cigarElement.getLength();
|
||||
}
|
||||
else
|
||||
currentOperatorLength += cigarElement.getLength();
|
||||
}
|
||||
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
|
||||
|
||||
return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -460,8 +478,7 @@ public class ReadUtils {
|
|||
"keepEnd < rec.getReadLength()",
|
||||
"rec.getReadUnmappedFlag() || newCigarElements != null"})
|
||||
@Ensures("result != null")
|
||||
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
|
||||
List<CigarElement> newCigarElements) {
|
||||
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List<CigarElement> newCigarElements) {
|
||||
int newLength = keepEnd - keepStart + 1;
|
||||
if ( newLength != rec.getReadLength() ) {
|
||||
try {
|
||||
|
|
@ -633,7 +650,43 @@ public class ReadUtils {
|
|||
return ReadAndIntervalOverlap.RIGHT_OVERLAP;
|
||||
}
|
||||
|
||||
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
|
||||
@Ensures({"result >= 0", "result < read.getReadLength()"})
|
||||
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
|
||||
int readBases = 0;
|
||||
int refBases = 0;
|
||||
int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based!
|
||||
boolean goalReached = false;
|
||||
|
||||
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
|
||||
while (!goalReached && cigarElementIterator.hasNext()) {
|
||||
CigarElement cigarElement = cigarElementIterator.next();
|
||||
int shift = 0;
|
||||
if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
|
||||
goal -= cigarElement.getLength();
|
||||
}
|
||||
|
||||
if (cigarElement.getOperator().consumesReferenceBases()) {
|
||||
if (refBases + cigarElement.getLength() < goal) {
|
||||
shift = cigarElement.getLength();
|
||||
}
|
||||
else {
|
||||
shift = goal - refBases;
|
||||
}
|
||||
refBases += shift;
|
||||
}
|
||||
goalReached = refBases == goal;
|
||||
|
||||
if (cigarElement.getOperator().consumesReadBases()) {
|
||||
readBases += goalReached ? shift : cigarElement.getLength();
|
||||
}
|
||||
}
|
||||
|
||||
if (!goalReached)
|
||||
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
|
||||
|
||||
return readBases;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue