Fixes to ReadClipper and added Reference Coordinate clipping.
* Added reference coordinate based hard clipping functions. This allows you to set a hard cut on where you need the read to be trimmed despite indels. * soft clipping was messing up cigar string if there was already a hard clip at the beginning of the read. Fixed. * hard clipping now works with previously hard clipped reads.
This commit is contained in:
parent
291d8c7596
commit
8a51732049
|
|
@ -4,6 +4,7 @@ import net.sf.samtools.Cigar;
|
||||||
import net.sf.samtools.CigarElement;
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.CigarOperator;
|
import net.sf.samtools.CigarOperator;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
|
|
@ -17,22 +18,14 @@ import java.util.Vector;
|
||||||
* according to the wishes of the supplid ClippingAlgorithm enum.
|
* according to the wishes of the supplid ClippingAlgorithm enum.
|
||||||
*/
|
*/
|
||||||
public class ClippingOp {
|
public class ClippingOp {
|
||||||
public final ClippingType type;
|
|
||||||
public final int start, stop; // inclusive
|
public final int start, stop; // inclusive
|
||||||
public final Object extraInfo;
|
|
||||||
|
|
||||||
public ClippingOp(int start, int stop) {
|
public ClippingOp(int start, int stop) {
|
||||||
this(null, start, stop, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) {
|
|
||||||
// todo -- remove type and extra info
|
|
||||||
this.type = type;
|
|
||||||
this.start = start;
|
this.start = start;
|
||||||
this.stop = stop;
|
this.stop = stop;
|
||||||
this.extraInfo = extraInfo;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public int getLength() {
|
public int getLength() {
|
||||||
return stop - start + 1;
|
return stop - start + 1;
|
||||||
}
|
}
|
||||||
|
|
@ -119,15 +112,6 @@ public class ClippingOp {
|
||||||
return clippedRead;
|
return clippedRead;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* What is the type of a ClippingOp?
|
|
||||||
*/
|
|
||||||
public enum ClippingType {
|
|
||||||
LOW_Q_SCORES,
|
|
||||||
WITHIN_CLIP_RANGE,
|
|
||||||
MATCHES_CLIP_SEQ
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a cigar string, get the number of bases hard or soft clipped at the start
|
* Given a cigar string, get the number of bases hard or soft clipped at the start
|
||||||
*/
|
*/
|
||||||
|
|
@ -196,7 +180,7 @@ public class ClippingOp {
|
||||||
Vector<CigarElement> newElements = new Vector<CigarElement>();
|
Vector<CigarElement> newElements = new Vector<CigarElement>();
|
||||||
for (CigarElement curElem : __cigar.getCigarElements()) {
|
for (CigarElement curElem : __cigar.getCigarElements()) {
|
||||||
if (!curElem.getOperator().consumesReadBases()) {
|
if (!curElem.getOperator().consumesReadBases()) {
|
||||||
if (curLength > __startClipEnd && curLength < __endClipBegin) {
|
if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) {
|
||||||
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator()));
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,10 @@
|
||||||
package org.broadinstitute.sting.utils.clipreads;
|
package org.broadinstitute.sting.utils.clipreads;
|
||||||
|
|
||||||
|
import net.sf.samtools.Cigar;
|
||||||
|
import net.sf.samtools.CigarElement;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
import org.jets3t.service.multi.ThreadedStorageService;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
@ -43,6 +47,23 @@ public class ReadClipper {
|
||||||
return read;
|
return read;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
|
||||||
|
int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
|
||||||
|
int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
|
||||||
|
this.addOp(new ClippingOp(start, stop));
|
||||||
|
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord hardClipByReadCoordinates(int start, int stop) {
|
||||||
|
this.addOp(new ClippingOp(start, stop));
|
||||||
|
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
|
||||||
|
this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left);
|
||||||
|
this.ops = null; // reset the operations
|
||||||
|
return hardClipByReferenceCoordinates(right, read.getUnclippedEnd());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
||||||
|
|
|
||||||
|
|
@ -432,16 +432,34 @@ public class ReadUtils {
|
||||||
keepEnd = rec.getReadLength() - l - 1;
|
keepEnd = rec.getReadLength() - l - 1;
|
||||||
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
||||||
break;
|
break;
|
||||||
case H:
|
|
||||||
// TODO -- must be handled specially
|
|
||||||
throw new ReviewedStingException("BUG: tell mark he forgot to implement this");
|
|
||||||
default:
|
default:
|
||||||
newCigarElements.add(ce);
|
newCigarElements.add(ce);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
|
// Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S
|
||||||
|
// this will happen if you soft clip a read that has been hard clipped before
|
||||||
|
// like: 5H20S => 5H20H
|
||||||
|
List<CigarElement> mergedCigarElements = new LinkedList<CigarElement>();
|
||||||
|
Iterator<CigarElement> cigarElementIterator = newCigarElements.iterator();
|
||||||
|
CigarOperator currentOperator = null;
|
||||||
|
int currentOperatorLength = 0;
|
||||||
|
while (cigarElementIterator.hasNext()) {
|
||||||
|
CigarElement cigarElement = cigarElementIterator.next();
|
||||||
|
if (currentOperator != cigarElement.getOperator()) {
|
||||||
|
if (currentOperator != null)
|
||||||
|
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
|
||||||
|
currentOperator = cigarElement.getOperator();
|
||||||
|
currentOperatorLength = cigarElement.getLength();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
currentOperatorLength += cigarElement.getLength();
|
||||||
|
}
|
||||||
|
mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator));
|
||||||
|
|
||||||
|
return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -460,8 +478,7 @@ public class ReadUtils {
|
||||||
"keepEnd < rec.getReadLength()",
|
"keepEnd < rec.getReadLength()",
|
||||||
"rec.getReadUnmappedFlag() || newCigarElements != null"})
|
"rec.getReadUnmappedFlag() || newCigarElements != null"})
|
||||||
@Ensures("result != null")
|
@Ensures("result != null")
|
||||||
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
|
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List<CigarElement> newCigarElements) {
|
||||||
List<CigarElement> newCigarElements) {
|
|
||||||
int newLength = keepEnd - keepStart + 1;
|
int newLength = keepEnd - keepStart + 1;
|
||||||
if ( newLength != rec.getReadLength() ) {
|
if ( newLength != rec.getReadLength() ) {
|
||||||
try {
|
try {
|
||||||
|
|
@ -633,7 +650,43 @@ public class ReadUtils {
|
||||||
return ReadAndIntervalOverlap.RIGHT_OVERLAP;
|
return ReadAndIntervalOverlap.RIGHT_OVERLAP;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
|
||||||
|
@Ensures({"result >= 0", "result < read.getReadLength()"})
|
||||||
|
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
|
||||||
|
int readBases = 0;
|
||||||
|
int refBases = 0;
|
||||||
|
int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based!
|
||||||
|
boolean goalReached = false;
|
||||||
|
|
||||||
|
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
|
||||||
|
while (!goalReached && cigarElementIterator.hasNext()) {
|
||||||
|
CigarElement cigarElement = cigarElementIterator.next();
|
||||||
|
int shift = 0;
|
||||||
|
if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
|
||||||
|
goal -= cigarElement.getLength();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cigarElement.getOperator().consumesReferenceBases()) {
|
||||||
|
if (refBases + cigarElement.getLength() < goal) {
|
||||||
|
shift = cigarElement.getLength();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
shift = goal - refBases;
|
||||||
|
}
|
||||||
|
refBases += shift;
|
||||||
|
}
|
||||||
|
goalReached = refBases == goal;
|
||||||
|
|
||||||
|
if (cigarElement.getOperator().consumesReadBases()) {
|
||||||
|
readBases += goalReached ? shift : cigarElement.getLength();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!goalReached)
|
||||||
|
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
|
||||||
|
|
||||||
|
return readBases;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue