Better contracts and docs for the ReadClipper
* Described the ReadClipper contract in the top of the class * Added contracts where applicable * Added descriptive information to all tools in the read clipper * Organized public members and static methods together with the same javadoc
This commit is contained in:
parent
506c0e9c97
commit
35c41409a1
|
|
@ -11,17 +11,53 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A simple collection of the clipping operations to apply to a read along with its read
|
||||
* A comprehensive clipping tool.
|
||||
*
|
||||
* General Contract:
|
||||
* - All clipping operations return a new read with the clipped bases requested, it never modifies the original read.
|
||||
* - If a read is fully clipped, return an empty GATKSAMRecord, never null.
|
||||
* - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details.
|
||||
*
|
||||
*
|
||||
* There are several types of clipping to use:
|
||||
*
|
||||
* Write N's:
|
||||
* Change the bases to N's in the desired region. This can be applied anywhere in the read.
|
||||
*
|
||||
* Write Q0's:
|
||||
* Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read.
|
||||
*
|
||||
* Write both N's and Q0's:
|
||||
* Same as the two independent operations, put together.
|
||||
*
|
||||
* Soft Clipping:
|
||||
* Do not change the read, just mark the reads as soft clipped in the Cigar String
|
||||
* and adjust the alignment start and end of the read.
|
||||
*
|
||||
* Hard Clipping:
|
||||
* Creates a new read without the hard clipped bases (and base qualities). The cigar string
|
||||
* will be updated with the cigar operator H for every reference base removed (i.e. Matches,
|
||||
* Soft clipped bases and deletions, but *not* insertions). This contract with the cigar
|
||||
* is necessary to allow read.getUnclippedStart() / End() to recover the original alignment
|
||||
* of the read (before clipping).
|
||||
*
|
||||
*/
|
||||
public class ReadClipper {
|
||||
GATKSAMRecord read;
|
||||
final GATKSAMRecord read;
|
||||
boolean wasClipped;
|
||||
List<ClippingOp> ops = null;
|
||||
|
||||
/**
|
||||
* We didn't do any clipping work on this read, just leave everything as a default
|
||||
* Initializes a ReadClipper object.
|
||||
*
|
||||
* @param read
|
||||
* You can set up your clipping operations using the addOp method. When you're ready to
|
||||
* generate a new read with all the clipping operations, use clipRead().
|
||||
*
|
||||
* Note: Use this if you want to set up multiple operations on the read using the ClippingOp
|
||||
* class. If you just want to apply one of the typical modes of clipping, use the static
|
||||
* clipping functions available in this class instead.
|
||||
*
|
||||
* @param read the read to clip
|
||||
*/
|
||||
public ReadClipper(final GATKSAMRecord read) {
|
||||
this.read = read;
|
||||
|
|
@ -29,32 +65,55 @@ public class ReadClipper {
|
|||
}
|
||||
|
||||
/**
|
||||
* Add another clipping operation to apply to this read
|
||||
* Add clipping operation to the read.
|
||||
*
|
||||
* @param op
|
||||
* You can add as many operations as necessary to this read before clipping. Beware that the
|
||||
* order in which you add these operations matter. For example, if you hard clip the beginning
|
||||
* of a read first then try to hard clip the end, the indices will have changed. Make sure you
|
||||
* know what you're doing, otherwise just use the static functions below that take care of the
|
||||
* ordering for you.
|
||||
*
|
||||
* Note: You only choose the clipping mode when you use clipRead()
|
||||
*
|
||||
* @param op a ClippingOp object describing the area you want to clip.
|
||||
*/
|
||||
public void addOp(ClippingOp op) {
|
||||
if (ops == null) ops = new ArrayList<ClippingOp>();
|
||||
ops.add(op);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the list of operations set up for this read.
|
||||
*
|
||||
* @return a list of the operations set up for this read.
|
||||
*/
|
||||
public List<ClippingOp> getOps() {
|
||||
return ops;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether or not this read has been clipped.
|
||||
* @return true if this read has produced a clipped read, false otherwise.
|
||||
*/
|
||||
public boolean wasClipped() {
|
||||
return wasClipped;
|
||||
}
|
||||
|
||||
/**
|
||||
* The original read.
|
||||
*
|
||||
* @return returns the read to be clipped (original)
|
||||
*/
|
||||
public GATKSAMRecord getRead() {
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
|
||||
* Creates a new read that's been clipped according to ops and the chosen algorithm.
|
||||
* The original read is unmodified.
|
||||
*
|
||||
* @param algorithm
|
||||
* @return
|
||||
* @param algorithm What mode of clipping do you want to apply for the stacked operations.
|
||||
* @return a new read with the clipping applied.
|
||||
*/
|
||||
public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
|
||||
if (ops == null)
|
||||
|
|
@ -84,41 +143,47 @@ public class ReadClipper {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// QUICK USE UTILITY FUNCTION
|
||||
|
||||
/**
|
||||
* Hard clips the left tail of a read up to (and including) refStop using reference
|
||||
* coordinates.
|
||||
*
|
||||
* @param refStop the last base to be hard clipped in the left tail of the read.
|
||||
* @return a new read, without the left tail.
|
||||
*/
|
||||
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
|
||||
public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) {
|
||||
return hardClipByReferenceCoordinates(-1, refStop);
|
||||
}
|
||||
public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) {
|
||||
return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Hard clips the right tail of a read starting at (and including) refStart using reference
|
||||
* coordinates.
|
||||
*
|
||||
* @param refStart refStop the first base to be hard clipped in the right tail of the read.
|
||||
* @return a new read, without the right tail.
|
||||
*/
|
||||
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
|
||||
public GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) {
|
||||
return hardClipByReferenceCoordinates(refStart, -1);
|
||||
}
|
||||
|
||||
@Requires("!read.getReadUnmappedFlag()")
|
||||
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
|
||||
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
|
||||
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
|
||||
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
|
||||
return new GATKSAMRecord(read.getHeader());
|
||||
|
||||
if (start < 0 || stop > read.getReadLength() - 1)
|
||||
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
|
||||
|
||||
if ( start > stop )
|
||||
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
|
||||
|
||||
if ( start > 0 && stop < read.getReadLength() - 1)
|
||||
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));
|
||||
|
||||
this.addOp(new ClippingOp(start, stop));
|
||||
GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
this.ops = null;
|
||||
return clippedRead;
|
||||
public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) {
|
||||
return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips a read using read coordinates.
|
||||
*
|
||||
* @param start the first base to clip (inclusive)
|
||||
* @param stop the last base to clip (inclusive)
|
||||
* @return a new read, without the clipped bases
|
||||
*/
|
||||
@Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read
|
||||
"start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read
|
||||
public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) {
|
||||
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
|
||||
return new GATKSAMRecord(read.getHeader());
|
||||
|
|
@ -126,8 +191,23 @@ public class ReadClipper {
|
|||
this.addOp(new ClippingOp(start, stop));
|
||||
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) {
|
||||
return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop);
|
||||
}
|
||||
|
||||
@Requires("left <= right")
|
||||
|
||||
/**
|
||||
* Hard clips both tails of a read.
|
||||
* Left tail goes from the beginning to the 'left' coordinate (inclusive)
|
||||
* Right tail goes from the 'right' coordinate (inclusive) until the end of the read
|
||||
*
|
||||
* @param left the coordinate of the last base to be clipped in the left tail (inclusive)
|
||||
* @param right the coordinate of the first base to be clipped in the right tail (inclusive)
|
||||
* @return a new read, without the clipped bases
|
||||
*/
|
||||
@Requires({"left <= right", // tails cannot overlap
|
||||
"left >= read.getAlignmentStart()", // coordinate has to be within the mapped read
|
||||
"right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read
|
||||
public GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
|
||||
if (read.isEmpty() || left == right)
|
||||
return new GATKSAMRecord(read.getHeader());
|
||||
|
|
@ -141,7 +221,20 @@ public class ReadClipper {
|
|||
ReadClipper clipper = new ReadClipper(leftTailRead);
|
||||
return clipper.hardClipByReferenceCoordinatesLeftTail(left);
|
||||
}
|
||||
public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) {
|
||||
return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual.
|
||||
*
|
||||
* This function will look for low quality tails and hard clip them away. A low quality tail
|
||||
* ends when a base has base quality greater than lowQual.
|
||||
*
|
||||
* @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped
|
||||
* @return a new read without low quality tails
|
||||
*/
|
||||
public GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
|
||||
if (read.isEmpty())
|
||||
return read;
|
||||
|
|
@ -154,7 +247,7 @@ public class ReadClipper {
|
|||
while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
|
||||
while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++;
|
||||
|
||||
// if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now)
|
||||
// if the entire read should be clipped, then return an empty read.
|
||||
if (leftClipIndex > rightClipIndex)
|
||||
return (new GATKSAMRecord(read.getHeader()));
|
||||
|
||||
|
|
@ -166,7 +259,16 @@ public class ReadClipper {
|
|||
}
|
||||
return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
|
||||
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Will hard clip every soft clipped bases in the read.
|
||||
*
|
||||
* @return a new read without the soft clipped bases
|
||||
*/
|
||||
public GATKSAMRecord hardClipSoftClippedBases () {
|
||||
if (read.isEmpty())
|
||||
return read;
|
||||
|
|
@ -200,7 +302,18 @@ public class ReadClipper {
|
|||
|
||||
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).hardClipSoftClippedBases();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if a read contains adaptor sequences. If it does, hard clips them out.
|
||||
*
|
||||
* Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary()
|
||||
*
|
||||
* @return a new read without adaptor sequence
|
||||
*/
|
||||
public GATKSAMRecord hardClipAdaptorSequence () {
|
||||
final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read);
|
||||
|
||||
|
|
@ -209,7 +322,16 @@ public class ReadClipper {
|
|||
|
||||
return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary);
|
||||
}
|
||||
public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).hardClipAdaptorSequence();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end.
|
||||
*
|
||||
* @return a new read without leading insertions
|
||||
*/
|
||||
public GATKSAMRecord hardClipLeadingInsertions() {
|
||||
if (read.isEmpty())
|
||||
return read;
|
||||
|
|
@ -225,49 +347,54 @@ public class ReadClipper {
|
|||
}
|
||||
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
}
|
||||
|
||||
public GATKSAMRecord revertSoftClippedBases() {
|
||||
this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates
|
||||
return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// STATIC VERSIONS OF THE QUICK CLIPPING FUNCTIONS
|
||||
|
||||
public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) {
|
||||
return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) {
|
||||
return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) {
|
||||
return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) {
|
||||
return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
|
||||
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).hardClipSoftClippedBases();
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).hardClipAdaptorSequence();
|
||||
}
|
||||
|
||||
public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).hardClipLeadingInsertions();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Turns soft clipped bases into matches
|
||||
*
|
||||
* @return a new read with every soft clip turned into a match
|
||||
*/
|
||||
public GATKSAMRecord revertSoftClippedBases() {
|
||||
this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates
|
||||
return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
|
||||
}
|
||||
public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) {
|
||||
return (new ReadClipper(read)).revertSoftClippedBases();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
|
||||
* and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
|
||||
*
|
||||
* @param refStart first base to clip (inclusive)
|
||||
* @param refStop last base to clip (inclusive)
|
||||
* @return a new read, without the clipped bases
|
||||
*/
|
||||
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
|
||||
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
|
||||
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
|
||||
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
|
||||
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
|
||||
return new GATKSAMRecord(read.getHeader());
|
||||
|
||||
if (start < 0 || stop > read.getReadLength() - 1)
|
||||
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
|
||||
|
||||
if ( start > stop )
|
||||
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
|
||||
|
||||
if ( start > 0 && stop < read.getReadLength() - 1)
|
||||
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));
|
||||
|
||||
this.addOp(new ClippingOp(start, stop));
|
||||
GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
|
||||
this.ops = null;
|
||||
return clippedRead;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue