Better contracts and docs for the ReadClipper

* Described the ReadClipper contract in the top of the class
  * Added contracts where applicable
  * Added descriptive information to all tools in the read clipper
  * Organized public members and static methods together with the same javadoc
This commit is contained in:
Mauricio Carneiro 2011-12-23 19:35:00 -05:00
parent 506c0e9c97
commit 35c41409a1
1 changed files with 202 additions and 75 deletions

View File

@ -11,17 +11,53 @@ import java.util.ArrayList;
import java.util.List;
/**
* A simple collection of the clipping operations to apply to a read along with its read
* A comprehensive clipping tool.
*
* General Contract:
* - All clipping operations return a new read with the clipped bases requested, it never modifies the original read.
* - If a read is fully clipped, return an empty GATKSAMRecord, never null.
* - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details.
*
*
* There are several types of clipping to use:
*
* Write N's:
* Change the bases to N's in the desired region. This can be applied anywhere in the read.
*
* Write Q0's:
* Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read.
*
* Write both N's and Q0's:
* Same as the two independent operations, put together.
*
* Soft Clipping:
* Do not change the read, just mark the reads as soft clipped in the Cigar String
* and adjust the alignment start and end of the read.
*
* Hard Clipping:
* Creates a new read without the hard clipped bases (and base qualities). The cigar string
* will be updated with the cigar operator H for every reference base removed (i.e. Matches,
* Soft clipped bases and deletions, but *not* insertions). This contract with the cigar
* is necessary to allow read.getUnclippedStart() / End() to recover the original alignment
* of the read (before clipping).
*
*/
public class ReadClipper {
GATKSAMRecord read;
final GATKSAMRecord read;
boolean wasClipped;
List<ClippingOp> ops = null;
/**
* We didn't do any clipping work on this read, just leave everything as a default
* Initializes a ReadClipper object.
*
* @param read
* You can set up your clipping operations using the addOp method. When you're ready to
* generate a new read with all the clipping operations, use clipRead().
*
* Note: Use this if you want to set up multiple operations on the read using the ClippingOp
* class. If you just want to apply one of the typical modes of clipping, use the static
* clipping functions available in this class instead.
*
* @param read the read to clip
*/
public ReadClipper(final GATKSAMRecord read) {
this.read = read;
@ -29,32 +65,55 @@ public class ReadClipper {
}
/**
* Add another clipping operation to apply to this read
* Add clipping operation to the read.
*
* @param op
* You can add as many operations as necessary to this read before clipping. Beware that the
* order in which you add these operations matter. For example, if you hard clip the beginning
* of a read first then try to hard clip the end, the indices will have changed. Make sure you
* know what you're doing, otherwise just use the static functions below that take care of the
* ordering for you.
*
* Note: You only choose the clipping mode when you use clipRead()
*
* @param op a ClippingOp object describing the area you want to clip.
*/
public void addOp(ClippingOp op) {
if (ops == null) ops = new ArrayList<ClippingOp>();
ops.add(op);
}
/**
* Check the list of operations set up for this read.
*
* @return a list of the operations set up for this read.
*/
public List<ClippingOp> getOps() {
return ops;
}
/**
* Check whether or not this read has been clipped.
* @return true if this read has produced a clipped read, false otherwise.
*/
public boolean wasClipped() {
return wasClipped;
}
/**
* The original read.
*
* @return returns the read to be clipped (original)
*/
public GATKSAMRecord getRead() {
return read;
}
/**
* Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
* Creates a new read that's been clipped according to ops and the chosen algorithm.
* The original read is unmodified.
*
* @param algorithm
* @return
* @param algorithm What mode of clipping do you want to apply for the stacked operations.
* @return a new read with the clipping applied.
*/
public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
if (ops == null)
@ -84,41 +143,47 @@ public class ReadClipper {
}
// QUICK USE UTILITY FUNCTION
/**
* Hard clips the left tail of a read up to (and including) refStop using reference
* coordinates.
*
* @param refStop the last base to be hard clipped in the left tail of the read.
* @return a new read, without the left tail.
*/
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) {
return hardClipByReferenceCoordinates(-1, refStop);
}
public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) {
return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop);
}
/**
* Hard clips the right tail of a read starting at (and including) refStart using reference
* coordinates.
*
* @param refStart refStop the first base to be hard clipped in the right tail of the read.
* @return a new read, without the right tail.
*/
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
public GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) {
return hardClipByReferenceCoordinates(refStart, -1);
}
@Requires("!read.getReadUnmappedFlag()")
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
return new GATKSAMRecord(read.getHeader());
if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
if ( start > 0 && stop < read.getReadLength() - 1)
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));
this.addOp(new ClippingOp(start, stop));
GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
this.ops = null;
return clippedRead;
public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) {
return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1);
}
/**
* Hard clips a read using read coordinates.
*
* @param start the first base to clip (inclusive)
* @param stop the last base to clip (inclusive)
* @return a new read, without the clipped bases
*/
@Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read
"start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read
public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) {
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
return new GATKSAMRecord(read.getHeader());
@ -126,8 +191,23 @@ public class ReadClipper {
this.addOp(new ClippingOp(start, stop));
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) {
return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop);
}
@Requires("left <= right")
/**
* Hard clips both tails of a read.
* Left tail goes from the beginning to the 'left' coordinate (inclusive)
* Right tail goes from the 'right' coordinate (inclusive) until the end of the read
*
* @param left the coordinate of the last base to be clipped in the left tail (inclusive)
* @param right the coordinate of the first base to be clipped in the right tail (inclusive)
* @return a new read, without the clipped bases
*/
@Requires({"left <= right", // tails cannot overlap
"left >= read.getAlignmentStart()", // coordinate has to be within the mapped read
"right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read
public GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
if (read.isEmpty() || left == right)
return new GATKSAMRecord(read.getHeader());
@ -141,7 +221,20 @@ public class ReadClipper {
ReadClipper clipper = new ReadClipper(leftTailRead);
return clipper.hardClipByReferenceCoordinatesLeftTail(left);
}
public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) {
return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right);
}
/**
* Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual.
*
* This function will look for low quality tails and hard clip them away. A low quality tail
* ends when a base has base quality greater than lowQual.
*
* @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped
* @return a new read without low quality tails
*/
public GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
if (read.isEmpty())
return read;
@ -154,7 +247,7 @@ public class ReadClipper {
while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++;
// if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now)
// if the entire read should be clipped, then return an empty read.
if (leftClipIndex > rightClipIndex)
return (new GATKSAMRecord(read.getHeader()));
@ -166,7 +259,16 @@ public class ReadClipper {
}
return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
}
/**
* Will hard clip every soft clipped bases in the read.
*
* @return a new read without the soft clipped bases
*/
public GATKSAMRecord hardClipSoftClippedBases () {
if (read.isEmpty())
return read;
@ -200,7 +302,18 @@ public class ReadClipper {
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) {
return (new ReadClipper(read)).hardClipSoftClippedBases();
}
/**
* Checks if a read contains adaptor sequences. If it does, hard clips them out.
*
* Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary()
*
* @return a new read without adaptor sequence
*/
public GATKSAMRecord hardClipAdaptorSequence () {
final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read);
@ -209,7 +322,16 @@ public class ReadClipper {
return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary);
}
public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) {
return (new ReadClipper(read)).hardClipAdaptorSequence();
}
/**
* Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end.
*
* @return a new read without leading insertions
*/
public GATKSAMRecord hardClipLeadingInsertions() {
if (read.isEmpty())
return read;
@ -225,49 +347,54 @@ public class ReadClipper {
}
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
public GATKSAMRecord revertSoftClippedBases() {
this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates
return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
}
// STATIC VERSIONS OF THE QUICK CLIPPING FUNCTIONS
public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) {
return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop);
}
public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) {
return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1);
}
public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) {
return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop);
}
public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) {
return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right);
}
public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) {
return (new ReadClipper(read)).hardClipLowQualEnds(lowQual);
}
public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) {
return (new ReadClipper(read)).hardClipSoftClippedBases();
}
public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) {
return (new ReadClipper(read)).hardClipAdaptorSequence();
}
public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) {
return (new ReadClipper(read)).hardClipLeadingInsertions();
}
/**
* Turns soft clipped bases into matches
*
* @return a new read with every soft clip turned into a match
*/
public GATKSAMRecord revertSoftClippedBases() {
this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates
return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES);
}
public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) {
return (new ReadClipper(read)).revertSoftClippedBases();
}
/**
* Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
* and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
*
* @param refStart first base to clip (inclusive)
* @param refStop last base to clip (inclusive)
* @return a new read, without the clipped bases
*/
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1))
return new GATKSAMRecord(read.getHeader());
if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
if ( start > 0 && stop < read.getReadLength() - 1)
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));
this.addOp(new ClippingOp(start, stop));
GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
this.ops = null;
return clippedRead;
}
}