diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index d82472bae..5deee122f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -11,17 +11,53 @@ import java.util.ArrayList; import java.util.List; /** - * A simple collection of the clipping operations to apply to a read along with its read + * A comprehensive clipping tool. + * + * General Contract: + * - All clipping operations return a new read with the clipped bases requested, it never modifies the original read. + * - If a read is fully clipped, return an empty GATKSAMRecord, never null. + * - When hard clipping, add cigar operator H for every *reference base* removed (i.e. Matches, SoftClips and Deletions, but *not* insertions). See Hard Clipping notes for details. + * + * + * There are several types of clipping to use: + * + * Write N's: + * Change the bases to N's in the desired region. This can be applied anywhere in the read. + * + * Write Q0's: + * Change the quality of the bases in the desired region to Q0. This can be applied anywhere in the read. + * + * Write both N's and Q0's: + * Same as the two independent operations, put together. + * + * Soft Clipping: + * Do not change the read, just mark the reads as soft clipped in the Cigar String + * and adjust the alignment start and end of the read. + * + * Hard Clipping: + * Creates a new read without the hard clipped bases (and base qualities). The cigar string + * will be updated with the cigar operator H for every reference base removed (i.e. Matches, + * Soft clipped bases and deletions, but *not* insertions). This contract with the cigar + * is necessary to allow read.getUnclippedStart() / End() to recover the original alignment + * of the read (before clipping). + * */ public class ReadClipper { - GATKSAMRecord read; + final GATKSAMRecord read; boolean wasClipped; List ops = null; /** - * We didn't do any clipping work on this read, just leave everything as a default + * Initializes a ReadClipper object. * - * @param read + * You can set up your clipping operations using the addOp method. When you're ready to + * generate a new read with all the clipping operations, use clipRead(). + * + * Note: Use this if you want to set up multiple operations on the read using the ClippingOp + * class. If you just want to apply one of the typical modes of clipping, use the static + * clipping functions available in this class instead. + * + * @param read the read to clip */ public ReadClipper(final GATKSAMRecord read) { this.read = read; @@ -29,32 +65,55 @@ public class ReadClipper { } /** - * Add another clipping operation to apply to this read + * Add clipping operation to the read. * - * @param op + * You can add as many operations as necessary to this read before clipping. Beware that the + * order in which you add these operations matter. For example, if you hard clip the beginning + * of a read first then try to hard clip the end, the indices will have changed. Make sure you + * know what you're doing, otherwise just use the static functions below that take care of the + * ordering for you. + * + * Note: You only choose the clipping mode when you use clipRead() + * + * @param op a ClippingOp object describing the area you want to clip. */ public void addOp(ClippingOp op) { if (ops == null) ops = new ArrayList(); ops.add(op); } + /** + * Check the list of operations set up for this read. + * + * @return a list of the operations set up for this read. + */ public List getOps() { return ops; } + /** + * Check whether or not this read has been clipped. + * @return true if this read has produced a clipped read, false otherwise. + */ public boolean wasClipped() { return wasClipped; } + /** + * The original read. + * + * @return returns the read to be clipped (original) + */ public GATKSAMRecord getRead() { return read; } /** - * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. + * Creates a new read that's been clipped according to ops and the chosen algorithm. + * The original read is unmodified. * - * @param algorithm - * @return + * @param algorithm What mode of clipping do you want to apply for the stacked operations. + * @return a new read with the clipping applied. */ public GATKSAMRecord clipRead(ClippingRepresentation algorithm) { if (ops == null) @@ -84,41 +143,47 @@ public class ReadClipper { } - - - // QUICK USE UTILITY FUNCTION - + /** + * Hard clips the left tail of a read up to (and including) refStop using reference + * coordinates. + * + * @param refStop the last base to be hard clipped in the left tail of the read. + * @return a new read, without the left tail. + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) { return hardClipByReferenceCoordinates(-1, refStop); } + public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); + } + + + /** + * Hard clips the right tail of a read starting at (and including) refStart using reference + * coordinates. + * + * @param refStart refStop the first base to be hard clipped in the right tail of the read. + * @return a new read, without the right tail. + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip public GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) { return hardClipByReferenceCoordinates(refStart, -1); } - - @Requires("!read.getReadUnmappedFlag()") - protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); - int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); - - if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) - return new GATKSAMRecord(read.getHeader()); - - if (start < 0 || stop > read.getReadLength() - 1) - throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); - - if ( start > stop ) - throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); - - if ( start > 0 && stop < read.getReadLength() - 1) - throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); - - this.addOp(new ClippingOp(start, stop)); - GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); - this.ops = null; - return clippedRead; + public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { + return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); } + /** + * Hard clips a read using read coordinates. + * + * @param start the first base to clip (inclusive) + * @param stop the last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"start >= 0 && stop <= read.getReadLength() - 1", // start and stop have to be within the read + "start == 0 || stop == read.getReadLength() - 1"}) // cannot clip the middle of the read public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) return new GATKSAMRecord(read.getHeader()); @@ -126,8 +191,23 @@ public class ReadClipper { this.addOp(new ClippingOp(start, stop)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { + return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); + } - @Requires("left <= right") + + /** + * Hard clips both tails of a read. + * Left tail goes from the beginning to the 'left' coordinate (inclusive) + * Right tail goes from the 'right' coordinate (inclusive) until the end of the read + * + * @param left the coordinate of the last base to be clipped in the left tail (inclusive) + * @param right the coordinate of the first base to be clipped in the right tail (inclusive) + * @return a new read, without the clipped bases + */ + @Requires({"left <= right", // tails cannot overlap + "left >= read.getAlignmentStart()", // coordinate has to be within the mapped read + "right <= read.getAlignmentEnd()"}) // coordinate has to be within the mapped read public GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { if (read.isEmpty() || left == right) return new GATKSAMRecord(read.getHeader()); @@ -141,7 +221,20 @@ public class ReadClipper { ReadClipper clipper = new ReadClipper(leftTailRead); return clipper.hardClipByReferenceCoordinatesLeftTail(left); } + public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { + return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); + } + + /** + * Hard clips any contiguous tail (left, right or both) with base quality lower than lowQual. + * + * This function will look for low quality tails and hard clip them away. A low quality tail + * ends when a base has base quality greater than lowQual. + * + * @param lowQual every base quality lower than or equal to this in the tail of the read will be hard clipped + * @return a new read without low quality tails + */ public GATKSAMRecord hardClipLowQualEnds(byte lowQual) { if (read.isEmpty()) return read; @@ -154,7 +247,7 @@ public class ReadClipper { while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++; - // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now) + // if the entire read should be clipped, then return an empty read. if (leftClipIndex > rightClipIndex) return (new GATKSAMRecord(read.getHeader())); @@ -166,7 +259,16 @@ public class ReadClipper { } return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); } + public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { + return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); + } + + /** + * Will hard clip every soft clipped bases in the read. + * + * @return a new read without the soft clipped bases + */ public GATKSAMRecord hardClipSoftClippedBases () { if (read.isEmpty()) return read; @@ -200,7 +302,18 @@ public class ReadClipper { return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipSoftClippedBases(); + } + + /** + * Checks if a read contains adaptor sequences. If it does, hard clips them out. + * + * Note: To see how a read is checked for adaptor sequence see ReadUtils.getAdaptorBoundary() + * + * @return a new read without adaptor sequence + */ public GATKSAMRecord hardClipAdaptorSequence () { final Integer adaptorBoundary = ReadUtils.getAdaptorBoundary(read); @@ -209,7 +322,16 @@ public class ReadClipper { return read.getReadNegativeStrandFlag() ? hardClipByReferenceCoordinatesLeftTail(adaptorBoundary) : hardClipByReferenceCoordinatesRightTail(adaptorBoundary); } + public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { + return (new ReadClipper(read)).hardClipAdaptorSequence(); + } + + /** + * Hard clips any leading insertions in the read. Only looks at the beginning of the read, not the end. + * + * @return a new read without leading insertions + */ public GATKSAMRecord hardClipLeadingInsertions() { if (read.isEmpty()) return read; @@ -225,49 +347,54 @@ public class ReadClipper { } return clipRead(ClippingRepresentation.HARDCLIP_BASES); } - - public GATKSAMRecord revertSoftClippedBases() { - this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates - return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); - } - - - - // STATIC VERSIONS OF THE QUICK CLIPPING FUNCTIONS - - public static GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(GATKSAMRecord read, int refStop) { - return (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, refStop); - } - - public static GATKSAMRecord hardClipByReferenceCoordinatesRightTail(GATKSAMRecord read, int refStart) { - return (new ReadClipper(read)).hardClipByReferenceCoordinates(refStart, -1); - } - - public static GATKSAMRecord hardClipByReadCoordinates(GATKSAMRecord read, int start, int stop) { - return (new ReadClipper(read)).hardClipByReadCoordinates(start, stop); - } - - public static GATKSAMRecord hardClipBothEndsByReferenceCoordinates(GATKSAMRecord read, int left, int right) { - return (new ReadClipper(read)).hardClipBothEndsByReferenceCoordinates(left, right); - } - - public static GATKSAMRecord hardClipLowQualEnds(GATKSAMRecord read, byte lowQual) { - return (new ReadClipper(read)).hardClipLowQualEnds(lowQual); - } - - public static GATKSAMRecord hardClipSoftClippedBases (GATKSAMRecord read) { - return (new ReadClipper(read)).hardClipSoftClippedBases(); - } - - public static GATKSAMRecord hardClipAdaptorSequence (GATKSAMRecord read) { - return (new ReadClipper(read)).hardClipAdaptorSequence(); - } - public static GATKSAMRecord hardClipLeadingInsertions(GATKSAMRecord read) { return (new ReadClipper(read)).hardClipLeadingInsertions(); } + + /** + * Turns soft clipped bases into matches + * + * @return a new read with every soft clip turned into a match + */ + public GATKSAMRecord revertSoftClippedBases() { + this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates + return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); + } public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { return (new ReadClipper(read)).revertSoftClippedBases(); } + + /** + * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail + * and hardClipByReferenceCoordinatesRightTail. Should not be used directly. + * + * @param refStart first base to clip (inclusive) + * @param refStop last base to clip (inclusive) + * @return a new read, without the clipped bases + */ + @Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip + protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); + int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); + + if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) + return new GATKSAMRecord(read.getHeader()); + + if (start < 0 || stop > read.getReadLength() - 1) + throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); + + if ( start > stop ) + throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + + if ( start > 0 && stop < read.getReadLength() - 1) + throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); + + this.addOp(new ClippingOp(start, stop)); + GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); + this.ops = null; + return clippedRead; + } + + }