Add support for reads starting with insertion

* Modified cleanCigarShift to allow insertions in the beginning and end of the read
      * Allowed cigars starting/ending in insertions in the systematic ReadClipper tests
      * Updated all ReadClipper unit tests
      * ReduceReads does not hard clip leading insertions by default anymore
      * SlidingWindow adjusts start location if read starts with insertion
      * SlidingWindow creates an empty element with insertions to the right
      * Fixed all potential divide by zero with totalCount() (from BaseCounts)
      * Updated all Integration tests
      * Added new integration test for multiple interval reducing
This commit is contained in:
Mauricio Carneiro 2011-12-28 12:15:53 -05:00
parent 3ecb9a0bf7
commit 94791a2a75
7 changed files with 301 additions and 243 deletions

View File

@ -70,27 +70,27 @@ public class ClippingOp {
break; break;
case SOFTCLIP_BASES: case SOFTCLIP_BASES:
if ( read.getReadUnmappedFlag() ) { if (read.getReadUnmappedFlag()) {
// we can't process unmapped reads // we can't process unmapped reads
throw new UserException("Read Clipper cannot soft clip unmapped reads"); throw new UserException("Read Clipper cannot soft clip unmapped reads");
} }
//System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); //System.out.printf("%d %d %d%n", stop, start, read.getReadLength());
int myStop = stop; int myStop = stop;
if ( (stop + 1 - start) == read.getReadLength() ) { if ((stop + 1 - start) == read.getReadLength()) {
// BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone
//Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName()));
//break; //break;
myStop--; // just decrement stop myStop--; // just decrement stop
} }
if ( start > 0 && myStop != read.getReadLength() - 1 ) if (start > 0 && myStop != read.getReadLength() - 1)
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop));
Cigar oldCigar = read.getCigar(); Cigar oldCigar = read.getCigar();
int scLeft = 0, scRight = read.getReadLength(); int scLeft = 0, scRight = read.getReadLength();
if ( start == 0 ) if (start == 0)
scLeft = myStop + 1; scLeft = myStop + 1;
else else
scRight = start; scRight = start;
@ -134,8 +134,7 @@ public class ClippingOp {
unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH));
matchesCount = 0; matchesCount = 0;
unclippedCigar.add(element); unclippedCigar.add(element);
} } else
else
unclippedCigar.add(element); unclippedCigar.add(element);
} }
if (matchesCount > 0) if (matchesCount > 0)
@ -284,10 +283,9 @@ public class ClippingOp {
} }
@Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"})
private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) {
if (start == 0 && stop == read.getReadLength() - 1) if (start == 0 && stop == read.getReadLength() - 1)
return GATKSAMRecord.emptyRead(read); return GATKSAMRecord.emptyRead(read);
// return new GATKSAMRecord(read.getHeader());
// If the read is unmapped there is no Cigar string and neither should we create a new cigar string // If the read is unmapped there is no Cigar string and neither should we create a new cigar string
@ -296,8 +294,8 @@ public class ClippingOp {
// the cigar may force a shift left or right (or both) in case we are left with insertions // the cigar may force a shift left or right (or both) in case we are left with insertions
// starting or ending the read after applying the hard clip on start/stop. // starting or ending the read after applying the hard clip on start/stop.
int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd; int newLength = read.getReadLength() - (stop - start + 1) - cigarShift.shiftFromStart - cigarShift.shiftFromEnd;
byte [] newBases = new byte[newLength]; byte[] newBases = new byte[newLength];
byte [] newQuals = new byte[newLength]; byte[] newQuals = new byte[newLength];
int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart; int copyStart = (start == 0) ? stop + 1 + cigarShift.shiftFromStart : cigarShift.shiftFromStart;
System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);
@ -321,11 +319,11 @@ public class ClippingOp {
} }
@Requires({"!cigar.isEmpty()"}) @Requires({"!cigar.isEmpty()"})
private CigarShift hardClipCigar (Cigar cigar, int start, int stop) { private CigarShift hardClipCigar(Cigar cigar, int start, int stop) {
Cigar newCigar = new Cigar(); Cigar newCigar = new Cigar();
int index = 0; int index = 0;
int totalHardClipCount = stop - start + 1; int totalHardClipCount = stop - start + 1;
int alignmentShift = 0; // caused by hard clipping insertions or deletions int alignmentShift = 0; // caused by hard clipping deletions
// hard clip the beginning of the cigar string // hard clip the beginning of the cigar string
if (start == 0) { if (start == 0) {
@ -353,7 +351,7 @@ public class ClippingOp {
// element goes beyond what we need to clip // element goes beyond what we need to clip
else if (index + shift > stop + 1) { else if (index + shift > stop + 1) {
int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1);
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop-index+1); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, stop - index + 1);
newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP));
newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
} }
@ -408,7 +406,7 @@ public class ClippingOp {
} }
// check if we are hard clipping indels // check if we are hard clipping indels
while(cigarElementIterator.hasNext()) { while (cigarElementIterator.hasNext()) {
cigarElement = cigarElementIterator.next(); cigarElement = cigarElementIterator.next();
alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength());
@ -444,34 +442,30 @@ public class ClippingOp {
boolean readHasStarted = false; boolean readHasStarted = false;
boolean addedHardClips = false; boolean addedHardClips = false;
while(!cigarStack.empty()) { while (!cigarStack.empty()) {
CigarElement cigarElement = cigarStack.pop(); CigarElement cigarElement = cigarStack.pop();
if ( !readHasStarted && if (!readHasStarted &&
cigarElement.getOperator() != CigarOperator.INSERTION && // cigarElement.getOperator() != CigarOperator.INSERTION &&
cigarElement.getOperator() != CigarOperator.DELETION && cigarElement.getOperator() != CigarOperator.DELETION &&
cigarElement.getOperator() != CigarOperator.HARD_CLIP) cigarElement.getOperator() != CigarOperator.HARD_CLIP)
readHasStarted = true; readHasStarted = true;
else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP) else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
totalHardClip += cigarElement.getLength(); totalHardClip += cigarElement.getLength();
else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION) else if (!readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
shift += cigarElement.getLength();
else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
totalHardClip += cigarElement.getLength(); totalHardClip += cigarElement.getLength();
if (readHasStarted) { if (readHasStarted) {
if (i==1) { if (i == 1) {
if (!addedHardClips) { if (!addedHardClips) {
if (totalHardClip > 0) if (totalHardClip > 0)
inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
addedHardClips = true; addedHardClips = true;
} }
inverseCigarStack.push(cigarElement); inverseCigarStack.push(cigarElement);
} } else {
else {
if (!addedHardClips) { if (!addedHardClips) {
if (totalHardClip > 0) if (totalHardClip > 0)
cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP)); cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
@ -509,7 +503,7 @@ public class ClippingOp {
} }
for (CigarElement cigarElement : oldCigar.getCigarElements()) { for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP)
oldShift += cigarElement.getLength(); oldShift += cigarElement.getLength();
else if (readHasStarted) else if (readHasStarted)
break; break;

View File

@ -374,24 +374,43 @@ public class ReadClipper {
* Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail * Generic functionality to hard clip a read, used internally by hardClipByReferenceCoordinatesLeftTail
* and hardClipByReferenceCoordinatesRightTail. Should not be used directly. * and hardClipByReferenceCoordinatesRightTail. Should not be used directly.
* *
* Note, it REQUIRES you to give the directionality of your hard clip (i.e. whether you're clipping the
* left of right tail) by specifying either refStart < 0 or refStop < 0.
*
* @param refStart first base to clip (inclusive) * @param refStart first base to clip (inclusive)
* @param refStop last base to clip (inclusive) * @param refStop last base to clip (inclusive)
* @return a new read, without the clipped bases * @return a new read, without the clipped bases
*/ */
@Requires("!read.getReadUnmappedFlag()") // can't handle unmapped reads, as we're using reference coordinates to clip @Requires({"!read.getReadUnmappedFlag()", "refStart < 0 || refStop < 0"}) // can't handle unmapped reads, as we're using reference coordinates to clip
protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); if (read.isEmpty())
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); return read;
if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) int start;
return GATKSAMRecord.emptyRead(read); int stop;
// return new GATKSAMRecord(read.getHeader());
// Determine the read coordinate to start and stop hard clipping
if (refStart < 0) {
if (refStop < 0)
throw new ReviewedStingException("Only one of refStart or refStop must be < 0, not both (" + refStart + ", " + refStop + ")");
start = 0;
stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
}
else {
if (refStop >= 0)
throw new ReviewedStingException("Either refStart or refStop must be < 0 (" + refStart + ", " + refStop + ")");
start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
stop = read.getReadLength() - 1;
}
// if ((start == 0 && stop == read.getReadLength() - 1))
// return GATKSAMRecord.emptyRead(read);
if (start < 0 || stop > read.getReadLength() - 1) if (start < 0 || stop > read.getReadLength() - 1)
throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
if ( start > stop ) if ( start > stop )
throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); throw new ReviewedStingException(String.format("START (%d) > (%d) STOP -- this should never happen -- call Mauricio!", start, stop));
if ( start > 0 && stop < read.getReadLength() - 1) if ( start > 0 && stop < read.getReadLength() - 1)
throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString()));

View File

@ -238,7 +238,7 @@ public class ArtificialSAMUtils {
*/ */
public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) { public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar); return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar);
} }

View File

@ -24,7 +24,6 @@
package org.broadinstitute.sting.utils.sam; package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.Ensures;
import net.sf.samtools.*; import net.sf.samtools.*;
import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.NGSPlatform;
@ -277,7 +276,6 @@ public class GATKSAMRecord extends BAMRecord {
* *
* @return the unclipped start of the read taking soft clips (but not hard clips) into account * @return the unclipped start of the read taking soft clips (but not hard clips) into account
*/ */
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftStart() { public int getSoftStart() {
int start = this.getUnclippedStart(); int start = this.getUnclippedStart();
for (CigarElement cigarElement : this.getCigar().getCigarElements()) { for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
@ -286,17 +284,17 @@ public class GATKSAMRecord extends BAMRecord {
else else
break; break;
} }
return start; return start;
} }
/** /**
* Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips. * Calculates the reference coordinate for the end of the read taking into account soft clips but not hard clips.
* *
* Note: getUnclippedStart() adds soft and hard clips, this function only adds soft clips. * Note: getUnclippedEnd() adds soft and hard clips, this function only adds soft clips.
* *
* @return the unclipped end of the read taking soft clips (but not hard clips) into account * @return the unclipped end of the read taking soft clips (but not hard clips) into account
*/ */
@Ensures({"result >= getUnclippedStart()", "result <= getUnclippedEnd() || ReadUtils.readIsEntirelyInsertion(this)"})
public int getSoftEnd() { public int getSoftEnd() {
int stop = this.getUnclippedStart(); int stop = this.getUnclippedStart();
@ -313,6 +311,7 @@ public class GATKSAMRecord extends BAMRecord {
else else
shift = 0; shift = 0;
} }
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
} }

View File

@ -58,7 +58,7 @@ public class ReadUtils {
/** /**
* A HashMap of the SAM spec read flag names * A HashMap of the SAM spec read flag names
* <p/> *
* Note: This is not being used right now, but can be useful in the future * Note: This is not being used right now, but can be useful in the future
*/ */
private static final Map<Integer, String> readFlagNames = new HashMap<Integer, String>(); private static final Map<Integer, String> readFlagNames = new HashMap<Integer, String>();
@ -79,49 +79,47 @@ public class ReadUtils {
/** /**
* This enum represents all the different ways in which a read can overlap an interval. * This enum represents all the different ways in which a read can overlap an interval.
* <p/> *
* NO_OVERLAP_CONTIG: * NO_OVERLAP_CONTIG:
* read and interval are in different contigs. * read and interval are in different contigs.
* <p/> *
* NO_OVERLAP_LEFT: * NO_OVERLAP_LEFT:
* the read does not overlap the interval. * the read does not overlap the interval.
* <p/> *
* |----------------| (interval) * |----------------| (interval)
* <----------------> (read) * <----------------> (read)
* <p/> *
* NO_OVERLAP_RIGHT: * NO_OVERLAP_RIGHT:
* the read does not overlap the interval. * the read does not overlap the interval.
* <p/> *
* |----------------| (interval) * |----------------| (interval)
* <----------------> (read) * <----------------> (read)
* <p/> *
* OVERLAP_LEFT: * OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it * the read starts before the beginning of the interval but ends inside of it
* <p/> *
* |----------------| (interval) * |----------------| (interval)
* <----------------> (read) * <----------------> (read)
* <p/> *
* OVERLAP_RIGHT: * OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it * the read starts inside the interval but ends outside of it
* <p/> *
* |----------------| (interval) * |----------------| (interval)
* <----------------> (read) * <----------------> (read)
* <p/> *
* OVERLAP_LEFT_AND_RIGHT: * OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval * the read starts before the interval and ends after the interval
* <p/> *
* |-----------| (interval) * |-----------| (interval)
* <-------------------> (read) * <-------------------> (read)
* <p/> *
* OVERLAP_CONTAINED: * OVERLAP_CONTAINED:
* the read starts and ends inside the interval * the read starts and ends inside the interval
* <p/> *
* |----------------| (interval) * |----------------| (interval)
* <--------> (read) * <--------> (read)
*/ */
public enum ReadAndIntervalOverlap { public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED
}
/** /**
* Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular * Creates a SAMFileWriter with the given compression level if you request a bam file. Creates a regular
@ -141,12 +139,12 @@ public class ReadUtils {
/** /**
* is this base inside the adaptor of the read? * is this base inside the adaptor of the read?
* <p/> *
* There are two cases to treat here: * There are two cases to treat here:
* <p/> *
* 1) Read is in the negative strand => Adaptor boundary is on the left tail * 1) Read is in the negative strand => Adaptor boundary is on the left tail
* 2) Read is in the positive strand => Adaptor boundary is on the right tail * 2) Read is in the positive strand => Adaptor boundary is on the right tail
* <p/> *
* Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event)
* *
* @param read the read to test * @param read the read to test
@ -166,21 +164,21 @@ public class ReadUtils {
* the read boundary. If the read is in the positive strand, this is the first base after the end of the * the read boundary. If the read is in the positive strand, this is the first base after the end of the
* fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the
* beginning of the fragment. * beginning of the fragment.
* <p/> *
* There are two cases we need to treat here: * There are two cases we need to treat here:
* <p/> *
* 1) Our read is in the reverse strand : * 1) Our read is in the reverse strand :
* <p/> *
* <----------------------| * * <----------------------| *
* |---------------------> * |--------------------->
* <p/> *
* in these cases, the adaptor boundary is at the mate start (minus one) * in these cases, the adaptor boundary is at the mate start (minus one)
* <p/> *
* 2) Our read is in the forward strand : * 2) Our read is in the forward strand :
* <p/> *
* |----------------------> * * |----------------------> *
* <----------------------| * <----------------------|
* <p/> *
* in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one)
* *
* @param read the read being tested for the adaptor boundary * @param read the read being tested for the adaptor boundary
@ -264,7 +262,7 @@ public class ReadUtils {
/** /**
* If a read starts in INSERTION, returns the first element length. * If a read starts in INSERTION, returns the first element length.
* <p/> *
* Warning: If the read has Hard or Soft clips before the insertion this function will return 0. * Warning: If the read has Hard or Soft clips before the insertion this function will return 0.
* *
* @param read * @param read
@ -272,7 +270,7 @@ public class ReadUtils {
*/ */
public final static int getFirstInsertionOffset(SAMRecord read) { public final static int getFirstInsertionOffset(SAMRecord read) {
CigarElement e = read.getCigar().getCigarElement(0); CigarElement e = read.getCigar().getCigarElement(0);
if (e.getOperator() == CigarOperator.I) if ( e.getOperator() == CigarOperator.I )
return e.getLength(); return e.getLength();
else else
return 0; return 0;
@ -280,7 +278,7 @@ public class ReadUtils {
/** /**
* If a read ends in INSERTION, returns the last element length. * If a read ends in INSERTION, returns the last element length.
* <p/> *
* Warning: If the read has Hard or Soft clips after the insertion this function will return 0. * Warning: If the read has Hard or Soft clips after the insertion this function will return 0.
* *
* @param read * @param read
@ -288,7 +286,7 @@ public class ReadUtils {
*/ */
public final static int getLastInsertionOffset(SAMRecord read) { public final static int getLastInsertionOffset(SAMRecord read) {
CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1);
if (e.getOperator() == CigarOperator.I) if ( e.getOperator() == CigarOperator.I )
return e.getLength(); return e.getLength();
else else
return 0; return 0;
@ -297,7 +295,6 @@ public class ReadUtils {
/** /**
* Determines what is the position of the read in relation to the interval. * Determines what is the position of the read in relation to the interval.
* Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison.
*
* @param read the read * @param read the read
* @param interval the interval * @param interval the interval
* @return the overlap type as described by ReadAndIntervalOverlap enum (see above) * @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
@ -309,30 +306,30 @@ public class ReadUtils {
int uStart = read.getUnclippedStart(); int uStart = read.getUnclippedStart();
int uStop = read.getUnclippedEnd(); int uStop = read.getUnclippedEnd();
if (!read.getReferenceName().equals(interval.getContig())) if ( !read.getReferenceName().equals(interval.getContig()) )
return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
else if (uStop < interval.getStart()) else if ( uStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
else if (uStart > interval.getStop()) else if ( uStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
else if (sStop < interval.getStart()) else if ( sStop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT;
else if (sStart > interval.getStop()) else if ( sStart > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT;
else if ((sStart >= interval.getStart()) && else if ( (sStart >= interval.getStart()) &&
(sStop <= interval.getStop())) (sStop <= interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_CONTAINED; return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
else if ((sStart < interval.getStart()) && else if ( (sStart < interval.getStart()) &&
(sStop > interval.getStop())) (sStop > interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
else if ((sStart < interval.getStart())) else if ( (sStart < interval.getStart()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT; return ReadAndIntervalOverlap.OVERLAP_LEFT;
else else
@ -340,36 +337,52 @@ public class ReadUtils {
} }
/** /**
* Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of
* a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns * two corner cases:
* the base prior to the deletion. If clipping the right tail (end of the read) returns the base after the *
* deletion. * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside
* a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it
* doesn't matter because it already returns the previous base by default.
*
* 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the
* read starts with an insertion, and you're requesting the first read based coordinate, it will skip
* the leading insertion (because it has the same reference coordinate as the following base).
* *
* @param read * @param read
* @param refCoord * @param refCoord
* @param tail * @param tail
* @return the read coordinate corresponding to the requested reference coordinate for clipping. * @return the read coordinate corresponding to the requested reference coordinate for clipping.
*/ */
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"})
@Ensures({"result >= 0", "result < read.getReadLength()"}) @Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) {
Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord); Pair<Integer, Boolean> result = getReadCoordinateForReferenceCoordinate(read, refCoord);
int readCoord = result.getFirst(); int readCoord = result.getFirst();
// Corner case one: clipping the right tail and falls on deletion, move to the next
// read coordinate. It is not a problem for the left tail because the default answer
// from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate.
if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL)
readCoord++; readCoord++;
// clipping the left tail and first base is insertion, go to the next read coordinate
// with the same reference coordinate. Advance to the next cigar element, or to the
// end of the read if there is no next element.
Pair<Boolean, CigarElement> firstElementIsInsertion = readStartsWithInsertion(read);
if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion.getFirst())
readCoord = Math.min(firstElementIsInsertion.getSecond().getLength(), read.getReadLength() - 1);
return readCoord; return readCoord;
} }
/** /**
* Returns the read coordinate corresponding to the requested reference coordinate. * Returns the read coordinate corresponding to the requested reference coordinate.
* <p/> *
* WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function
* will return the last read base before the deletion. This function returns a * will return the last read base before the deletion. This function returns a
* Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with
* a deletion. * a deletion.
* <p/> *
* SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a
* pre-processed result according to normal clipping needs. Or you can use this function and tailor the * pre-processed result according to normal clipping needs. Or you can use this function and tailor the
* behavior to your needs. * behavior to your needs.
@ -457,7 +470,6 @@ public class ReadUtils {
if (!goalReached) if (!goalReached)
throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?");
return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion); return new Pair<Integer, Boolean>(readBases, fallsInsideDeletion);
} }
@ -465,12 +477,11 @@ public class ReadUtils {
* Compares two SAMRecords only the basis on alignment start. Note that * Compares two SAMRecords only the basis on alignment start. Note that
* comparisons are performed ONLY on the basis of alignment start; any * comparisons are performed ONLY on the basis of alignment start; any
* two SAM records with the same alignment start will be considered equal. * two SAM records with the same alignment start will be considered equal.
* <p/> *
* Unmapped alignments will all be considered equal. * Unmapped alignments will all be considered equal.
*/ */
@Requires({"read1 != null", "read2 != null"}) @Requires({"read1 != null", "read2 != null"})
@Ensures("result == 0 || result == 1 || result == -1")
public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) {
AlignmentStartComparator comp = new AlignmentStartComparator(); AlignmentStartComparator comp = new AlignmentStartComparator();
return comp.compare(read1, read2); return comp.compare(read1, read2);
@ -502,4 +513,22 @@ public class ReadUtils {
} }
/**
* Checks if a read starts with an insertion. It looks beyond Hard and Soft clips
* if there are any.
*
* @param read
* @return A pair with the answer (true/false) and the element or null if it doesn't exist
*/
public static Pair<Boolean, CigarElement> readStartsWithInsertion(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.INSERTION)
return new Pair<Boolean, CigarElement>(true, cigarElement);
else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP)
break;
}
return new Pair<Boolean, CigarElement>(false, null);
}
} }

View File

@ -112,7 +112,8 @@ public class ReadClipperTestUtils {
} }
} }
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) // if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION)
if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION)
return true; // we don't accept reads starting or ending in deletions (add any other constraint here) return true; // we don't accept reads starting or ending in deletions (add any other constraint here)
} }
@ -190,4 +191,18 @@ public class ReadClipperTestUtils {
return invertedCigar; return invertedCigar;
} }
/**
* Checks whether or not the read has any cigar element that is not H or S
*
* @param read
* @return true if it has any M, I or D, false otherwise
*/
public static boolean readHasNonClippedBases(GATKSAMRecord read) {
for (CigarElement cigarElement : read.getCigar().getCigarElements())
if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
return true;
return false;
}
} }

View File

@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.HashMap;
import java.util.List; import java.util.List;
/** /**
@ -59,10 +59,11 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart(); int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd(); int alnEnd = read.getAlignmentEnd();
int readLength = alnStart - alnEnd; int readLength = alnStart - alnEnd;
for (int i=0; i<readLength/2; i++) { for (int i = 0; i < readLength / 2; i++) {
GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i);
Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); Assert.assertTrue(clippedRead.getAlignmentStart() >= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString()));
assertUnclippedLimits(read, clippedRead);
} }
} }
} }
@ -72,12 +73,14 @@ public class ReadClipperUnitTest extends BaseTest {
for (Cigar cigar : cigarList) { for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int readLength = read.getReadLength(); int readLength = read.getReadLength();
for (int i=0; i<readLength; i++) { for (int i = 0; i < readLength; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i); GATKSAMRecord clipLeft = ReadClipper.hardClipByReadCoordinates(read, 0, i);
Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString())); Assert.assertTrue(clipLeft.getReadLength() <= readLength - i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength-1); GATKSAMRecord clipRight = ReadClipper.hardClipByReadCoordinates(read, i, readLength - 1);
Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
} }
} }
} }
@ -86,19 +89,27 @@ public class ReadClipperUnitTest extends BaseTest {
public void testHardClipByReferenceCoordinates() { public void testHardClipByReferenceCoordinates() {
for (Cigar cigar : cigarList) { for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int alnStart = read.getAlignmentStart(); int start = read.getSoftStart();
int alnEnd = read.getAlignmentEnd(); int stop = read.getSoftEnd();
for (int i=alnStart; i<=alnEnd; i++) {
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side // System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop));
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i);
if (!clipLeft.isEmpty()) // if (ReadUtils.readIsEntirelyInsertion(read))
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); // System.out.println("debug");
for (int i = start; i <= stop; i++) {
GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i);
if (!clipLeft.isEmpty()) {
// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
} }
if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1);
GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. // System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString()));
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
} }
} }
} }
@ -111,10 +122,14 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart(); int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd(); int alnEnd = read.getAlignmentEnd();
if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side
for (int i=alnStart; i<=alnEnd; i++) { for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i);
if (!clipLeft.isEmpty())
if (!clipLeft.isEmpty()) {
// System.out.println(String.format("Left Tail [%d]: %s (%d,%d,%d : %d,%d,%d) -> %s (%d,%d,%d : %d,%d,%d)", i, cigar.toString(), read.getUnclippedStart(), read.getSoftStart(), read.getAlignmentStart(), read.getAlignmentEnd(), read.getSoftEnd(), read.getUnclippedEnd(), clipLeft.getCigarString(), clipLeft.getUnclippedStart(), clipLeft.getSoftStart(), clipLeft.getAlignmentStart(), clipLeft.getAlignmentEnd(), clipLeft.getSoftEnd(), clipLeft.getUnclippedEnd()));
Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString()));
assertUnclippedLimits(read, clipLeft);
}
} }
} }
} }
@ -127,10 +142,12 @@ public class ReadClipperUnitTest extends BaseTest {
int alnStart = read.getAlignmentStart(); int alnStart = read.getAlignmentStart();
int alnEnd = read.getAlignmentEnd(); int alnEnd = read.getAlignmentEnd();
if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side
for (int i=alnStart; i<=alnEnd; i++) { for (int i = alnStart; i <= alnEnd; i++) {
GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i);
if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those.
Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString()));
assertUnclippedLimits(read, clipRight);
}
} }
} }
} }
@ -145,43 +162,36 @@ public class ReadClipperUnitTest extends BaseTest {
for (Cigar cigar : cigarList) { for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
int readLength = read.getReadLength(); int readLength = read.getReadLength();
byte [] quals = new byte[readLength]; byte[] quals = new byte[readLength];
for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) {
Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail
// create a read with nLowQualBases in the left tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) for (int addLeft = 0; addLeft < nLowQualBases; addLeft++)
quals[addLeft] = LOW_QUAL; quals[addLeft] = LOW_QUAL;
read.setBaseQualities(quals); read.setBaseQualities(quals);
GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); GATKSAMRecord clipLeft = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests assertUnclippedLimits(read, clipLeft); // Make sure limits haven't changed
assertNoLowQualBases(clipLeft, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
// Make sure the low qualities are gone
assertNoLowQualBases(clipLeft, LOW_QUAL);
// Can't run this test with the current contract of no hanging insertions Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the right tail
// Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString()));
// create a read with nLowQualBases in the right tail
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addRight = 0; addRight < nLowQualBases; addRight++) for (int addRight = 0; addRight < nLowQualBases; addRight++)
quals[readLength - addRight - 1] = LOW_QUAL; quals[readLength - addRight - 1] = LOW_QUAL;
read.setBaseQualities(quals); read.setBaseQualities(quals);
GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); GATKSAMRecord clipRight = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests // System.out.println(String.format("Debug [%d]: %s -> %s / %s", nLowQualBases, cigar.toString(), clipLeft.getCigarString(), clipRight.getCigarString()));
// Make sure the low qualities are gone assertUnclippedLimits(read, clipRight); // Make sure limits haven't changed
assertNoLowQualBases(clipRight, LOW_QUAL); assertNoLowQualBases(clipRight, LOW_QUAL); // Make sure the low qualities are gone
Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString()));
// Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions if (nLowQualBases <= readLength / 2) {
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases on both tails
// create a read with nLowQualBases in the both tails
if (nLowQualBases <= readLength/2) {
Utils.fillArrayWithByte(quals, HIGH_QUAL);
for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) {
quals[addBoth] = LOW_QUAL; quals[addBoth] = LOW_QUAL;
quals[readLength - addBoth - 1] = LOW_QUAL; quals[readLength - addBoth - 1] = LOW_QUAL;
@ -189,83 +199,25 @@ public class ReadClipperUnitTest extends BaseTest {
read.setBaseQualities(quals); read.setBaseQualities(quals);
GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL); GATKSAMRecord clipBoth = ReadClipper.hardClipLowQualEnds(read, LOW_QUAL);
// Tests assertUnclippedLimits(read, clipBoth); // Make sure limits haven't changed
assertNoLowQualBases(clipBoth, LOW_QUAL); // Make sure the low qualities are gone
// Make sure the low qualities are gone Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, // Make sure only low quality bases were clipped
assertNoLowQualBases(clipBoth, LOW_QUAL); String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2 * nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
// Can't run this test with the current contract of no hanging insertions
//Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString()));
} }
} }
// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString()));
} }
// ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug )
final byte[] BASES = {'A','C','G','T','A','C','G','T'};
final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2};
final String CIGAR = "1S1M5I1S";
final byte[] CLIPPED_BASES = {};
final byte[] CLIPPED_QUALS = {};
final String CLIPPED_CIGAR = "";
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR);
GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR);
ReadClipperTestUtils.assertEqualReads(ReadClipper.hardClipLowQualEnds(read, (byte) 2), expected);
} }
@Test(enabled = true) @Test(enabled = true)
public void testHardClipSoftClippedBases() { public void testHardClipSoftClippedBases() {
// Generate a list of cigars to test
for (Cigar cigar : cigarList) { for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read); GATKSAMRecord clippedRead = ReadClipper.hardClipSoftClippedBases(read);
CigarCounter original = new CigarCounter(read);
CigarCounter clipped = new CigarCounter(clippedRead);
int sumHardClips = 0; assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
int sumMatches = 0; original.assertHardClippingSoftClips(clipped); // Make sure we have only clipped SOFT_CLIPS
boolean tail = true;
for (CigarElement element : read.getCigar().getCigarElements()) {
// Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right)
if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP)
tail = true;
// Adds all H, S and D's (next to hard/soft clips).
// All these should be hard clips after clipping.
if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION))
sumHardClips += element.getLength();
// this means we're no longer on the tail (insertions can still potentially be the tail because
// of the current contract of clipping out hanging insertions
else if (element.getOperator() != CigarOperator.INSERTION)
tail = false;
// Adds all matches to verify that they remain the same after clipping
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches += element.getLength();
}
for (CigarElement element : clippedRead.getCigar().getCigarElements()) {
// Test if clipped read has Soft Clips (shouldn't have any!)
Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString()));
// Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for
if (element.getOperator() == CigarOperator.HARD_CLIP)
sumHardClips -= element.getLength();
// Make sure all matches are still there
if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH)
sumMatches -= element.getLength();
}
Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips));
Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches));
// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString()));
} }
} }
@ -276,38 +228,39 @@ public class ReadClipperUnitTest extends BaseTest {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read); GATKSAMRecord clippedRead = ReadClipper.hardClipLeadingInsertions(read);
assertUnclippedLimits(read, clippedRead); // Make sure limits haven't changed
int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION);
if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar()))
expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION);
if (! clippedRead.isEmpty()) { if (!clippedRead.isEmpty()) {
Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there
Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone
} } else
else
Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped
} }
} }
} }
@Test(enabled = true) @Test(enabled = true)
public void testRevertSoftClippedBases() public void testRevertSoftClippedBases() {
{ for (Cigar cigar : cigarList) {
for (Cigar cigar: cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP);
final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read);
if ( leadingSoftClips > 0 || tailSoftClips > 0) { assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed
if (leadingSoftClips > 0 || tailSoftClips > 0) {
final int expectedStart = read.getAlignmentStart() - leadingSoftClips; final int expectedStart = read.getAlignmentStart() - leadingSoftClips;
final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; final int expectedEnd = read.getAlignmentEnd() + tailSoftClips;
Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart);
Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd);
} } else
else
Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); Assert.assertEquals(read.getCigarString(), unclipped.getCigarString());
} }
} }
@ -315,12 +268,25 @@ public class ReadClipperUnitTest extends BaseTest {
private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) {
if (!read.isEmpty()) { if (!read.isEmpty()) {
byte [] quals = read.getBaseQualities(); byte[] quals = read.getBaseQualities();
for (int i=0; i<quals.length; i++) for (int i = 0; i < quals.length; i++)
Assert.assertFalse(quals[i] <= low_qual, String.format("Found low qual (%d) base after hard clipping. Position: %d -- %s", low_qual, i, read.getCigarString())); Assert.assertFalse(quals[i] <= low_qual, String.format("Found low qual (%d) base after hard clipping. Position: %d -- %s", low_qual, i, read.getCigarString()));
} }
} }
/**
* Asserts that clipping doesn't change the getUnclippedStart / getUnclippedEnd
*
* @param original
* @param clipped
*/
private void assertUnclippedLimits(GATKSAMRecord original, GATKSAMRecord clipped) {
if (ReadClipperTestUtils.readHasNonClippedBases(clipped)) {
Assert.assertEquals(original.getUnclippedStart(), clipped.getUnclippedStart());
Assert.assertEquals(original.getUnclippedEnd(), clipped.getUnclippedEnd());
}
}
private boolean startsWithInsertion(Cigar cigar) { private boolean startsWithInsertion(Cigar cigar) {
return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0;
} }
@ -335,10 +301,46 @@ public class ReadClipperUnitTest extends BaseTest {
return 0; return 0;
} }
private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { private boolean cigarHasElementsDifferentThanInsertionsAndHardClips(Cigar cigar) {
for (CigarElement cigarElement : cigar.getCigarElements()) for (CigarElement cigarElement : cigar.getCigarElements())
if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP)
return true; return true;
return false; return false;
} }
private class CigarCounter {
private HashMap<CigarOperator, Integer> counter;
public Integer getCounterForOp(CigarOperator operator) {
return counter.get(operator);
}
public CigarCounter(GATKSAMRecord read) {
CigarOperator[] operators = CigarOperator.values();
counter = new HashMap<CigarOperator, Integer>(operators.length);
for (CigarOperator op : operators)
counter.put(op, 0);
for (CigarElement cigarElement : read.getCigar().getCigarElements())
counter.put(cigarElement.getOperator(), counter.get(cigarElement.getOperator()) + cigarElement.getLength());
}
public boolean assertHardClippingSoftClips(CigarCounter clipped) {
for (CigarOperator op : counter.keySet()) {
if (op == CigarOperator.HARD_CLIP || op == CigarOperator.SOFT_CLIP) {
int counterTotal = counter.get(CigarOperator.HARD_CLIP) + counter.get(CigarOperator.SOFT_CLIP);
int clippedHard = clipped.getCounterForOp(CigarOperator.HARD_CLIP);
int clippedSoft = clipped.getCounterForOp(CigarOperator.SOFT_CLIP);
Assert.assertEquals(counterTotal, clippedHard);
Assert.assertTrue(clippedSoft == 0);
} else
Assert.assertEquals(counter.get(op), clipped.getCounterForOp(op));
}
return true;
}
}
} }