From 489c15b99d29907f1c864cac87f434a5e7294cbf Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 15 Aug 2011 01:42:34 -0400 Subject: [PATCH 01/13] Fixed indexing issue in coordinate conversion When a read had been previously soft clipped, the UnclippedEnd could not be used directly as Reference Coordinate for clipping , because the read does not go that far. --- .../sting/utils/clipreads/ReadClipper.java | 11 +++++++---- .../org/broadinstitute/sting/utils/sam/ReadUtils.java | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 1307f1b6f..00405a98c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.clipreads; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.SAMRecord; +import org.broad.tribble.util.PositionalStream; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.jets3t.service.multi.ThreadedStorageService; @@ -48,8 +49,10 @@ public class ReadClipper { } public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { - int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); - int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); + int stop = (refStop < 0) ? read.getReadLength()-1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + + System.out.println("DEBUG -- clipping start/stop: " + start + "/" + stop); this.addOp(new ClippingOp(start, stop)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); } @@ -60,9 +63,9 @@ public class ReadClipper { } public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { - this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left); + this.read = hardClipByReferenceCoordinates(-1, left); this.ops = null; // reset the operations - return hardClipByReferenceCoordinates(right, read.getUnclippedEnd()); + return hardClipByReferenceCoordinates(right, -1); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 3d803804a..5d39a01a5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -650,13 +650,13 @@ public class ReadUtils { return ReadAndIntervalOverlap.RIGHT_OVERLAP; } - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Requires({"refCoord >= read.getAlignmentStart()", "refCoord <= read.getAlignmentEnd()"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; - int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based! - boolean goalReached = false; + int goal = refCoord - read.getAlignmentStart(); // read coords are 0-based! + boolean goalReached = refBases == goal; Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { From 0d976d621162af883f282165651b59efb7335940 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 15 Aug 2011 12:04:53 -0400 Subject: [PATCH 02/13] Fixed second time clipping When a read is clipped once, and then in the second operation, because of indels, it doesn't reach the coordinate initially set for hard clipping, the indices were wrong. This should fix it. --- .../broadinstitute/sting/utils/clipreads/ReadClipper.java | 2 +- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 00405a98c..85063ad14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -50,7 +50,7 @@ public class ReadClipper { public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); - int stop = (refStop < 0) ? read.getReadLength()-1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); System.out.println("DEBUG -- clipping start/stop: " + start + "/" + stop); this.addOp(new ClippingOp(start, stop)); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 5d39a01a5..ef89a8820 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -655,16 +655,16 @@ public class ReadUtils { public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; - int goal = refCoord - read.getAlignmentStart(); // read coords are 0-based! + int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases boolean goalReached = refBases == goal; Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); while (!goalReached && cigarElementIterator.hasNext()) { CigarElement cigarElement = cigarElementIterator.next(); int shift = 0; - if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) { - goal -= cigarElement.getLength(); - } + //if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + // goal -= cigarElement.getLength(); + //} if (cigarElement.getOperator().consumesReferenceBases()) { if (refBases + cigarElement.getLength() < goal) { From 993ecb85da7e5ae15f811c551312138d53c0144c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 15 Aug 2011 15:22:54 -0400 Subject: [PATCH 03/13] Added Hard Clipping Tail Ends Added functionality to hard clip the low quality tail ends of reads (lowQual <= 2) --- .../sting/utils/clipreads/ReadClipper.java | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 85063ad14..62b25ddf3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -15,6 +15,7 @@ import java.util.List; */ public class ReadClipper { SAMRecord read; + boolean wasClipped; List ops = null; /** @@ -24,6 +25,7 @@ public class ReadClipper { */ public ReadClipper(final SAMRecord read) { this.read = read; + this.wasClipped = false; } /** @@ -41,7 +43,7 @@ public class ReadClipper { } public boolean wasClipped() { - return ops != null; + return wasClipped; } public SAMRecord getRead() { @@ -52,7 +54,7 @@ public class ReadClipper { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); - System.out.println("DEBUG -- clipping start/stop: " + start + "/" + stop); + System.out.println("Clipping start/stop: " + start + "/" + stop); this.addOp(new ClippingOp(start, stop)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); } @@ -68,6 +70,28 @@ public class ReadClipper { return hardClipByReferenceCoordinates(right, -1); } + public SAMRecord hardClipLowQualEnds(byte lowQual) { + byte [] quals = read.getBaseQualities(); + int leftClipIndex = 0; + int rightClipIndex = read.getReadLength() - 1; + + // check how far we can clip both sides + while (quals[rightClipIndex] <= lowQual) rightClipIndex--; + while (quals[leftClipIndex] <= lowQual) leftClipIndex++; + + // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now) + if (leftClipIndex > rightClipIndex) + return (new SAMRecord(read.getHeader())); + + if (rightClipIndex < read.getReadLength() - 1) { + this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1)); + } + if (leftClipIndex > 0 ) { + this.addOp(new ClippingOp(0, leftClipIndex - 1)); + } + return this.clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + /** * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. * @@ -83,6 +107,7 @@ public class ReadClipper { for (ClippingOp op : getOps()) { clippedRead = op.apply(algorithm, clippedRead); } + wasClipped = true; return clippedRead; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // this should never happen From b1355651837300b5ecc344935adebeff40aef00f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 16 Aug 2011 13:51:25 -0400 Subject: [PATCH 04/13] Added low quality clipping Clips both tails of a read if the tails are below a given quality threshold (default Q2). *Added special treatment for reads that get completely clipped. --- .../sting/utils/clipreads/ReadClipper.java | 4 +- .../sting/utils/sam/ReadUtils.java | 39 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 62b25ddf3..6248849f9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -76,8 +76,8 @@ public class ReadClipper { int rightClipIndex = read.getReadLength() - 1; // check how far we can clip both sides - while (quals[rightClipIndex] <= lowQual) rightClipIndex--; - while (quals[leftClipIndex] <= lowQual) leftClipIndex++; + while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--; + while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++; // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now) if (leftClipIndex > rightClipIndex) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index ef89a8820..79967f49f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -630,26 +630,51 @@ public class ReadUtils { * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { + + int start = getSoftUnclippedStart(read); + int stop = getSoftUnclippedStop(read); + if ( (!read.getReferenceName().equals(interval.getContig())) || - (read.getUnclippedEnd() < interval.getStart()) || - (read.getUnclippedStart() > interval.getStop()) ) + (stop < interval.getStart()) || + (start > interval.getStop()) ) return ReadAndIntervalOverlap.NO_OVERLAP; - else if ( (read.getUnclippedStart() >= interval.getStart()) && - (read.getUnclippedEnd() <= interval.getStop()) ) + else if ( (start >= interval.getStart()) && + (stop <= interval.getStop()) ) return ReadAndIntervalOverlap.CONTAINED; - else if ( (read.getUnclippedStart() < interval.getStart()) && - (read.getUnclippedEnd() > interval.getStop()) ) + else if ( (start < interval.getStart()) && + (stop > interval.getStop()) ) return ReadAndIntervalOverlap.FULL_OVERLAP; - else if ( (read.getAlignmentStart() < interval.getStart()) ) + else if ( (start < interval.getStart()) ) return ReadAndIntervalOverlap.LEFT_OVERLAP; else return ReadAndIntervalOverlap.RIGHT_OVERLAP; } + public static int getSoftUnclippedStart(SAMRecord read) { + int start = read.getUnclippedStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + return start; + } + + public static int getSoftUnclippedStop(SAMRecord read) { + int stop = getSoftUnclippedStart(read); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) + if (cigarElement.getOperator().consumesReadBases()) + stop += cigarElement.getLength(); + return stop; + } + + + @Requires({"refCoord >= read.getAlignmentStart()", "refCoord <= read.getAlignmentEnd()"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { From 07c1e113cdcca92f6cfda2651279d3a90425c3a7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 16 Aug 2011 14:18:05 -0400 Subject: [PATCH 05/13] Fixed interval traversal for previously hard clipped reads. If a read was hard clipped for being low quality and no does not overlap the interval anymore, this read will now be discarded instead of treated as an error by the GATK traversal engine. --- .../sting/utils/sam/ReadUtils.java | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 79967f49f..b25c6c475 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -148,7 +148,7 @@ public class ReadUtils { * |----------------| (interval) * <--------> (read) */ - public enum ReadAndIntervalOverlap {NO_OVERLAP, LEFT_OVERLAP, RIGHT_OVERLAP, FULL_OVERLAP, CONTAINED} + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -634,24 +634,28 @@ public class ReadUtils { int start = getSoftUnclippedStart(read); int stop = getSoftUnclippedStop(read); - if ( (!read.getReferenceName().equals(interval.getContig())) || - (stop < interval.getStart()) || - (start > interval.getStop()) ) - return ReadAndIntervalOverlap.NO_OVERLAP; + if ( !read.getReferenceName().equals(interval.getContig()) ) + return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; + + else if ( stop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; + + else if ( start > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; else if ( (start >= interval.getStart()) && (stop <= interval.getStop()) ) - return ReadAndIntervalOverlap.CONTAINED; + return ReadAndIntervalOverlap.OVERLAP_CONTAINED; else if ( (start < interval.getStart()) && (stop > interval.getStop()) ) - return ReadAndIntervalOverlap.FULL_OVERLAP; + return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; else if ( (start < interval.getStart()) ) - return ReadAndIntervalOverlap.LEFT_OVERLAP; + return ReadAndIntervalOverlap.OVERLAP_LEFT; else - return ReadAndIntervalOverlap.RIGHT_OVERLAP; + return ReadAndIntervalOverlap.OVERLAP_RIGHT; } public static int getSoftUnclippedStart(SAMRecord read) { From ed8f769dce3bc12453312aa8386a4b0f7f15b53a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 16 Aug 2011 18:54:28 -0400 Subject: [PATCH 06/13] Fixed index for getSoftUnclippedEnd() Unclipped end can be calculated simply by looking at the last cigar element and adding it's length in case it's a soft clip. --- .../org/broadinstitute/sting/utils/sam/ReadUtils.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index b25c6c475..4704b7d8f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -670,10 +670,11 @@ public class ReadUtils { } public static int getSoftUnclippedStop(SAMRecord read) { - int stop = getSoftUnclippedStart(read); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) - if (cigarElement.getOperator().consumesReadBases()) - stop += cigarElement.getLength(); + int stop = read.getAlignmentEnd(); + List cigarElementList = read.getCigar().getCigarElements(); + CigarElement lastCigarElement = cigarElementList.get(cigarElementList.size()-1); + if (lastCigarElement.getOperator() == CigarOperator.SOFT_CLIP) + stop += lastCigarElement.getLength(); return stop; } From 5d6a6fab981d596e9e31de44cb0ce86666b9e53d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 16 Aug 2011 18:56:28 -0400 Subject: [PATCH 07/13] Renamed softUnclipped functions to refCoord* These functions return reference coordinates, so they should be named accordingly. --- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 4704b7d8f..6be37cf52 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -631,8 +631,8 @@ public class ReadUtils { */ public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { - int start = getSoftUnclippedStart(read); - int stop = getSoftUnclippedStop(read); + int start = getRefCoordSoftUnclippedStart(read); + int stop = getRefCoordSoftUnclippedStop(read); if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; @@ -658,7 +658,7 @@ public class ReadUtils { return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - public static int getSoftUnclippedStart(SAMRecord read) { + public static int getRefCoordSoftUnclippedStart(SAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) @@ -669,7 +669,7 @@ public class ReadUtils { return start; } - public static int getSoftUnclippedStop(SAMRecord read) { + public static int getRefCoordSoftUnclippedStop(SAMRecord read) { int stop = read.getAlignmentEnd(); List cigarElementList = read.getCigar().getCigarElements(); CigarElement lastCigarElement = cigarElementList.get(cigarElementList.size()-1); From 6ef01e40b826383e42976d61dd31eaefb53416f8 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 18 Aug 2011 18:35:45 -0400 Subject: [PATCH 09/13] Complete rewrite of Hard Clipping (ReadClipper) Hard clipping is now completely independent from softclipping and plows through previously hard or soft clipped reads. --- .../sting/utils/clipreads/ClippingOp.java | 190 +++++++++++++++--- 1 file changed, 162 insertions(+), 28 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 7d351686f..d899613d7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -1,13 +1,19 @@ package org.broadinstitute.sting.utils.clipreads; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.apache.poi.hssf.record.PageBreakRecord; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ReadUtils; +import java.io.PipedOutputStream; +import java.lang.reflect.Array; +import java.util.Iterator; +import java.util.List; import java.util.Vector; /** @@ -31,16 +37,15 @@ public class ClippingOp { } /** - * Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping + * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * * @param algorithm - * @param clippedRead + * @param read */ - public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) { - //clippedRead.setReferenceIndex(1); - byte[] quals = clippedRead.getBaseQualities(); - byte[] bases = clippedRead.getReadBases(); + public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord read) { + byte[] quals = read.getBaseQualities(); + byte[] bases = read.getReadBases(); switch (algorithm) { // important note: @@ -49,59 +54,57 @@ public class ClippingOp { case WRITE_NS: for (int i = start; i <= stop; i++) bases[i] = 'N'; - clippedRead.setReadBases(bases); + read.setReadBases(bases); break; case WRITE_Q0S: for (int i = start; i <= stop; i++) quals[i] = 0; - clippedRead.setBaseQualities(quals); + read.setBaseQualities(quals); break; case WRITE_NS_Q0S: for (int i = start; i <= stop; i++) { bases[i] = 'N'; quals[i] = 0; } - clippedRead.setReadBases(bases); - clippedRead.setBaseQualities(quals); + read.setReadBases(bases); + read.setBaseQualities(quals); break; case HARDCLIP_BASES: + read = hardClip(read, start, stop); + break; + case SOFTCLIP_BASES: - if ( clippedRead.getReadUnmappedFlag() ) { + if ( read.getReadUnmappedFlag() ) { // we can't process unmapped reads - throw new UserException("Read Clipper cannot soft/hard clip unmapped reads"); + throw new UserException("Read Clipper cannot soft clip unmapped reads"); } - //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); + //System.out.printf("%d %d %d%n", stop, start, read.getReadLength()); int myStop = stop; - if ( (stop + 1 - start) == clippedRead.getReadLength() ) { + if ( (stop + 1 - start) == read.getReadLength() ) { // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName())); //break; myStop--; // just decrement stop } - if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", - clippedRead.getReadName(), start, myStop)); + if ( start > 0 && myStop != read.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop)); - Cigar oldCigar = clippedRead.getCigar(); + Cigar oldCigar = read.getCigar(); - int scLeft = 0, scRight = clippedRead.getReadLength(); + int scLeft = 0, scRight = read.getReadLength(); if ( start == 0 ) scLeft = myStop + 1; else scRight = start; Cigar newCigar = softClip(oldCigar, scLeft, scRight); - clippedRead.setCigar(newCigar); + read.setCigar(newCigar); int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = clippedRead.getAlignmentStart() + newClippedStart; - clippedRead.setAlignmentStart(newStart); - - if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) - clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); - //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + int newStart = read.getAlignmentStart() + newClippedStart; + read.setAlignmentStart(newStart); break; @@ -109,7 +112,7 @@ public class ClippingOp { throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } - return clippedRead; + return read; } /** @@ -247,4 +250,135 @@ public class ClippingOp { assert newCigar.isValid(null, -1) == null; return newCigar; } + + @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"}) + private SAMRecord hardClip (SAMRecord read, int start, int stop) { + if (start == 0 && stop == read.getReadLength() -1) + return new SAMRecord(read.getHeader()); + + int newLength = read.getReadLength() - (stop - start + 1); + byte [] newBases = new byte[newLength]; + byte [] newQuals = new byte[newLength]; + int copyStart = (start == 0) ? stop + 1 : 0; + + System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); + System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); Cigar newCigar = hardClipCigar(read.getCigar(), start, stop); + + SAMRecord hardClippedRead; + try { + hardClippedRead = (SAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + + hardClippedRead.setBaseQualities(newQuals); + hardClippedRead.setReadBases(newBases); + hardClippedRead.setCigar(newCigar); + if (start == 0) + hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), newCigar)); + + return hardClippedRead; + + } + + @Requires({"!cigar.isEmpty()"}) + private Cigar hardClipCigar (Cigar cigar, int start, int stop) { + Cigar newCigar = new Cigar(); + int index = 0; + int totalHardClipCount = stop - start + 1; + + // hard clip the beginning of the cigar string + if (start == 0) { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + // Skip all leading hard clips + while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + totalHardClipCount += cigarElement.getLength(); + if (cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + else + throw new ReviewedStingException("Read is entirely hardclipped, shouldn't be trying to clip it's cigar string"); + } + // keep clipping until we hit stop + while (index <= stop) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we're still clipping or just finished perfectly + if (index + shift == stop + 1) { + newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP)); + } + // element goes beyond what we need to clip + else if (index + shift > stop + 1) { + int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1); + newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP)); + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + if (index <= stop && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + } + + // add the remaining cigar elements + while (cigarElementIterator.hasNext()) { + cigarElement = cigarElementIterator.next(); + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + } + } + + // hard clip the end of the cigar string + else { + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + CigarElement cigarElement = cigarElementIterator.next(); + + // Keep marching on until we find the start + while (index < start) { + int shift = 0; + if (cigarElement.getOperator().consumesReadBases()) + shift = cigarElement.getLength(); + + // we haven't gotten to the start yet, keep everything as is. + if (index + shift < start) + newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator())); + + // element goes beyond our clip starting position + else { + int elementLengthAfterChopping = start - index; + // if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += elementLengthAfterChopping; + // otherwise, maintain what's left of this last operator + else + newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator())); + } + index += shift; + if (index < start && cigarElementIterator.hasNext()) + cigarElement = cigarElementIterator.next(); + } + newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP)); + } + return newCigar; + } + + private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) { + int shift = 0; + + // Rewind to previous start (by counting everything that was already clipped in this read) + for (CigarElement cigarElement : oldCigar.getCigarElements()) { + if (!cigarElement.getOperator().consumesReferenceBases()) + shift -= cigarElement.getLength(); + else + break; + } + + // Advance to new start (by counting everything new that has been clipped ) + for (CigarElement cigarElement : newCigar.getCigarElements()) { + if (!cigarElement.getOperator().consumesReferenceBases()) + shift += cigarElement.getLength(); + else + break; + } + return shift; + } }