From 78d9bf7196a76ceb901a7ec5f90c55c9f4d20457 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 19 Dec 2011 14:56:19 -0500 Subject: [PATCH] Added REVERT_SOFTCLIPPED_BASES capability to ReadClipper * New ClippingOp REVERT_SOFTCLIPPED_BASES turns soft clipped bases into matches. * Added functionality to clipping op to revert all soft clip bases in a read into matches * Added revertSoftClipBases function to the ReadClipper for public use * Wrote systematic unit tests --- .../sting/utils/clipping/ClippingOp.java | 38 ++++++++++++++++++- .../clipping/ClippingRepresentation.java | 7 +++- .../sting/utils/clipping/ReadClipper.java | 5 +++ .../utils/clipping/ReadClipperUnitTest.java | 31 ++++++++++++--- 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index f440eda5d..7eb853097 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -104,6 +104,10 @@ public class ClippingOp { break; + case REVERT_SOFTCLIPPED_BASES: + read = revertSoftClippedBases(read); + break; + default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } @@ -111,6 +115,38 @@ public class ClippingOp { return read; } + private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + GATKSAMRecord unclipped; + + // shallow copy of the read bases and quals should be fine here because they are immutable in the original read + try { + unclipped = (GATKSAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + + Cigar unclippedCigar = new Cigar(); + int matchesCount = 0; + for (CigarElement element : read.getCigar().getCigarElements()) { + if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + matchesCount += element.getLength(); + else if (matchesCount > 0) { + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + matchesCount = 0; + unclippedCigar.add(element); + } + else + unclippedCigar.add(element); + } + if (matchesCount > 0) + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + + unclipped.setCigar(unclippedCigar); + unclipped.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar)); + + return unclipped; + } + /** * Given a cigar string, get the number of bases hard or soft clipped at the start */ @@ -472,7 +508,7 @@ public class ClippingOp { for (CigarElement cigarElement : oldCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) - oldShift += Math.min(cigarElement.getLength(), newShift - oldShift); + oldShift += cigarElement.getLength(); else if (readHasStarted) break; } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java index c9d8730d1..f0765665a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java @@ -29,5 +29,10 @@ public enum ClippingRepresentation { * lossy) operation. Note that this can only be applied to cases where the clipped * bases occur at the start or end of a read. */ - HARDCLIP_BASES + HARDCLIP_BASES, + + /** + * Turn all soft-clipped bases into matches + */ + REVERT_SOFTCLIPPED_BASES, } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 6e3f37980..f19a7a04f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -213,4 +213,9 @@ public class ReadClipper { } return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + + public GATKSAMRecord revertSoftClippedBases() { + this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates + return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 3b4b0abce..e228762b7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -279,9 +279,9 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = (new ReadClipper(read)).hardClipLeadingInsertions(); - int expectedLength = read.getReadLength() - leadingInsertionLength(read.getCigar()); + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) - expectedLength -= leadingInsertionLength(ReadClipperTestUtils.invertCigar(read.getCigar())); + expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); if (! clippedRead.isEmpty()) { Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there @@ -293,6 +293,27 @@ public class ReadClipperUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testRevertSoftClippedBases() + { + for (Cigar cigar: cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = (new ReadClipper(read)).revertSoftClippedBases(); + + if ( leadingSoftClips > 0 || tailSoftClips > 0) { + final int expectedStart = read.getAlignmentStart() - leadingSoftClips; + final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; + + Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); + Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); + } + else + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + } + } private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { @@ -304,12 +325,12 @@ public class ReadClipperUnitTest extends BaseTest { } private boolean startsWithInsertion(Cigar cigar) { - return leadingInsertionLength(cigar) > 0; + return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; } - private int leadingInsertionLength(Cigar cigar) { + private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { for (CigarElement cigarElement : cigar.getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.INSERTION) + if (cigarElement.getOperator() == operator) return cigarElement.getLength(); if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) break;