From 62a2e335bc1edb6b95cb2165b3032ea19842b9c1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 15 Dec 2011 11:08:19 -0500 Subject: [PATCH 1/2] Changing HardClipper contract to allow UNMAPPED reads shifted the contract to functions that operate on reference based coordinates. The clipper should do the right thing with unmapped reads, but it needs more testing (Ryan is using it at the moment and says it works). Will write some unit tests. --- .../org/broadinstitute/sting/utils/clipreads/ClippingOp.java | 2 +- .../org/broadinstitute/sting/utils/clipreads/ReadClipper.java | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 4a253b217..97855080c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -247,7 +247,7 @@ public class ClippingOp { return newCigar; } - @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"}) + @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) { if (start == 0 && stop == read.getReadLength() - 1) return new GATKSAMRecord(read.getHeader()); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index c3684034c..4df899888 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -58,6 +58,7 @@ public class ReadClipper { return hardClipByReferenceCoordinates(refStart, -1); } + @Requires("!read.getReadUnmappedFlag()") protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL); int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL); From 4748ae0a140613f56c47013c78a2deb0146c6c40 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 14 Dec 2011 19:17:49 -0500 Subject: [PATCH 2/2] Bugfix: Softclips before Hardclips weren't being accounted for caught a bug in the hard clipper where it does not account for hard clipping softclipped bases in the resulting cigar string, if there is already a hard clipped base immediately after it. * updated unit test for hardClipSoftClippedBases with corresponding test-case. --- .../sting/utils/clipreads/ClippingOp.java | 4 + .../utils/clipreads/ReadClipperUnitTest.java | 97 ++++++++----------- 2 files changed, 42 insertions(+), 59 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 97855080c..06985041e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -373,6 +373,10 @@ public class ClippingOp { while(cigarElementIterator.hasNext()) { cigarElement = cigarElementIterator.next(); alignmentShift += calculateHardClippingAlignmentShift(cigarElement, cigarElement.getLength()); + + // if the read had a HardClip operator in the end, combine it with the Hard Clip we are adding + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + totalHardClipCount += cigarElement.getLength(); } newCigar.add(new CigarElement(totalHardClipCount + alignmentShift, CigarOperator.HARD_CLIP)); } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 650d3f26e..f998a4e78 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -252,77 +252,56 @@ public class ReadClipperUnitTest extends BaseTest { } - @Test(enabled = false) + @Test(enabled = true) public void testHardClipSoftClippedBases() { // Generate a list of cigars to test for (Cigar cigar : ClipReadsTestUtils.generateCigars()) { - //logger.warn("Testing Cigar: "+cigar.toString()); - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(cigar)); + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + readClipper = new ReadClipper(read); + GATKSAMRecord clippedRead = readClipper.hardClipSoftClippedBases(); - int clipStart = 0; - int clipEnd = 0; - boolean expectEmptyRead = false; + int sumHardClips = 0; + int sumMatches = 0; - List cigarElements = cigar.getCigarElements(); - int CigarListLength = cigarElements.size(); + boolean tail = true; + for (CigarElement element : read.getCigar().getCigarElements()) { + // Assuming cigars are well formed, if we see S or H, it means we're on the tail (left or right) + if (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP) + tail = true; - // It will know what needs to be clipped based on the start and end of the string, hardclips and softclips - // are added to the amount to clip - if (cigarElements.get(0).getOperator() == CigarOperator.HARD_CLIP) { - //clipStart += cigarElements.get(0).getLength(); - if (cigarElements.get(1).getOperator() == CigarOperator.SOFT_CLIP) { - clipStart += cigarElements.get(1).getLength(); - // Check for leading indel - if (cigarElements.get(2).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } - // Check for leading indel - else if (cigarElements.get(1).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } else if (cigarElements.get(0).getOperator() == CigarOperator.SOFT_CLIP) { - clipStart += cigarElements.get(0).getLength(); - // Check for leading indel - if (cigarElements.get(1).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; - } - } - //Check for leading indel - else if (cigarElements.get(0).getOperator() == CigarOperator.INSERTION) { - expectEmptyRead = true; + // Adds all H, S and D's (next to hard/soft clips). + // All these should be hard clips after clipping. + if (tail && (element.getOperator() == CigarOperator.HARD_CLIP || element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.DELETION)) + sumHardClips += element.getLength(); + + // this means we're no longer on the tail (insertions can still potentially be the tail because + // of the current contract of clipping out hanging insertions + else if (element.getOperator() != CigarOperator.INSERTION) + tail = false; + + // Adds all matches to verify that they remain the same after clipping + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches += element.getLength(); } - if (cigarElements.get(CigarListLength - 1).getOperator() == CigarOperator.HARD_CLIP) { - //clipEnd += cigarElements.get(CigarListLength - 1).getLength(); - if (cigarElements.get(CigarListLength - 2).getOperator() == CigarOperator.SOFT_CLIP) - clipEnd += cigarElements.get(CigarListLength - 2).getLength(); - } else if (cigarElements.get(CigarListLength - 1).getOperator() == CigarOperator.SOFT_CLIP) - clipEnd += cigarElements.get(CigarListLength - 1).getLength(); + for (CigarElement element : clippedRead.getCigar().getCigarElements()) { + // Test if clipped read has Soft Clips (shouldn't have any!) + Assert.assertTrue( element.getOperator() != CigarOperator.SOFT_CLIP, String.format("Cigar %s -> %s -- FAILED (resulting cigar has soft clips)", read.getCigarString(), clippedRead.getCigarString())); - String readBases = readClipper.read.getReadString(); - String baseQuals = readClipper.read.getBaseQualityString(); + // Keep track of the total number of Hard Clips after clipping to make sure everything was accounted for + if (element.getOperator() == CigarOperator.HARD_CLIP) + sumHardClips -= element.getLength(); - // "*" is the default empty-sequence-string and for our test it needs to be changed to "" - if (readBases.equals("*")) - readBases = ""; - if (baseQuals.equals("*")) - baseQuals = ""; + // Make sure all matches are still there + if (element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + sumMatches -= element.getLength(); + } + Assert.assertTrue( sumHardClips == 0, String.format("Cigar %s -> %s -- FAILED (number of hard clips mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumHardClips)); + Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - logger.warn(String.format("Testing cigar %s, expecting Base: %s and Qual: %s", - cigar.toString(), readBases.substring(clipStart, readBases.length() - clipEnd), - baseQuals.substring(clipStart, baseQuals.length() - clipEnd))); - //if (expectEmptyRead) - // testBaseQual( readClipper.hardClipSoftClippedBases(), new byte[0], new byte[0] ); - //else - ClipReadsTestUtils.testBaseQual(readClipper.hardClipSoftClippedBases(), - readBases.substring(clipStart, readBases.length() - clipEnd).getBytes(), - baseQuals.substring(clipStart, baseQuals.length() - clipEnd).getBytes()); - logger.warn("Cigar: " + cigar.toString() + " PASSED!"); + + logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); } - // We will use testParameter in the following way - // Right tail, left tail, - } } \ No newline at end of file