From ab532206353ce224b0d419f829b508375f1a85b0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Tue, 19 Jun 2012 13:26:43 -0400 Subject: [PATCH] Refactor on how RR treats soft clips * Sites with more soft clipped bases than regular will force-trigger a variant region * No more unclipping/reclipping, RR machinery now handles soft clips natively. * implemented support for base insertion and base deletion quality scores in synthetic and regular reads. * GATKSAMRecord clone() now creates a fresh object for temporary attributes if one is present. note: SAMRecords create a shallow copy of the tempAttribute object which was causing multiple reads (that came from the same read) to have their temporary attributes modified by one another inside reduce reads. Beware, if you're not using GATKSAMRecord! --- .../sting/utils/clipping/ReadClipper.java | 32 +++++++++------- .../sting/utils/sam/GATKSAMRecord.java | 34 ++++++++++++++++- .../utils/clipping/ReadClipperUnitTest.java | 38 ++++++++++++------- 3 files changed, 76 insertions(+), 28 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index d934bb972..d438de900 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -422,13 +422,24 @@ public class ReadClipper { /** * Reverts only soft clipped bases with quality score greater than or equal to minQual * - * Note: Will write a temporary field with the number of soft clips that were undone on each side (left: 'SL', right: 'SR') + * todo -- Note: Will write a temporary field with the number of soft clips that were undone on each side (left: 'SL', right: 'SR') -- THIS HAS BEEN REMOVED TEMPORARILY SHOULD HAPPEN INSIDE THE CLIPPING ROUTINE! * * @param read the read * @param minQual the mininum base quality score to revert the base (inclusive) - * @return the read with high quality soft clips reverted + * @return a new read with high quality soft clips reverted */ public static GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read, byte minQual) { + return revertSoftClippedBases(hardClipLowQualitySoftClips(read, minQual)); + } + + /** + * Hard clips away soft clipped bases that are below the given quality threshold + * + * @param read the read + * @param minQual the mininum base quality score to revert the base (inclusive) + * @return a new read without low quality soft clipped bases + */ + public static GATKSAMRecord hardClipLowQualitySoftClips(GATKSAMRecord read, byte minQual) { int nLeadingSoftClips = read.getAlignmentStart() - read.getSoftStart(); if (read.isEmpty() || nLeadingSoftClips > read.getReadLength()) return GATKSAMRecord.emptyRead(read); @@ -457,17 +468,12 @@ public class ReadClipper { } GATKSAMRecord clippedRead = read; - if (right >= 0) { - if (right + 1 < clippedRead.getReadLength()) - clippedRead = hardClipByReadCoordinates(clippedRead, right+1, clippedRead.getReadLength()-1); // first we hard clip the low quality soft clips on the left tail - clippedRead.setTemporaryAttribute("SR", nTailingSoftClips - (read.getReadLength() - right - 1)); // keep track of how may bases to 're-softclip' after processing - } - if (left >= 0) { - if (left - 1 > 0) - clippedRead = hardClipByReadCoordinates(clippedRead, 0, left-1); // then we hard clip the low quality soft clips on the right tail - clippedRead.setTemporaryAttribute("SL", nLeadingSoftClips - left); // keep track of how may bases to 're-softclip' after processing - } - return revertSoftClippedBases(clippedRead); // now that we have only good bases in the soft clips, we can revert them all + if (right >= 0 && right + 1 < clippedRead.getReadLength()) // only clip if there are softclipped bases (right >= 0) and the first high quality soft clip is not the last base (right+1 < readlength) + clippedRead = hardClipByReadCoordinates(clippedRead, right+1, clippedRead.getReadLength()-1); // first we hard clip the low quality soft clips on the right tail + if (left >= 0 && left - 1 > 0) // only clip if there are softclipped bases (left >= 0) and the first high quality soft clip is not the last base (left-1 > 0) + clippedRead = hardClipByReadCoordinates(clippedRead, 0, left-1); // then we hard clip the low quality soft clips on the left tail + + return clippedRead; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index b6cc97c9f..5fbe12eed 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -194,12 +194,35 @@ public class GATKSAMRecord extends BAMRecord { } } + /** + * @return whether or not this read has base insertion or deletion qualities (one of the two is sufficient to return true) + */ public boolean hasBaseIndelQualities() { return getAttribute( BQSR_BASE_INSERTION_QUALITIES ) != null || getAttribute( BQSR_BASE_DELETION_QUALITIES ) != null; } + /** + * @return the base deletion quality or null if read doesn't have one + */ + public byte[] getExistingBaseInsertionQualities() { + return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_INSERTION_QUALITIES)); + } + + /** + * @return the base deletion quality or null if read doesn't have one + */ + public byte[] getExistingBaseDeletionQualities() { + return SAMUtils.fastqToPhred( getStringAttribute(BQSR_BASE_DELETION_QUALITIES)); + } + + /** + * Default utility to query the base insertion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) + * and assigns it to the read. + * + * @return the base insertion quality array + */ public byte[] getBaseInsertionQualities() { - byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_INSERTION_QUALITIES ) ); + byte [] quals = getExistingBaseInsertionQualities(); if( quals == null ) { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will @@ -209,8 +232,14 @@ public class GATKSAMRecord extends BAMRecord { return quals; } + /** + * Default utility to query the base deletion quality of a read. If the read doesn't have one, it creates an array of default qualities (currently Q45) + * and assigns it to the read. + * + * @return the base deletion quality array + */ public byte[] getBaseDeletionQualities() { - byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_DELETION_QUALITIES ) ); + byte[] quals = getExistingBaseDeletionQualities(); if( quals == null ) { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will @@ -478,6 +507,7 @@ public class GATKSAMRecord extends BAMRecord { public Object clone() throws CloneNotSupportedException { final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); if (temporaryAttributes != null) { + clone.temporaryAttributes = new HashMap(); for (Object attribute : temporaryAttributes.keySet()) clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index bc918c0a4..44c9ab255 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -45,7 +45,7 @@ import java.util.List; public class ReadClipperUnitTest extends BaseTest { List cigarList; - int maximumCigarSize = 6; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 + int maximumCigarSize = 6; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 @BeforeClass public void init() { @@ -92,22 +92,15 @@ public class ReadClipperUnitTest extends BaseTest { int start = read.getSoftStart(); int stop = read.getSoftEnd(); -// System.out.println(String.format("CIGAR: %s (%d, %d)", cigar.toString(), start, stop)); - -// if (ReadUtils.readIsEntirelyInsertion(read)) -// System.out.println("debug"); - for (int i = start; i <= stop; i++) { GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(-1, i); if (!clipLeft.isEmpty()) { -// System.out.println(String.format("\t left [%d] %s -> %s ", i-start+1, cigar.toString(), clipLeft.getCigarString())); Assert.assertTrue(clipLeft.getAlignmentStart() >= Math.min(read.getAlignmentEnd(), i + 1), String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); assertUnclippedLimits(read, clipLeft); } GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, -1); - if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. -// System.out.println(String.format("\t right [%d] %s -> %s ", i-start+1, cigar.toString(), clipRight.getCigarString())); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. Assert.assertTrue(clipRight.getAlignmentEnd() <= Math.max(read.getAlignmentStart(), i - 1), String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); assertUnclippedLimits(read, clipRight); } @@ -121,7 +114,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + if (read.getSoftStart() == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side for (int i = alnStart; i <= alnEnd; i++) { GATKSAMRecord clipLeft = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, i); @@ -141,7 +134,7 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); int alnStart = read.getAlignmentStart(); int alnEnd = read.getAlignmentEnd(); - if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + if (read.getSoftEnd() == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side for (int i = alnStart; i <= alnEnd; i++) { GATKSAMRecord clipRight = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, i); if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) { // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. @@ -165,7 +158,7 @@ public class ReadClipperUnitTest extends BaseTest { byte[] quals = new byte[readLength]; for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { - Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); // create a read with nLowQualBases in the left tail for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) quals[addLeft] = LOW_QUAL; read.setBaseQualities(quals); @@ -252,7 +245,7 @@ public class ReadClipperUnitTest extends BaseTest { final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); - assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed if (leadingSoftClips > 0 || tailSoftClips > 0) { final int expectedStart = read.getAlignmentStart() - leadingSoftClips; @@ -265,6 +258,25 @@ public class ReadClipperUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testRevertSoftClippedBasesWithThreshold() { + for (Cigar cigar : cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = ReadClipper.revertSoftClippedBases(read); + + assertUnclippedLimits(read, unclipped); // Make sure limits haven't changed + Assert.assertNull(read.getCigar().isValid(null, -1)); + Assert.assertNull(unclipped.getCigar().isValid(null, -1)); + + if (!(leadingSoftClips > 0 || tailSoftClips > 0)) + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + + } + } + private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { if (!read.isEmpty()) {