diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 2c2cbd98f..836c16a7e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -194,9 +194,17 @@ public class ClippingOp { unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); unclipped.setCigar(unclippedCigar); - unclipped.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar)); + final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar); + unclipped.setAlignmentStart(newStart); - return unclipped; + if ( newStart <= 0 ) { + // if the start of the unclipped read occurs before the contig, + // we must hard clip away the bases since we cannot represent reads with + // negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned) + return hardClip(unclipped, 0, - newStart); + } else { + return unclipped; + } } /** @@ -335,7 +343,24 @@ public class ClippingOp { return newCigar; } - @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) + /** + * Hard clip bases from read, from start to stop in base coordinates + * + * If start == 0, then we will clip from the front of the read, otherwise we clip + * from the right. If start == 0 and stop == 10, this would clip out the first + * 10 bases of the read. + * + * Note that this function works with reads with negative alignment starts, in order to + * allow us to hardClip reads that have had their soft clips reverted and so might have + * negative alignment starts + * + * Works properly with reduced reads and insertion/deletion base qualities + * + * @param read a non-null read + * @param start a start >= 0 and < read.length + * @param stop a stop >= 0 and < read.length. + * @return a cloned version of read that has been properly trimmed down + */ private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart(); final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart(); @@ -343,7 +368,6 @@ public class ClippingOp { if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math). return GATKSAMRecord.emptyRead(read); - // If the read is unmapped there is no Cigar string and neither should we create a new cigar string CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop); @@ -357,7 +381,7 @@ public class ClippingOp { System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength); System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); - GATKSAMRecord hardClippedRead; + final GATKSAMRecord hardClippedRead; try { hardClippedRead = (GATKSAMRecord) read.clone(); } catch (CloneNotSupportedException e) { diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java index 0e0f6322e..cbbc8252b 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperTestUtils.java @@ -28,8 +28,8 @@ package org.broadinstitute.sting.utils.clipping; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -38,13 +38,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Stack; -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 11/27/11 - * Time: 6:45 AM - * To change this template use File | Settings | File Templates. - */ public class ReadClipperTestUtils { //Should contain all the utils needed for tests to mass produce //reads, cigars, and other needed classes @@ -236,78 +229,6 @@ public class ReadClipperTestUtils { } public static Cigar cigarFromString(String cigarString) { - Cigar cigar = new Cigar(); - - boolean isNumber = false; - int number = 0; - for (int i = 0; i < cigarString.length(); i++) { - char x = cigarString.charAt(i); - - if (x >= '0' && x <='9') { - if (isNumber) { - number *= 10; - } - else { - isNumber = true; - } - number += x - '0'; - } - - else { - CigarElement e; - switch (x) { - case 'M': - case 'm': - e = new CigarElement(number, CigarOperator.M); - break; - - case 'I': - case 'i': - e = new CigarElement(number, CigarOperator.I); - break; - - case 'D': - case 'd': - e = new CigarElement(number, CigarOperator.D); - break; - - case 'S': - case 's': - e = new CigarElement(number, CigarOperator.S); - break; - - case 'N': - case 'n': - e = new CigarElement(number, CigarOperator.N); - break; - - case 'H': - case 'h': - e = new CigarElement(number, CigarOperator.H); - break; - - case 'P': - case 'p': - e = new CigarElement(number, CigarOperator.P); - break; - - case '=': - e = new CigarElement(number, CigarOperator.EQ); - break; - - case 'X': - case 'x': - e = new CigarElement(number, CigarOperator.X); - break; - - default: - throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")"); - } - cigar.add(e); - } - } - return cigar; + return TextCigarCodec.getSingleton().decode(cigarString); } - - } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 0b4153535..d6bd0d4d2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -46,6 +46,7 @@ import java.util.List; * Date: 9/28/11 */ public class ReadClipperUnitTest extends BaseTest { + private final static boolean DEBUG = false; List cigarList; int maximumCigarSize = 10; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 @@ -55,7 +56,7 @@ public class ReadClipperUnitTest extends BaseTest { cigarList = ReadClipperTestUtils.generateCigarList(maximumCigarSize); } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipBothEndsByReferenceCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -71,7 +72,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipByReadCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -101,7 +102,7 @@ public class ReadClipperUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "ClippedReadLengthData", enabled = true) + @Test(dataProvider = "ClippedReadLengthData", enabled = !DEBUG) public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(originalReadLength + "M"); read.getReadLength(); // provoke the caching of the read length @@ -112,7 +113,7 @@ public class ReadClipperUnitTest extends BaseTest { clipped.getReadLength(), clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar())); } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipByReferenceCoordinates() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -135,7 +136,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipByReferenceCoordinatesLeftTail() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -154,7 +155,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipByReferenceCoordinatesRightTail() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -172,7 +173,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipLowQualEnds() { final byte LOW_QUAL = 2; final byte HIGH_QUAL = 30; @@ -216,7 +217,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipSoftClippedBases() { for (Cigar cigar : cigarList) { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); @@ -251,7 +252,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testRevertSoftClippedBases() { for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); @@ -273,7 +274,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testRevertSoftClippedBasesWithThreshold() { for (Cigar cigar : cigarList) { final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); @@ -292,6 +293,40 @@ public class ReadClipperUnitTest extends BaseTest { } } + @DataProvider(name = "RevertSoftClipsBeforeContig") + public Object[][] makeRevertSoftClipsBeforeContig() { + List tests = new ArrayList<>(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + for ( int softStart : Arrays.asList(-10, -1, 0) ) { + for ( int alignmentStart : Arrays.asList(1, 10) ) { + tests.add(new Object[]{softStart, alignmentStart}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "RevertSoftClipsBeforeContig") + public void testRevertSoftClippedBasesBeforeStartOfContig(final int softStart, final int alignmentStart) { + final int nMatches = 10; + final int nSoft = -1 * (softStart - alignmentStart); + final String cigar = nSoft + "S" + nMatches + "M"; + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + read.setAlignmentStart(alignmentStart); + + Assert.assertEquals(read.getSoftStart(), softStart); + Assert.assertEquals(read.getAlignmentStart(), alignmentStart); + Assert.assertEquals(read.getCigarString(), cigar); + + final GATKSAMRecord reverted = ReadClipper.revertSoftClippedBases(read); + + final int expectedAlignmentStart = 1; + final String expectedCigar = (1 - softStart) + "H" + read.getAlignmentEnd() + "M"; + Assert.assertEquals(reverted.getSoftStart(), expectedAlignmentStart); + Assert.assertEquals(reverted.getAlignmentStart(), expectedAlignmentStart); + Assert.assertEquals(reverted.getCigarString(), expectedCigar); + } private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { if (!read.isEmpty()) { @@ -375,7 +410,7 @@ public class ReadClipperUnitTest extends BaseTest { } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testHardClipReducedRead() { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); final int[] counts = new int[read.getReadLength()]; @@ -391,7 +426,7 @@ public class ReadClipperUnitTest extends BaseTest { } } - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testRevertEntirelySoftclippedReads() { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H"); GATKSAMRecord clippedRead = ReadClipper.revertSoftClippedBases(read); diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index e9600480a..0886427ca 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.fragments; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -296,4 +297,51 @@ public class FragmentUtilsUnitTest extends BaseTest { final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); Assert.assertNull(actual); } + + @DataProvider(name = "MergeFragmentsOffContig") + public Object[][] makeMergeFragmentsOffContig() throws Exception { + List tests = new ArrayList<>(); + + for ( final int pre1 : Arrays.asList(0, 50)) { + for ( final int post1 : Arrays.asList(0, 50)) { + for ( final int pre2 : Arrays.asList(0, 50)) { + for ( final int post2 : Arrays.asList(0, 50)) { + tests.add(new Object[]{pre1, post1, pre2, post2}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MergeFragmentsOffContig") + public void testMergeFragmentsOffContig(final int pre1, final int post1, final int pre2, final int post2) { + final int contigSize = 10; + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 0, contigSize); + + final GATKSAMRecord read1 = createReadOffContig(header, false, pre1, post1); + final GATKSAMRecord read2 = createReadOffContig(header, true, pre2, post2); + + final GATKSAMRecord merged = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + } + + private GATKSAMRecord createReadOffContig(final SAMFileHeader header, final boolean negStrand, final int pre, final int post) { + final int contigLen = header.getSequence(0).getSequenceLength(); + final int readLen = pre + contigLen + post; + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, readLen); + read.setAlignmentStart(1); + read.setCigar(TextCigarCodec.getSingleton().decode(pre + "S" + contigLen + "M" + post + "S")); + read.setBaseQualities(Utils.dupBytes((byte) 30, readLen)); + read.setReadBases(Utils.dupBytes((byte)'A', readLen)); + read.setMappingQuality(60); + read.setMateAlignmentStart(1); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); + read.setInferredInsertSize(30); + read.setReadNegativeStrandFlag(negStrand); + read.setMateNegativeStrandFlag(! negStrand); + read.setReadGroup(new GATKSAMReadGroupRecord("foo")); + return read; + } }