Merge pull request #257 from broadinstitute/md_unclip_reads_over_contig

Bugfix for HaplotypeCaller error: Only one of refStart or refStop must b...
This commit is contained in:
MauricioCarneiro 2013-06-04 08:01:30 -07:00
commit eaebba5ba1
4 changed files with 126 additions and 98 deletions

View File

@ -194,9 +194,17 @@ public class ClippingOp {
unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH));
unclipped.setCigar(unclippedCigar);
unclipped.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar));
final int newStart = read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar);
unclipped.setAlignmentStart(newStart);
return unclipped;
if ( newStart <= 0 ) {
// if the start of the unclipped read occurs before the contig,
// we must hard clip away the bases since we cannot represent reads with
// negative or 0 alignment start values in the SAMRecord (e.g., 0 means unaligned)
return hardClip(unclipped, 0, - newStart);
} else {
return unclipped;
}
}
/**
@ -335,7 +343,24 @@ public class ClippingOp {
return newCigar;
}
@Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"})
/**
* Hard clip bases from read, from start to stop in base coordinates
*
* If start == 0, then we will clip from the front of the read, otherwise we clip
* from the right. If start == 0 and stop == 10, this would clip out the first
* 10 bases of the read.
*
* Note that this function works with reads with negative alignment starts, in order to
* allow us to hardClip reads that have had their soft clips reverted and so might have
* negative alignment starts
*
* Works properly with reduced reads and insertion/deletion base qualities
*
* @param read a non-null read
* @param start a start >= 0 and < read.length
* @param stop a stop >= 0 and < read.length.
* @return a cloned version of read that has been properly trimmed down
*/
private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) {
final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart();
final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart();
@ -343,7 +368,6 @@ public class ClippingOp {
if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math).
return GATKSAMRecord.emptyRead(read);
// If the read is unmapped there is no Cigar string and neither should we create a new cigar string
CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop);
@ -357,7 +381,7 @@ public class ClippingOp {
System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);
System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength);
GATKSAMRecord hardClippedRead;
final GATKSAMRecord hardClippedRead;
try {
hardClippedRead = (GATKSAMRecord) read.clone();
} catch (CloneNotSupportedException e) {

View File

@ -28,8 +28,8 @@ package org.broadinstitute.sting.utils.clipping;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
@ -38,13 +38,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Stack;
/**
* Created by IntelliJ IDEA.
* User: roger
* Date: 11/27/11
* Time: 6:45 AM
* To change this template use File | Settings | File Templates.
*/
public class ReadClipperTestUtils {
//Should contain all the utils needed for tests to mass produce
//reads, cigars, and other needed classes
@ -236,78 +229,6 @@ public class ReadClipperTestUtils {
}
public static Cigar cigarFromString(String cigarString) {
Cigar cigar = new Cigar();
boolean isNumber = false;
int number = 0;
for (int i = 0; i < cigarString.length(); i++) {
char x = cigarString.charAt(i);
if (x >= '0' && x <='9') {
if (isNumber) {
number *= 10;
}
else {
isNumber = true;
}
number += x - '0';
}
else {
CigarElement e;
switch (x) {
case 'M':
case 'm':
e = new CigarElement(number, CigarOperator.M);
break;
case 'I':
case 'i':
e = new CigarElement(number, CigarOperator.I);
break;
case 'D':
case 'd':
e = new CigarElement(number, CigarOperator.D);
break;
case 'S':
case 's':
e = new CigarElement(number, CigarOperator.S);
break;
case 'N':
case 'n':
e = new CigarElement(number, CigarOperator.N);
break;
case 'H':
case 'h':
e = new CigarElement(number, CigarOperator.H);
break;
case 'P':
case 'p':
e = new CigarElement(number, CigarOperator.P);
break;
case '=':
e = new CigarElement(number, CigarOperator.EQ);
break;
case 'X':
case 'x':
e = new CigarElement(number, CigarOperator.X);
break;
default:
throw new ReviewedStingException("Unrecognized cigar operator: " + x + " (number: " + number + ")");
}
cigar.add(e);
}
}
return cigar;
return TextCigarCodec.getSingleton().decode(cigarString);
}
}

View File

@ -46,6 +46,7 @@ import java.util.List;
* Date: 9/28/11
*/
public class ReadClipperUnitTest extends BaseTest {
private final static boolean DEBUG = false;
List<Cigar> cigarList;
int maximumCigarSize = 10; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2
@ -55,7 +56,7 @@ public class ReadClipperUnitTest extends BaseTest {
cigarList = ReadClipperTestUtils.generateCigarList(maximumCigarSize);
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipBothEndsByReferenceCoordinates() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -71,7 +72,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipByReadCoordinates() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -101,7 +102,7 @@ public class ReadClipperUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "ClippedReadLengthData", enabled = true)
@Test(dataProvider = "ClippedReadLengthData", enabled = !DEBUG)
public void testHardClipReadLengthIsRight(final int originalReadLength, final int nToClip) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(originalReadLength + "M");
read.getReadLength(); // provoke the caching of the read length
@ -112,7 +113,7 @@ public class ReadClipperUnitTest extends BaseTest {
clipped.getReadLength(), clipped.getCigar(), expectedReadLength, nToClip, read.getReadLength(), read.getCigar()));
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipByReferenceCoordinates() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -135,7 +136,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipByReferenceCoordinatesLeftTail() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -154,7 +155,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipByReferenceCoordinatesRightTail() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -172,7 +173,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipLowQualEnds() {
final byte LOW_QUAL = 2;
final byte HIGH_QUAL = 30;
@ -216,7 +217,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipSoftClippedBases() {
for (Cigar cigar : cigarList) {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
@ -251,7 +252,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testRevertSoftClippedBases() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
@ -273,7 +274,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testRevertSoftClippedBasesWithThreshold() {
for (Cigar cigar : cigarList) {
final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP);
@ -292,6 +293,40 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@DataProvider(name = "RevertSoftClipsBeforeContig")
public Object[][] makeRevertSoftClipsBeforeContig() {
List<Object[]> tests = new ArrayList<>();
// this functionality can be adapted to provide input data for whatever you might want in your data
for ( int softStart : Arrays.asList(-10, -1, 0) ) {
for ( int alignmentStart : Arrays.asList(1, 10) ) {
tests.add(new Object[]{softStart, alignmentStart});
}
}
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "RevertSoftClipsBeforeContig")
public void testRevertSoftClippedBasesBeforeStartOfContig(final int softStart, final int alignmentStart) {
final int nMatches = 10;
final int nSoft = -1 * (softStart - alignmentStart);
final String cigar = nSoft + "S" + nMatches + "M";
final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar);
read.setAlignmentStart(alignmentStart);
Assert.assertEquals(read.getSoftStart(), softStart);
Assert.assertEquals(read.getAlignmentStart(), alignmentStart);
Assert.assertEquals(read.getCigarString(), cigar);
final GATKSAMRecord reverted = ReadClipper.revertSoftClippedBases(read);
final int expectedAlignmentStart = 1;
final String expectedCigar = (1 - softStart) + "H" + read.getAlignmentEnd() + "M";
Assert.assertEquals(reverted.getSoftStart(), expectedAlignmentStart);
Assert.assertEquals(reverted.getAlignmentStart(), expectedAlignmentStart);
Assert.assertEquals(reverted.getCigarString(), expectedCigar);
}
private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) {
if (!read.isEmpty()) {
@ -375,7 +410,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testHardClipReducedRead() {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M");
final int[] counts = new int[read.getReadLength()];
@ -391,7 +426,7 @@ public class ReadClipperUnitTest extends BaseTest {
}
}
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testRevertEntirelySoftclippedReads() {
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H");
GATKSAMRecord clippedRead = ReadClipper.revertSoftClippedBases(read);

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.utils.fragments;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -296,4 +297,51 @@ public class FragmentUtilsUnitTest extends BaseTest {
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
Assert.assertNull(actual);
}
@DataProvider(name = "MergeFragmentsOffContig")
public Object[][] makeMergeFragmentsOffContig() throws Exception {
List<Object[]> tests = new ArrayList<>();
for ( final int pre1 : Arrays.asList(0, 50)) {
for ( final int post1 : Arrays.asList(0, 50)) {
for ( final int pre2 : Arrays.asList(0, 50)) {
for ( final int post2 : Arrays.asList(0, 50)) {
tests.add(new Object[]{pre1, post1, pre2, post2});
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "MergeFragmentsOffContig")
public void testMergeFragmentsOffContig(final int pre1, final int post1, final int pre2, final int post2) {
final int contigSize = 10;
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 0, contigSize);
final GATKSAMRecord read1 = createReadOffContig(header, false, pre1, post1);
final GATKSAMRecord read2 = createReadOffContig(header, true, pre2, post2);
final GATKSAMRecord merged = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
}
private GATKSAMRecord createReadOffContig(final SAMFileHeader header, final boolean negStrand, final int pre, final int post) {
final int contigLen = header.getSequence(0).getSequenceLength();
final int readLen = pre + contigLen + post;
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, readLen);
read.setAlignmentStart(1);
read.setCigar(TextCigarCodec.getSingleton().decode(pre + "S" + contigLen + "M" + post + "S"));
read.setBaseQualities(Utils.dupBytes((byte) 30, readLen));
read.setReadBases(Utils.dupBytes((byte)'A', readLen));
read.setMappingQuality(60);
read.setMateAlignmentStart(1);
read.setProperPairFlag(true);
read.setReadPairedFlag(true);
read.setInferredInsertSize(30);
read.setReadNegativeStrandFlag(negStrand);
read.setMateNegativeStrandFlag(! negStrand);
read.setReadGroup(new GATKSAMReadGroupRecord("foo"));
return read;
}
}