Critical bugfix for adapter clipping in HaplotypeCaller

-- The previous code performed adapter clipping before reverting soft clips. Since we only clip the adapter when it is actually aligned (i.e., not within the soft clips), adapter bases were never removed unless at least 1 bp of the adapter was aligned to the reference.  Terrible.
-- Removed the broken logic of determining whether a read adaptor is too long.
-- Doesn't require isProperPairFlag to be set for a read to be adapter clipped
-- Update integration tests for new adapter clipping code
This commit is contained in:
Mark DePristo 2013-06-27 15:44:53 -04:00
parent c3d59d890d
commit 41aba491c0
8 changed files with 177 additions and 78 deletions

View File

@ -929,28 +929,28 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKSAMRecord> readsToUse = new ArrayList<>(activeRegion.getReads().size());
for( final GATKSAMRecord myRead : activeRegion.getReads() ) {
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
else if (useLowQualityBasesForAssembly)
clippedRead = postAdapterRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
GATKSAMRecord clippedRead;
if (errorCorrectReads)
clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION );
else if (useLowQualityBasesForAssembly)
clippedRead = myRead;
else // default case: clip low qual ends of reads
clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY );
if ( dontUseSoftClippedBases ) {
// uncomment to remove hard clips from consideration at all
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
} else {
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
}
if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) {
// remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
} else {
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
}
clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) );
if( clippedRead != null && !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) {
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());

View File

@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
@Test
public void testHaplotypeCallerMultiSampleComplex1() {
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a3479fc4ad387d381593b328f737a1b");
HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "12ed9d67139e7a94d67e9e6c06ac6e16");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {

View File

@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("cc6f2a76ee97ecc14a5f956ffbb21d88"));
Arrays.asList("c5c63d03e1c4bbe32f06902acd4a10f9"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("51e91c8af61a6b47807165906baefb00"));
Arrays.asList("f0b2a96040429908cce17327442eec29"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
}

View File

@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"});
tests.add(new Object[]{nct, "e800f6bb3a820da5c6b29f0195480796"});
}
return tests.toArray(new Object[][]{});

View File

@ -30,10 +30,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.*;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.NGSPlatform;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -214,34 +211,52 @@ public class ReadUtils {
* CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig.
*/
public static int getAdaptorBoundary(final SAMRecord read) {
final int MAXIMUM_ADAPTOR_LENGTH = 8;
final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs
return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
if ( read.getReadPairedFlag() && read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) {
// note that the read.getProperPairFlag() is not reliably set, so many reads may have this tag but still be overlapping
// logger.info(String.format("Read %s start=%d end=%d insert=%d mateStart=%d readNeg=%b mateNeg=%b not properly paired, returning CANNOT_COMPUTE_ADAPTOR_BOUNDARY",
// read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), insertSize, read.getMateAlignmentStart(),
// read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag()));
if ( ! hasWellDefinedFragmentSize(read) ) {
return CANNOT_COMPUTE_ADAPTOR_BOUNDARY;
} else if ( read.getReadNegativeStrandFlag() ) {
return read.getMateAlignmentStart() - 1; // case 1 (see header)
} else {
final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value)
return read.getAlignmentStart() + insertSize + 1; // case 2 (see header)
}
int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read)
if (read.getReadNegativeStrandFlag())
adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header)
else
adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header)
if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) )
adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor
return adaptorBoundary;
}
// Sentinel value returned by getAdaptorBoundary() when no adaptor boundary can be
// determined for a read (e.g. unmapped read, unpaired read, or unusable insert size).
public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE;
/**
 * Can the adaptor sequence of read be reliably removed from the read based on the alignment of
 * read and its mate?
 *
 * @param read the read to check
 * @return true if it can, false otherwise
 */
public static boolean hasWellDefinedFragmentSize(final SAMRecord read) {
    // A zero inferred insert size means the mate is on another contig or the pair is
    // unmapped, so no adaptor position can be inferred from the fragment.
    if ( read.getInferredInsertSize() == 0 )
        return false;

    // Only paired reads where both the read and its mate are mapped can be adaptor trimmed.
    if ( ! read.getReadPairedFlag() || read.getReadUnmappedFlag() || read.getMateUnmappedFlag() )
        return false;

    // NOTE: we deliberately do NOT require getProperPairFlag() here -- that flag isn't
    // always set properly in BAMs, and requiring it would reject some genuine proper pairs.

    // Sanity check: read1 and read2 must be on opposite strands to form a standard pair.
    if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() )
        return false;

    if ( read.getReadNegativeStrandFlag() ) {
        // Negative-strand read runs right to left: it must extend past the mate's start.
        return read.getAlignmentEnd() > read.getMateAlignmentStart();
    }

    // Positive-strand read: the mate should lie to our right, i.e. the fragment end
    // (mate start + insert size) must not fall before our own alignment start.
    return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize();
}
/**
* is the read a 454 read?
*

View File

@ -253,17 +253,22 @@ public class FragmentUtilsUnitTest extends BaseTest {
final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10);
read1.setCigarString("4S" + common.length() + "M");
read1.setProperPairFlag(true);
read1.setReadPairedFlag(true);
read1.setFirstOfPairFlag(true);
read1.setReadNegativeStrandFlag(true);
read1.setMateAlignmentStart(10);
read1.setMateNegativeStrandFlag(false);
read1.setMateAlignmentStart(read2.getAlignmentStart());
read2.setCigarString(common.length() + "M4S");
read2.setProperPairFlag(true);
read2.setReadPairedFlag(true);
read2.setFirstOfPairFlag(false);
read2.setReadNegativeStrandFlag(false);
read2.setMateNegativeStrandFlag(true);
read2.setMateAlignmentStart(read1.getAlignmentStart());
final int insertSize = common.length() - 1;
read1.setInferredInsertSize(insertSize);
read2.setInferredInsertSize(-insertSize);
read1.setInferredInsertSize(-insertSize);
read2.setInferredInsertSize(insertSize);
final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2);
Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString());

View File

@ -54,7 +54,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
private static final boolean DEBUG = false;
protected LocusIteratorByState li;
@Test(enabled = true)
@Test(enabled = !DEBUG)
public void testUnmappedAndAllIReadsPassThrough() {
final int readLength = 10;
GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength);
@ -697,24 +697,28 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
final List<Object[]> tests = new LinkedList<Object[]>();
final int start = 10;
// for ( final int goodBases : Arrays.asList(10) ) {
// for ( final int nClipsOnTheRight : Arrays.asList(0)) {
for ( final int goodBases : Arrays.asList(10, 20, 30) ) {
for ( final int nClips : Arrays.asList(0, 1, 2, 10)) {
for ( final boolean onLeft : Arrays.asList(true, false) ) {
final int readLength = nClips + goodBases;
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength);
read.setProperPairFlag(true);
read.setReadPairedFlag(true);
read.setReadUnmappedFlag(false);
read.setMateUnmappedFlag(false);
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
read.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
read.setCigarString(readLength + "M");
if ( onLeft ) {
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
read.setMateAlignmentStart(start + nClips);
read.setInferredInsertSize(readLength);
tests.add(new Object[]{nClips, goodBases, 0, read});
} else {
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
read.setMateAlignmentStart(start - 1);
read.setInferredInsertSize(goodBases - 1);
tests.add(new Object[]{0, goodBases, nClips, read});
@ -723,29 +727,13 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
}
}
// for ( final int nClipsOnTheLeft : Arrays.asList(0, 1, 2, 10)) {
// final int readLength = nClipsOnTheLeft + goodBases;
// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength);
// read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
// read.setBaseQualities(Utils.dupBytes((byte) '@', readLength));
// read.setCigarString(readLength + "M");
// read.setReadNegativeStrandFlag(true);
//
// read.setMateAlignmentStart(start + nClipsOnTheLeft);
// read.setInferredInsertSize(readLength);
//
// tests.add(new Object[]{nClipsOnTheLeft, goodBases, 0, read});
// }
// }
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "AdapterClippingTest")
// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000)
public void testAdapterClipping(final int nClipsOnLeft, final int nReadContainingPileups, final int nClipsOnRight, final GATKSAMRecord read) {
li = new LocusIteratorByState(new FakeCloseableIterator<GATKSAMRecord>(Collections.singletonList(read).iterator()),
li = new LocusIteratorByState(new FakeCloseableIterator<>(Collections.singletonList(read).iterator()),
createTestReadProperties(DownsamplingMethod.NONE, false),
genomeLocParser,
LocusIteratorByState.sampleListForSAMWithoutReadGroups());
@ -755,10 +743,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest {
while ( li.hasNext() ) {
final AlignmentContext next = li.next();
Assert.assertEquals(next.getLocation().getStart(), expectedPos);
// if ( nPileups < nClipsOnLeft || nPileups > (nClipsOnLeft + nReadContainingPileups) )
// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 0, "Expected empty pileups when the read is in the adapter clipping zone at " + nPileups);
// else
// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 1, "Expected a pileups with 1 element when the read is within the good part of the read at " + nPileups);
nPileups++;
expectedPos++;
}

View File

@ -66,6 +66,8 @@ public class ReadUtilsUnitTest extends BaseTest {
final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30};
final String cigar = "8M";
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar);
read.setProperPairFlag(true);
read.setReadPairedFlag(true);
read.setMateAlignmentStart(mateStart);
read.setInferredInsertSize(fragmentSize);
return read;
@ -85,6 +87,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = BEFORE;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, myStart + fragmentSize + 1);
@ -93,6 +96,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = AFTER;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, myStart + fragmentSize + 1);
@ -101,6 +105,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = AFTER;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, mateStart - 1);
@ -109,6 +114,7 @@ public class ReadUtilsUnitTest extends BaseTest {
myStart = BEFORE;
read.setAlignmentStart(myStart);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, mateStart - 1);
@ -116,9 +122,11 @@ public class ReadUtilsUnitTest extends BaseTest {
read = makeRead(fragmentSize, mateStart);
read.setInferredInsertSize(0);
read.setReadNegativeStrandFlag(true);
read.setMateNegativeStrandFlag(false);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY);
read.setReadNegativeStrandFlag(false);
read.setMateNegativeStrandFlag(true);
boundary = get.getAdaptor(read);
Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY);
read.setInferredInsertSize(10);
@ -226,4 +234,91 @@ public class ReadUtilsUnitTest extends BaseTest {
final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL);
Assert.assertEquals(result, 3);
}
/**
 * Test cases for {@code ReadUtils.hasWellDefinedFragmentSize}: builds one well-formed
 * FR read pair member, then perturbs a single property per case, pairing each perturbed
 * read with the expected boolean result.
 *
 * @return rows of { case name, read, expected result }
 * @throws Exception if cloning the template read fails
 */
@DataProvider(name = "HasWellDefinedFragmentSizeData")
public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception {
    final List<Object[]> tests = new LinkedList<Object[]>();

    // setup a basic read that will work
    final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader();
    final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10);
    read.setReadPairedFlag(true);
    read.setProperPairFlag(true);
    read.setReadUnmappedFlag(false);
    read.setMateUnmappedFlag(false);
    read.setAlignmentStart(100);
    read.setCigarString("50M");
    read.setMateAlignmentStart(130);
    read.setInferredInsertSize(80);
    read.setFirstOfPairFlag(true);
    read.setReadNegativeStrandFlag(false);
    read.setMateNegativeStrandFlag(true);
    tests.add( new Object[]{ "basic case", read.clone(), true });

    {
        final GATKSAMRecord bad1 = (GATKSAMRecord)read.clone();
        bad1.setReadPairedFlag(false);
        tests.add( new Object[]{ "not paired", bad1, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setProperPairFlag(false);
        // we currently don't require the proper pair flag to be set
        tests.add( new Object[]{ "not proper pair", bad, true });
        // tests.add( new Object[]{ "not proper pair", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setReadUnmappedFlag(true);
        tests.add( new Object[]{ "read is unmapped", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setMateUnmappedFlag(true);
        tests.add( new Object[]{ "mate is unmapped", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setMateNegativeStrandFlag(false);
        tests.add( new Object[]{ "read and mate both on positive strand", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setReadNegativeStrandFlag(true);
        tests.add( new Object[]{ "read and mate both on negative strand", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setInferredInsertSize(0);
        tests.add( new Object[]{ "insert size is 0", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setAlignmentStart(1000);
        // fixed typo in the case label: "positve" -> "positive"
        tests.add( new Object[]{ "positive read starts after mate end", bad, false });
    }

    {
        final GATKSAMRecord bad = (GATKSAMRecord)read.clone();
        bad.setReadNegativeStrandFlag(true);
        bad.setMateNegativeStrandFlag(false);
        bad.setMateAlignmentStart(1000);
        tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false });
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Checks {@code ReadUtils.hasWellDefinedFragmentSize} against every case from the data
 * provider. Declared public because TestNG only discovers public {@code @Test} methods --
 * the previous private modifier meant this test was silently never executed. The case
 * name is passed as the assertion message so failures identify the offending case.
 */
@Test(dataProvider = "HasWellDefinedFragmentSizeData")
public void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) {
    Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected, name);
}
}