From d20be41fee027eb9d31d20ffa44835a2685825de Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 24 Apr 2013 17:22:07 -0400 Subject: [PATCH] Bugfix for FragmentUtils.mergeOverlappingPairedFragments -- The previous version was unclipping soft clipped bases, and these were sometimes adaptor sequences. If the two reads successfully merged, we'd lose all of the information necessary to remove the adaptor, producing a very high quality read that matched reference. Updated the code to first clip the adapter sequences from the incoming fragments -- Update MD5s --- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 4 +- .../sting/utils/fragments/FragmentUtils.java | 27 +++++---- .../fragments/FragmentUtilsUnitTest.java | 58 ++++++++++++++++++- 4 files changed, 76 insertions(+), 15 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 17f04971b..5fe4e6dfa 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd51f8c7235eb6547b678093c7a01089"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "27db36467d40c3cde201f5826e959d78"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 2664f3ed0..fff1c0bb9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -85,7 +85,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "c1530f2158cb41d50e830ca5be0f97a0"); + HCTest(NA12878_BAM, "", "18d5671d8454e8a0c05ee5f6e9fabfe3"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -166,7 +166,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("eb5772b825120a0b8710e5add485d73a")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cb190c935541ebb9f660f713a882b922")); executeTest("HCTestStructuralIndels: ", spec); } diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 99f1d99c7..5d882ba8c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -31,6 +31,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -213,24 +214,28 @@ public final class FragmentUtils { * * Assumes that firstRead starts before secondRead (according to their soft clipped starts) * - * @param firstRead the left most read - * @param firstRead the right most read + * @param unclippedFirstRead the left most read + * @param unclippedSecondRead the right most read * * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one */ - public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord firstRead, final GATKSAMRecord secondRead) { - if ( firstRead == null ) throw new IllegalArgumentException("firstRead cannot be null"); - if ( secondRead == null ) throw new IllegalArgumentException("secondRead cannot be null"); - if ( ! firstRead.getReadName().equals(secondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + firstRead + " and " + secondRead); + public static GATKSAMRecord mergeOverlappingPairedFragments(final GATKSAMRecord unclippedFirstRead, final GATKSAMRecord unclippedSecondRead) { + if ( unclippedFirstRead == null ) throw new IllegalArgumentException("unclippedFirstRead cannot be null"); + if ( unclippedSecondRead == null ) throw new IllegalArgumentException("unclippedSecondRead cannot be null"); + if ( ! unclippedFirstRead.getReadName().equals(unclippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + unclippedFirstRead + " and " + unclippedSecondRead); + + if( unclippedFirstRead.getCigarString().contains("I") || unclippedFirstRead.getCigarString().contains("D") || unclippedSecondRead.getCigarString().contains("I") || unclippedSecondRead.getCigarString().contains("D") ) { + return null; // fragments contain indels so don't merge them + } + + final GATKSAMRecord firstRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedFirstRead)); + final GATKSAMRecord secondRead = ReadClipper.hardClipAdaptorSequence(ReadClipper.revertSoftClippedBases(unclippedSecondRead)); if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { return null; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A } - if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { - return null; // fragments contain indels so don't merge them - } - final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getAlignmentStart()); final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); final int numBases = firstReadStop + secondRead.getReadLength(); @@ -264,7 +269,7 @@ public final class FragmentUtils { final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); returnRead.setIsStrandless(true); - returnRead.setAlignmentStart( firstRead.getSoftStart() ); + returnRead.setAlignmentStart( firstRead.getAlignmentStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 4f49eb933..e9600480a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -222,7 +222,7 @@ public class FragmentUtilsUnitTest extends BaseTest { return read; } - @Test(enabled = true, dataProvider = "MergeFragmentsTest") + @Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest") public void testMergingTwoReads(final String name, final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); @@ -240,4 +240,60 @@ public class FragmentUtilsUnitTest extends BaseTest { Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); } } + + @Test(enabled = !DEBUG) + public void testHardClippingBeforeMerge() { + final String common = Utils.dupString("A", 10); + final byte[] commonQuals = Utils.dupBytes((byte)30, common.length()); + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, common, commonQuals, "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, common, commonQuals, adapter, 30, 10); + final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10); + read1.setCigarString("4S" + common.length() + "M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString(common.length() + "M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = common.length() - 1; + read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); + Assert.assertEquals(actual.getReadBases(), expectedMerged.getReadBases()); + Assert.assertEquals(actual.getReadGroup(), expectedMerged.getReadGroup()); + Assert.assertEquals(actual.getMappingQuality(), expectedMerged.getMappingQuality()); + for ( final EventType type : EventType.values() ) + Assert.assertEquals(actual.getBaseQualities(type), expectedMerged.getBaseQualities(type), "Failed base qualities for event type " + type); + } + + @Test(enabled = true) + public void testHardClippingBeforeMergeResultingInCompletelyContainedSecondRead() { + final String adapter = "NNNN"; + + final GATKSAMRecord read1 = makeOverlappingRead(adapter, 30, Utils.dupString("A", 10), Utils.dupBytes((byte)30, 10), "", 30, 10); + final GATKSAMRecord read2 = makeOverlappingRead("", 30, Utils.dupString("A", 7), Utils.dupBytes((byte)30, 7), adapter, 30, 10); + read1.setCigarString("4S10M"); + read1.setProperPairFlag(true); + read1.setFirstOfPairFlag(true); + read1.setReadNegativeStrandFlag(true); + read1.setMateAlignmentStart(10); + read2.setCigarString("7M4S"); + read2.setProperPairFlag(true); + read2.setFirstOfPairFlag(false); + read2.setReadNegativeStrandFlag(false); + + final int insertSize = 7 - 1; + read1.setInferredInsertSize(insertSize); + read2.setInferredInsertSize(-insertSize); + + final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); + Assert.assertNull(actual); + } }