diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java index 07c7bfc8e..ae7057de3 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java @@ -40,8 +40,8 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; * This filter is intended to filter out reads that are potentially from foreign organisms. * From experience with sequencing of human DNA we have found cases of contamination by bacterial * organisms; the symptoms of such contamination are a class of reads with only a small number - * of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended - * to remove such reads. + * of aligned bases and additionally many soft-clipped bases. This filter is intended + * to remove such reads. Consecutive soft-clipped blocks are treated as a single block *

* */ @@ -50,25 +50,31 @@ public class OverclippedReadFilter extends ReadFilter { @Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false) int tooShort = 30; + @Argument(fullName = "do_not_require_softclips_both_ends", shortName = "NoRequireSCBothEnds", doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block.", required = false) + Boolean doNotRequireSoftclipsOnBothEnds = false; + public boolean filterOut(final SAMRecord read) { - boolean sawLeadingSoftclip = false; - boolean sawAlignedBase = false; int alignedLength = 0; + int softClipBlocks = 0; + int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2; + CigarOperator lastOperator = null; for ( final CigarElement element : read.getCigar().getCigarElements() ) { if ( element.getOperator() == CigarOperator.S ) { - if ( sawAlignedBase ) // if this is true then we must also have seen a leading soft-clip - return (alignedLength < tooShort); - sawLeadingSoftclip = true; + //Treat consecutive S blocks as a single one + if(lastOperator != CigarOperator.S){ + softClipBlocks += 1; + } + } else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above) - if ( !sawLeadingSoftclip ) - return false; - sawAlignedBase = true; alignedLength += element.getLength(); } + lastOperator = element.getOperator(); } - return false; + return(alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks); + } + } diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java index 3400b61e9..85d95b5f2 100644 --- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java @@ -42,11 +42,12 @@ import java.util.*; public class OverclippedReadFilterUnitTest extends ReadFilterTest { @Test(enabled = true, dataProvider= "OverclippedDataProvider") - public void testOverclippedFilter(final String cigarString, final boolean expectedResult) { + public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftclipsOnBothEnds, final boolean expectedResult) { - final OverclippedReadFilter filter = new OverclippedReadFilter(); - final SAMRecord read = buildSAMRecord(cigarString); - Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString); + final OverclippedReadFilter filter = new OverclippedReadFilter(); + filter.doNotRequireSoftclipsOnBothEnds = doNotRequireSoftclipsOnBothEnds; + final SAMRecord read = buildSAMRecord(cigarString); + Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString); } private SAMRecord buildSAMRecord(final String cigarString) { @@ -58,20 +59,47 @@ public class OverclippedReadFilterUnitTest extends ReadFilterTest { public Iterator overclippedDataProvider() { final List result = new LinkedList(); - result.add(new Object[] { "1S10M1S", true }); - result.add(new Object[] { "1S10X1S", true }); - result.add(new Object[] { "1H1S10M1S1H", true }); - result.add(new Object[] { "1S40M1S", false }); - result.add(new Object[] { "1S40X1S", false }); - result.add(new Object[] { "1H10M1S", false }); - result.add(new Object[] { "1S10M1H", false }); - result.add(new Object[] { "10M1S", false }); - result.add(new Object[] { "1S10M", false }); - result.add(new Object[] { "1S10M10D10M1S", true }); - result.add(new Object[] { "1S1M40I1S", false }); - result.add(new Object[] { "1S10I1S", true }); - result.add(new Object[] { "1S40I1S", false }); + result.add(new Object[] { "1S10M1S", false, true }); + result.add(new Object[] { "1S10X1S", false, true }); + result.add(new Object[] { "1H1S10M1S1H", false, true }); + result.add(new Object[] { "1S40M1S", false, false}); + result.add(new Object[] { "1S40X1S", false, false }); + result.add(new Object[] { "1H10M1S", false, false}); + result.add(new Object[] { "1S10M1H", false, false}); + + result.add(new Object[] { "10M1S", false, false}); + result.add(new Object[] { "1S10M", false, false}); + + result.add(new Object[] { "10M1S", true, true}); + result.add(new Object[] { "1S10M", true, true}); + + result.add(new Object[] { "1S10M10D10M1S", false, true }); + result.add(new Object[] { "1S1M40I1S", false, false }); + + result.add(new Object[] { "1S10I1S", false, true }); + result.add(new Object[] { "1S40I1S", false, false }); + result.add(new Object[] { "1S40I1S", true, false }); + + result.add(new Object[] { "25S40I25M", true, false }); + + //Read is too short once soft-clipping removed + result.add(new Object[] { "25S25M", true, true }); + result.add(new Object[] { "25S25X", true, true }); + result.add(new Object[] { "25S25H", true, true }); + result.add(new Object[] { "25S25H", false, false }); + + result.add(new Object[] { "25S25M25S", false, true }); + result.add(new Object[] { "25M25S", true, true }); + result.add(new Object[] { "25S25M", true, true }); + + result.add(new Object[] { "25S35S", true, true }); + + //Read long enough even with soft clipping removed + result.add(new Object[] { "25S35M25S", true, false }); + result.add(new Object[] { "35M25S", true, false }); + result.add(new Object[] { "25S35M", true, false }); return result.iterator(); } + }