From 8c81e7df955a48fbdb9871152feaf40d9480b895 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 21 May 2015 12:54:42 -0400 Subject: [PATCH] Added a new filter that can be used to remove reads that are too small and overly clipped. --- .../engine/filters/OverclippedReadFilter.java | 74 ++++++++++++++++++ .../OverclippedReadFilterUnitTest.java | 77 +++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java create mode 100644 public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java new file mode 100644 index 000000000..07c7bfc8e --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java @@ -0,0 +1,74 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +/** + * Filter out reads that are over-soft-clipped + * + *

<p> + * This filter is intended to filter out reads that are potentially from foreign organisms. + * From experience with sequencing of human DNA we have found cases of contamination by bacterial + * organisms; the symptoms of such contamination are a class of reads with only a small number + * of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended + * to remove such reads. + * </p>

+ *
+ */
+public class OverclippedReadFilter extends ReadFilter {
+
+    /**
+     * Aligned-base threshold: a read that is soft-clipped on both ends is filtered out
+     * when it has strictly fewer aligned bases than this value.
+     */
+    @Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
+    int tooShort = 30;
+
+    /**
+     * Decides whether a read is over-clipped, i.e. it has a soft-clip before its first
+     * aligned base, a soft-clip after its aligned bases, and fewer than {@code tooShort}
+     * aligned bases in between.
+     *
+     * Only CIGAR elements that consume read bases are looked at; every other element is
+     * skipped, so it neither adds to the aligned length nor counts as a clip. The decision
+     * is made as soon as the first soft-clip following an aligned base is reached.
+     *
+     * @param read the read to examine
+     * @return true if the read should be filtered out; false otherwise, including when the
+     *         read has no CIGAR at all
+     */
+    @Override
+    public boolean filterOut(final SAMRecord read) {
+        // SAMRecord.getCigar() may return null when the record has no CIGAR; such a read
+        // has no soft-clips, so it cannot be over-clipped.
+        final Cigar cigar = read.getCigar();
+        if ( cigar == null ) {
+            return false;
+        }
+
+        boolean sawLeadingSoftclip = false;
+        boolean sawAlignedBase = false;
+        int alignedLength = 0;
+
+        for ( final CigarElement element : cigar.getCigarElements() ) {
+            final CigarOperator operator = element.getOperator();
+
+            if ( operator == CigarOperator.S ) {
+                // A soft-clip after aligned bases is the trailing clip; aligned bases are
+                // only ever recorded after a leading soft-clip, so both ends are clipped here.
+                if ( sawAlignedBase ) {
+                    return alignedLength < tooShort;
+                }
+                sawLeadingSoftclip = true;
+            } else if ( operator.consumesReadBases() ) {   // M, I, X, and EQ (S was already accounted for above)
+                // Aligned bases with no soft-clip before them: the read is not clipped on both ends.
+                if ( !sawLeadingSoftclip ) {
+                    return false;
+                }
+                sawAlignedBase = true;
+                alignedLength += element.getLength();
+            }
+        }
+
+        // The end of the CIGAR was reached without a trailing soft-clip.
+        return false;
+    }
+}
diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
new file mode 100644
index 000000000..3400b61e9
--- /dev/null
+++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
@@ -0,0 +1,77 @@
+/*
+* Copyright (c) 2012 The Broad Institute
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use,
+* copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following
+* conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all
copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+package org.broadinstitute.gatk.engine.filters;
+
+
+import htsjdk.samtools.Cigar;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.TextCigarCodec;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+
+
+/**
+ * Tests for the OverclippedReadFilter
+ */
+public class OverclippedReadFilterUnitTest extends ReadFilterTest {
+
+    /**
+     * Runs a freshly constructed filter (so its threshold keeps the default value) on a
+     * read built from the given CIGAR and checks the filtering decision.
+     *
+     * @param cigarString    CIGAR of the read under test; also used as the assertion message
+     * @param expectedResult the value filterOut() is expected to return for that read
+     */
+    @Test(enabled = true, dataProvider= "OverclippedDataProvider")
+    public void testOverclippedFilter(final String cigarString, final boolean expectedResult) {
+
+        final OverclippedReadFilter filter = new OverclippedReadFilter();
+        final SAMRecord read = buildSAMRecord(cigarString);
+        Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
+    }
+
+    /**
+     * Builds a read carrying the given CIGAR via the createRead helper.
+     * NOTE(review): the meaning of the numeric arguments passed to createRead is not
+     * visible from this file -- confirm against ReadFilterTest.
+     */
+    private SAMRecord buildSAMRecord(final String cigarString) {
+        final Cigar cigar = TextCigarCodec.decode(cigarString);
+        return this.createRead(cigar, 1, 0, 10);
+    }
+
+    /**
+     * Supplies { CIGAR string, expected filterOut() result } pairs.
+     */
+    @DataProvider(name= "OverclippedDataProvider")
+    public Iterator<Object[]> overclippedDataProvider() {
+        final List<Object[]> result = new ArrayList<Object[]>();
+
+        // soft-clipped on both ends with few aligned bases; hard clips do not change the outcome
+        result.add(new Object[] { "1S10M1S", true });
+        result.add(new Object[] { "1S10X1S", true });
+        result.add(new Object[] { "1H1S10M1S1H", true });
+        // soft-clipped on both ends, but with enough aligned bases
+        result.add(new Object[] { "1S40M1S", false });
+        result.add(new Object[] { "1S40X1S", false });
+        // soft-clipped on one end only (a hard clip does not count as a soft-clip)
+        result.add(new Object[] { "1H10M1S", false });
+        result.add(new Object[] { "1S10M1H", false });
+        result.add(new Object[] { "10M1S", false });
+        result.add(new Object[] { "1S10M", false });
+        // deletions do not add to the aligned length; insertions do
+        result.add(new Object[] { "1S10M10D10M1S", true });
+        result.add(new Object[] { "1S1M40I1S", false });
+        result.add(new Object[] { "1S10I1S", true });
+        result.add(new Object[] { "1S40I1S", false });
+
+        return result.iterator();
+    }
+}