From 62625b4bc66567bb25ce72d2a95bc56f3a3055b4 Mon Sep 17 00:00:00 2001
From: Jacob Silterra
Date: Wed, 12 Aug 2015 10:38:27 -0400
Subject: [PATCH] Add option to not require soft-clips on both ends
Previous version of OverclippedReadFilter would only filter a read if both ends of a read had a soft-clipped block.
This adds a boolean option to relax that requirement, and only require 1 soft-clipped block, while also filtering on read length - softclipped length
---
.../engine/filters/OverclippedReadFilter.java | 28 +++++----
.../OverclippedReadFilterUnitTest.java | 62 ++++++++++++++-----
2 files changed, 62 insertions(+), 28 deletions(-)
diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java
index 07c7bfc8e..ae7057de3 100644
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilter.java
@@ -40,8 +40,8 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
* This filter is intended to filter out reads that are potentially from foreign organisms.
* From experience with sequencing of human DNA we have found cases of contamination by bacterial
* organisms; the symptoms of such contamination are a class of reads with only a small number
- * of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended
- * to remove such reads.
+ * of aligned bases and additionally many soft-clipped bases. This filter is intended
+ * to remove such reads. Consecutive soft-clipped blocks are treated as a single block
*
*
*/
@@ -50,25 +50,31 @@ public class OverclippedReadFilter extends ReadFilter {
@Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
int tooShort = 30;
+ @Argument(fullName = "do_not_require_softclips_both_ends", shortName = "NoRequireSCBothEnds", doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block.", required = false)
+ Boolean doNotRequireSoftclipsOnBothEnds = false;
+
public boolean filterOut(final SAMRecord read) {
- boolean sawLeadingSoftclip = false;
- boolean sawAlignedBase = false;
int alignedLength = 0;
+ int softClipBlocks = 0;
+ int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2;
+ CigarOperator lastOperator = null;
for ( final CigarElement element : read.getCigar().getCigarElements() ) {
if ( element.getOperator() == CigarOperator.S ) {
- if ( sawAlignedBase ) // if this is true then we must also have seen a leading soft-clip
- return (alignedLength < tooShort);
- sawLeadingSoftclip = true;
+ //Treat consecutive S blocks as a single one
+ if(lastOperator != CigarOperator.S){
+ softClipBlocks += 1;
+ }
+
} else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above)
- if ( !sawLeadingSoftclip )
- return false;
- sawAlignedBase = true;
alignedLength += element.getLength();
}
+ lastOperator = element.getOperator();
}
- return false;
+ return(alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks);
+
}
+
}
diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
index 3400b61e9..85d95b5f2 100644
--- a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
+++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/OverclippedReadFilterUnitTest.java
@@ -42,11 +42,12 @@ import java.util.*;
public class OverclippedReadFilterUnitTest extends ReadFilterTest {
@Test(enabled = true, dataProvider= "OverclippedDataProvider")
- public void testOverclippedFilter(final String cigarString, final boolean expectedResult) {
+ public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftclipsOnBothEnds, final boolean expectedResult) {
- final OverclippedReadFilter filter = new OverclippedReadFilter();
- final SAMRecord read = buildSAMRecord(cigarString);
- Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
+ final OverclippedReadFilter filter = new OverclippedReadFilter();
+ filter.doNotRequireSoftclipsOnBothEnds = doNotRequireSoftclipsOnBothEnds;
+ final SAMRecord read = buildSAMRecord(cigarString);
+ Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
}
private SAMRecord buildSAMRecord(final String cigarString) {
@@ -58,20 +59,47 @@ public class OverclippedReadFilterUnitTest extends ReadFilterTest {
public Iterator