Add option to not require soft-clips on both ends

Previous version of OverclippedReadFilter would only filter a read if both ends of a read had a soft-clipped block.
This adds a boolean option to relax that requirement, and only require 1 soft-clipped block, while also filtering on read length - softclipped length
This commit is contained in:
Jacob Silterra 2015-08-12 10:38:27 -04:00
parent f73871c865
commit 62625b4bc6
2 changed files with 62 additions and 28 deletions

View File

@ -40,8 +40,8 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
* This filter is intended to filter out reads that are potentially from foreign organisms. * This filter is intended to filter out reads that are potentially from foreign organisms.
* From experience with sequencing of human DNA we have found cases of contamination by bacterial * From experience with sequencing of human DNA we have found cases of contamination by bacterial
* organisms; the symptoms of such contamination are a class of reads with only a small number * organisms; the symptoms of such contamination are a class of reads with only a small number
* of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended * of aligned bases and additionally many soft-clipped bases. This filter is intended
* to remove such reads. * to remove such reads. Consecutive soft-clipped blocks are treated as a single block
* </p> * </p>
* *
*/ */
@ -50,25 +50,31 @@ public class OverclippedReadFilter extends ReadFilter {
@Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false) @Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
int tooShort = 30; int tooShort = 30;
@Argument(fullName = "do_not_require_softclips_both_ends", shortName = "NoRequireSCBothEnds", doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block.", required = false)
Boolean doNotRequireSoftclipsOnBothEnds = false;
public boolean filterOut(final SAMRecord read) { public boolean filterOut(final SAMRecord read) {
boolean sawLeadingSoftclip = false;
boolean sawAlignedBase = false;
int alignedLength = 0; int alignedLength = 0;
int softClipBlocks = 0;
int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2;
CigarOperator lastOperator = null;
for ( final CigarElement element : read.getCigar().getCigarElements() ) { for ( final CigarElement element : read.getCigar().getCigarElements() ) {
if ( element.getOperator() == CigarOperator.S ) { if ( element.getOperator() == CigarOperator.S ) {
if ( sawAlignedBase ) // if this is true then we must also have seen a leading soft-clip //Treat consecutive S blocks as a single one
return (alignedLength < tooShort); if(lastOperator != CigarOperator.S){
sawLeadingSoftclip = true; softClipBlocks += 1;
}
} else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above) } else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above)
if ( !sawLeadingSoftclip )
return false;
sawAlignedBase = true;
alignedLength += element.getLength(); alignedLength += element.getLength();
} }
lastOperator = element.getOperator();
} }
return false; return(alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks);
} }
} }

View File

@ -42,11 +42,12 @@ import java.util.*;
public class OverclippedReadFilterUnitTest extends ReadFilterTest { public class OverclippedReadFilterUnitTest extends ReadFilterTest {
@Test(enabled = true, dataProvider= "OverclippedDataProvider") @Test(enabled = true, dataProvider= "OverclippedDataProvider")
public void testOverclippedFilter(final String cigarString, final boolean expectedResult) { public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftclipsOnBothEnds, final boolean expectedResult) {
final OverclippedReadFilter filter = new OverclippedReadFilter(); final OverclippedReadFilter filter = new OverclippedReadFilter();
final SAMRecord read = buildSAMRecord(cigarString); filter.doNotRequireSoftclipsOnBothEnds = doNotRequireSoftclipsOnBothEnds;
Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString); final SAMRecord read = buildSAMRecord(cigarString);
Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
} }
private SAMRecord buildSAMRecord(final String cigarString) { private SAMRecord buildSAMRecord(final String cigarString) {
@ -58,20 +59,47 @@ public class OverclippedReadFilterUnitTest extends ReadFilterTest {
public Iterator<Object[]> overclippedDataProvider() { public Iterator<Object[]> overclippedDataProvider() {
final List<Object[]> result = new LinkedList<Object[]>(); final List<Object[]> result = new LinkedList<Object[]>();
result.add(new Object[] { "1S10M1S", true }); result.add(new Object[] { "1S10M1S", false, true });
result.add(new Object[] { "1S10X1S", true }); result.add(new Object[] { "1S10X1S", false, true });
result.add(new Object[] { "1H1S10M1S1H", true }); result.add(new Object[] { "1H1S10M1S1H", false, true });
result.add(new Object[] { "1S40M1S", false }); result.add(new Object[] { "1S40M1S", false, false});
result.add(new Object[] { "1S40X1S", false }); result.add(new Object[] { "1S40X1S", false, false });
result.add(new Object[] { "1H10M1S", false }); result.add(new Object[] { "1H10M1S", false, false});
result.add(new Object[] { "1S10M1H", false }); result.add(new Object[] { "1S10M1H", false, false});
result.add(new Object[] { "10M1S", false });
result.add(new Object[] { "1S10M", false }); result.add(new Object[] { "10M1S", false, false});
result.add(new Object[] { "1S10M10D10M1S", true }); result.add(new Object[] { "1S10M", false, false});
result.add(new Object[] { "1S1M40I1S", false });
result.add(new Object[] { "1S10I1S", true }); result.add(new Object[] { "10M1S", true, true});
result.add(new Object[] { "1S40I1S", false }); result.add(new Object[] { "1S10M", true, true});
result.add(new Object[] { "1S10M10D10M1S", false, true });
result.add(new Object[] { "1S1M40I1S", false, false });
result.add(new Object[] { "1S10I1S", false, true });
result.add(new Object[] { "1S40I1S", false, false });
result.add(new Object[] { "1S40I1S", true, false });
result.add(new Object[] { "25S40I25M", true, false });
//Read is too short once soft-clipping removed
result.add(new Object[] { "25S25M", true, true });
result.add(new Object[] { "25S25X", true, true });
result.add(new Object[] { "25S25H", true, true });
result.add(new Object[] { "25S25H", false, false });
result.add(new Object[] { "25S25M25S", false, true });
result.add(new Object[] { "25M25S", true, true });
result.add(new Object[] { "25S25M", true, true });
result.add(new Object[] { "25S35S", true, true });
//Read long enough even with soft clipping removed
result.add(new Object[] { "25S35M25S", true, false });
result.add(new Object[] { "35M25S", true, false });
result.add(new Object[] { "25S35M", true, false });
return result.iterator(); return result.iterator();
} }
} }