Add option to not require soft-clips on both ends

Previous version of OverclippedReadFilter would only filter a read if both ends of a read had a soft-clipped block.
This adds a boolean option to relax that requirement, and only require 1 soft-clipped block, while also filtering on read length - softclipped length
This commit is contained in:
Jacob Silterra 2015-08-12 10:38:27 -04:00
parent f73871c865
commit 62625b4bc6
2 changed files with 62 additions and 28 deletions

View File

@ -40,8 +40,8 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
* This filter is intended to filter out reads that are potentially from foreign organisms.
* From experience with sequencing of human DNA we have found cases of contamination by bacterial
* organisms; the symptoms of such contamination are a class of reads with only a small number
* of aligned bases and additionally many soft-clipped bases on both ends. This filter is intended
* to remove such reads.
* of aligned bases and additionally many soft-clipped bases. This filter is intended
* to remove such reads. Consecutive soft-clipped blocks are treated as a single block
* </p>
*
*/
@ -50,25 +50,31 @@ public class OverclippedReadFilter extends ReadFilter {
@Argument(fullName = "filter_is_too_short_value", shortName = "filterTooShort", doc = "Value for which reads with less than this number of aligned bases is considered too short", required = false)
int tooShort = 30;
@Argument(fullName = "do_not_require_softclips_both_ends", shortName = "NoRequireSCBothEnds", doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block.", required = false)
Boolean doNotRequireSoftclipsOnBothEnds = false;
public boolean filterOut(final SAMRecord read) {
boolean sawLeadingSoftclip = false;
boolean sawAlignedBase = false;
int alignedLength = 0;
int softClipBlocks = 0;
int minSoftClipBlocks = doNotRequireSoftclipsOnBothEnds ? 1 : 2;
CigarOperator lastOperator = null;
for ( final CigarElement element : read.getCigar().getCigarElements() ) {
if ( element.getOperator() == CigarOperator.S ) {
if ( sawAlignedBase ) // if this is true then we must also have seen a leading soft-clip
return (alignedLength < tooShort);
sawLeadingSoftclip = true;
//Treat consecutive S blocks as a single one
if(lastOperator != CigarOperator.S){
softClipBlocks += 1;
}
} else if ( element.getOperator().consumesReadBases() ) { // M, I, X, and EQ (S was already accounted for above)
if ( !sawLeadingSoftclip )
return false;
sawAlignedBase = true;
alignedLength += element.getLength();
}
lastOperator = element.getOperator();
}
return false;
return(alignedLength < tooShort && softClipBlocks >= minSoftClipBlocks);
}
}

View File

@ -42,11 +42,12 @@ import java.util.*;
public class OverclippedReadFilterUnitTest extends ReadFilterTest {
@Test(enabled = true, dataProvider= "OverclippedDataProvider")
public void testOverclippedFilter(final String cigarString, final boolean expectedResult) {
public void testOverclippedFilter(final String cigarString, boolean doNotRequireSoftclipsOnBothEnds, final boolean expectedResult) {
final OverclippedReadFilter filter = new OverclippedReadFilter();
final SAMRecord read = buildSAMRecord(cigarString);
Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
final OverclippedReadFilter filter = new OverclippedReadFilter();
filter.doNotRequireSoftclipsOnBothEnds = doNotRequireSoftclipsOnBothEnds;
final SAMRecord read = buildSAMRecord(cigarString);
Assert.assertEquals(filter.filterOut(read), expectedResult, cigarString);
}
private SAMRecord buildSAMRecord(final String cigarString) {
@ -58,20 +59,47 @@ public class OverclippedReadFilterUnitTest extends ReadFilterTest {
public Iterator<Object[]> overclippedDataProvider() {
final List<Object[]> result = new LinkedList<Object[]>();
result.add(new Object[] { "1S10M1S", true });
result.add(new Object[] { "1S10X1S", true });
result.add(new Object[] { "1H1S10M1S1H", true });
result.add(new Object[] { "1S40M1S", false });
result.add(new Object[] { "1S40X1S", false });
result.add(new Object[] { "1H10M1S", false });
result.add(new Object[] { "1S10M1H", false });
result.add(new Object[] { "10M1S", false });
result.add(new Object[] { "1S10M", false });
result.add(new Object[] { "1S10M10D10M1S", true });
result.add(new Object[] { "1S1M40I1S", false });
result.add(new Object[] { "1S10I1S", true });
result.add(new Object[] { "1S40I1S", false });
result.add(new Object[] { "1S10M1S", false, true });
result.add(new Object[] { "1S10X1S", false, true });
result.add(new Object[] { "1H1S10M1S1H", false, true });
result.add(new Object[] { "1S40M1S", false, false});
result.add(new Object[] { "1S40X1S", false, false });
result.add(new Object[] { "1H10M1S", false, false});
result.add(new Object[] { "1S10M1H", false, false});
result.add(new Object[] { "10M1S", false, false});
result.add(new Object[] { "1S10M", false, false});
result.add(new Object[] { "10M1S", true, true});
result.add(new Object[] { "1S10M", true, true});
result.add(new Object[] { "1S10M10D10M1S", false, true });
result.add(new Object[] { "1S1M40I1S", false, false });
result.add(new Object[] { "1S10I1S", false, true });
result.add(new Object[] { "1S40I1S", false, false });
result.add(new Object[] { "1S40I1S", true, false });
result.add(new Object[] { "25S40I25M", true, false });
//Read is too short once soft-clipping removed
result.add(new Object[] { "25S25M", true, true });
result.add(new Object[] { "25S25X", true, true });
result.add(new Object[] { "25S25H", true, true });
result.add(new Object[] { "25S25H", false, false });
result.add(new Object[] { "25S25M25S", false, true });
result.add(new Object[] { "25M25S", true, true });
result.add(new Object[] { "25S25M", true, true });
result.add(new Object[] { "25S35S", true, true });
//Read long enough even with soft clipping removed
result.add(new Object[] { "25S35M25S", true, false });
result.add(new Object[] { "35M25S", true, false });
result.add(new Object[] { "25S35M", true, false });
return result.iterator();
}
}