Merge pull request #242 from broadinstitute/vrr_N_cigar_error_and_override_option

Reads with N operator in CIGAR now result in a User exception. Option to filter them out is provided.
This commit is contained in:
Eric Banks 2013-06-10 08:46:15 -07:00
commit 2a935374f3
10 changed files with 836 additions and 79 deletions

View File

@ -288,9 +288,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testNsInCigar() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
Arrays.asList("2ae3fd39c53a6954d32faed8703adfe8"));
UserException.UnsupportedCigarOperatorException.class);
executeTest("test calling on reads with Ns in CIGAR", spec);
}
}

View File

@ -36,6 +36,8 @@ public class ValidationExclusion {
// our validation options
public enum TYPE {
ALLOW_N_CIGAR_READS, // ignore the presence of N operators in CIGARs: do not blow up and process reads that contain one or more N operators.
// This exclusion does not have effect on reads that get filtered {@see MalformedReadFilter}.
ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file

View File

@ -25,14 +25,16 @@
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMTagUtil;
import net.sf.samtools.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Collections;
/**
* Filter out malformed reads.
*
@ -40,20 +42,46 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
* @version 0.1
*/
public class MalformedReadFilter extends ReadFilter {
private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ;
private SAMFileHeader header;
@Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false)
boolean filterReadsWithNCigar = false;
@Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false)
boolean filterMismatchingBaseAndQuals = false;
@Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false)
boolean filterBasesNotStored = false;
/**
* Indicates the applicable validation exclusions
*/
private boolean allowNCigars;
@Override
public void initialize(GenomeAnalysisEngine engine) {
this.header = engine.getSAMFileHeader();
public void initialize(final GenomeAnalysisEngine engine) {
header = engine.getSAMFileHeader();
ValidationExclusion validationExclusions = null;
final SAMDataSource rds = engine.getReadsDataSource();
if (rds != null) {
final ReadProperties rps = rds.getReadsInfo();
if (rps != null) {
validationExclusions = rps.getValidationExclusionList();
}
}
if (validationExclusions == null) {
allowNCigars = false;
} else {
allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS);
}
}
public boolean filterOut(SAMRecord read) {
public boolean filterOut(final SAMRecord read) {
// slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided
return !checkInvalidAlignmentStart(read) ||
!checkInvalidAlignmentEnd(read) ||
@ -61,7 +89,8 @@ public class MalformedReadFilter extends ReadFilter {
!checkHasReadGroup(read) ||
!checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) ||
!checkCigarDisagreesWithAlignment(read) ||
!checkSeqStored(read, filterBasesNotStored);
!checkSeqStored(read, filterBasesNotStored) ||
!checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars);
}
private static boolean checkHasReadGroup(final SAMRecord read) {
@ -80,7 +109,7 @@ public class MalformedReadFilter extends ReadFilter {
* @param read The read to validate.
* @return true if read start is valid, false otherwise.
*/
private static boolean checkInvalidAlignmentStart( SAMRecord read ) {
private static boolean checkInvalidAlignmentStart(final SAMRecord read ) {
// read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START )
return false;
@ -95,7 +124,7 @@ public class MalformedReadFilter extends ReadFilter {
* @param read The read to validate.
* @return true if read end is valid, false otherwise.
*/
private static boolean checkInvalidAlignmentEnd( SAMRecord read ) {
private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) {
// Alignment aligns to negative number of bases in the reference.
if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 )
return false;
@ -108,11 +137,11 @@ public class MalformedReadFilter extends ReadFilter {
* @param read The read to verify.
* @return true if alignment agrees with header, false othrewise.
*/
private static boolean checkAlignmentDisagreesWithHeader( SAMFileHeader header, SAMRecord read ) {
private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) {
// Read is aligned to nonexistent contig
if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START )
return false;
SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() );
final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() );
// Read is aligned to a point after the end of the contig
if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() )
return false;
@ -124,7 +153,7 @@ public class MalformedReadFilter extends ReadFilter {
* @param read The read to validate.
* @return true if cigar agrees with alignment, false otherwise.
*/
private static boolean checkCigarDisagreesWithAlignment(SAMRecord read) {
private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) {
// Read has a valid alignment start, but the CIGAR string is empty
if( !read.getReadUnmappedFlag() &&
read.getAlignmentStart() != -1 &&
@ -134,13 +163,72 @@ public class MalformedReadFilter extends ReadFilter {
return true;
}
/**
* Check for unsupported CIGAR operators.
* Currently the N operator is not supported.
* @param read The read to validate.
* @param filterReadsWithNCigar whether the offending read should just
* be silently filtered or not.
* @param allowNCigars whether reads that contain N operators in their CIGARs
* can be processed or an exception should be thrown instead.
* @throws UserException.UnsupportedCigarOperatorException
* if {@link #filterReadsWithNCigar} is <code>false</code> and
* the input read has some unsupported operation.
* @return <code>true</code> if the read CIGAR operations are
* fully supported, otherwise <code>false</code>, as long as
* no exception has been thrown.
*/
private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) {
if( containsNOperator(read)) {
if (! filterReadsWithNCigar && !allowNCigars) {
throw new UserException.UnsupportedCigarOperatorException(
CigarOperator.N,read,
"Perhaps you are"
+ " trying to use RNA-Seq data?"
+ " While we are currently actively working to"
+ " support this data type unfortunately the"
+ " GATK cannot be used with this data in its"
+ " current form. You have the option of either"
+ " filtering out all reads with operator "
+ CigarOperator.N + " in their CIGAR string"
+ " (please add --"
+ FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME
+ " to your command line) or"
+ " assume the risk of processing those reads as they"
+ " are including the pertinent unsafe flag (please add -U"
+ ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS
+ " to your command line). Notice however that if you were"
+ " to choose the latter, an unspecified subset of the"
+ " analytical outputs of an unspecified subset of the tools"
+ " will become unpredictable. Consequently the GATK team"
+ " might well not be able to provide you with the usual support"
+ " with any issue regarding any output");
}
return ! filterReadsWithNCigar;
}
return true;
}
private static boolean containsNOperator(final SAMRecord read) {
final Cigar cigar = read.getCigar();
if (cigar == null) {
return false;
}
for (final CigarElement ce : cigar.getCigarElements()) {
if (ce.getOperator() == CigarOperator.N) {
return true;
}
}
return false;
}
/**
* Check if the read has the same number of bases and base qualities
* @param read the read to validate
* @return true if they have the same number. False otherwise.
*/
private static boolean checkMismatchingBasesAndQuals(SAMRecord read, boolean filterMismatchingBaseAndQuals) {
boolean result;
private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) {
final boolean result;
if (read.getReadLength() == read.getBaseQualities().length)
result = true;
else if (filterMismatchingBaseAndQuals)

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.exceptions;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
@ -87,6 +88,19 @@ public class UserException extends ReviewedStingException {
}
}
public static class UnsupportedCigarOperatorException extends UserException {
public UnsupportedCigarOperatorException(final CigarOperator co, final SAMRecord read, final String message) {
super(String.format(
"Unsupported CIGAR operator %s in read %s at %s:%d. %s",
co,
read.getReadName(),
read.getReferenceName(),
read.getAlignmentStart(),
message));
}
}
public static class MalformedGenomeLoc extends UserException {
public MalformedGenomeLoc(String message, GenomeLoc loc) {
super(String.format("Badly formed genome loc: %s: %s", message, loc));

View File

@ -131,6 +131,7 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
final String root = "-T ErrorThrowing -R " + exampleFASTA;
final String args = root + cfg.args + " -E " + cfg.expectedException.getSimpleName();
WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException);
executeTest(cfg.toString(), spec);
}
}

View File

@ -0,0 +1,77 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.Collections;
/**
* Tests for the {@link MalformedReadFilter} when the unsafe flag
* {@link ValidationExclusion.TYPE#ALLOW_N_CIGAR_READS} is set.
*
* @author Valentin Ruano-Rubio
* @since 6/6/13
*/
public class AllowNCigarMalformedReadFilterUnitTest extends MalformedReadFilterUnitTest {
@Override
protected ValidationExclusion composeValidationExclusion() {
return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS));
}
@Test(enabled = true,
dataProvider= "UnsupportedCigarOperatorDataProvider")
@CigarOperatorTest(CigarOperatorTest.Outcome.IGNORE)
public void testCigarNOperatorFilterIgnore(final String cigarString) {
final MalformedReadFilter filter = buildMalformedReadFilter(false);
final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString);
Assert.assertFalse(filter.filterOut(nContainingCigarRead),
"filters out N containing Cigar when it should ignore the fact");
}
@Test(enabled = false)
@Override
public void testCigarNOperatorFilterException(final String cigarString) {
// Nothing to do here.
// Just deactivates the parents test case.
}
}

View File

@ -25,11 +25,25 @@
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.utils.exceptions.UserException;
import net.sf.samtools.Cigar;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.TextCigarCodec;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.UserException.UnsupportedCigarOperatorException;
import java.lang.annotation.*;
import java.lang.reflect.Method;
import java.util.*;
/**
@ -38,14 +52,14 @@ import org.testng.annotations.Test;
* @author Eric Banks
* @since 3/14/13
*/
public class MalformedReadFilterUnitTest {
public class MalformedReadFilterUnitTest extends ReadFilterTest {
//////////////////////////////////////
// Test the checkSeqStored() method //
//////////////////////////////////////
@Test(enabled = true)
public void testcheckSeqStored () {
public void testCheckSeqStored () {
final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M");
final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M");
@ -59,4 +73,174 @@ public class MalformedReadFilterUnitTest {
Assert.assertTrue(false, "We should have exceptioned out in the previous line");
} catch (UserException e) { }
}
@Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider")
@CigarOperatorTest(CigarOperatorTest.Outcome.FILTER)
public void testCigarNOperatorFilterTruePositive(String cigarString) {
final MalformedReadFilter filter = buildMalformedReadFilter(true);
final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString);
Assert.assertTrue(filter.filterOut(nContainingCigarRead),
" Did not filtered out a N containing CIGAR read");
}
@Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider")
@CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT)
public void testCigarNOperatorFilterTrueNegative(String cigarString) {
final MalformedReadFilter filter = buildMalformedReadFilter(true);
final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString);
Assert.assertFalse(filter.filterOut(nonNContainingCigarRead),
" Filtered out a non-N containing CIGAR read");
}
@Test(enabled = true,
expectedExceptions = UnsupportedCigarOperatorException.class,
dataProvider= "UnsupportedCigarOperatorDataProvider")
@CigarOperatorTest(CigarOperatorTest.Outcome.EXCEPTION)
public void testCigarNOperatorFilterException(final String cigarString) {
final MalformedReadFilter filter = buildMalformedReadFilter(false);
final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString);
filter.filterOut(nContainingCigarRead);
}
@Test(enabled = true, dataProvider="UnsupportedCigarOperatorDataProvider")
@CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT)
public void testCigarNOperatorFilterControl(final String cigarString) {
final MalformedReadFilter filter = buildMalformedReadFilter(false);
final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString);
Assert.assertFalse(filter.filterOut(nonNContainingCigarRead));
}
protected SAMRecord buildSAMRecord(final String cigarString) {
final Cigar nContainingCigar = TextCigarCodec.getSingleton().decode(cigarString);
return this.createRead(nContainingCigar, 1, 0, 10);
}
protected MalformedReadFilter buildMalformedReadFilter(final boolean filterRNO) {
return buildMalformedReadFiter(filterRNO,new ValidationExclusion.TYPE[] {});
}
protected MalformedReadFilter buildMalformedReadFiter(boolean filterRNO, final ValidationExclusion.TYPE... excl) {
final ValidationExclusion ve = new ValidationExclusion(Arrays.asList(excl));
final MalformedReadFilter filter = new MalformedReadFilter();
final SAMFileHeader h = getHeader();
final SAMDataSource ds = getDataSource();
final GenomeAnalysisEngine gae = new GenomeAnalysisEngine() {
@Override
public SAMFileHeader getSAMFileHeader() {
return h;
}
@Override
public SAMDataSource getReadsDataSource() {
return ds;
}
};
filter.initialize(gae);
filter.filterReadsWithNCigar = filterRNO;
return filter;
}
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.METHOD)
@Inherited
protected @interface CigarOperatorTest {
enum Outcome {
ANY,ACCEPT,FILTER,EXCEPTION,IGNORE;
public boolean appliesTo (String cigar) {
boolean hasN = cigar.indexOf('N') != -1;
switch (this) {
case ANY: return true;
case ACCEPT: return !hasN;
case IGNORE: return hasN;
case FILTER:
case EXCEPTION:
default:
return hasN;
}
}
}
Outcome value() default Outcome.ANY;
}
/**
* Cigar test data for unsupported operator test.
* Each element of this array corresponds to a test case. In turn the first element of the test case array is the
* Cigar string for that test case and the second indicates whether it should be filtered due to the presence of a
* unsupported operator
*/
private static final String[] TEST_CIGARS = {
"101M10D20I10M",
"6M14N5M",
"1N",
"101M",
"110N",
"2N4M",
"4M2N",
"3M1I1M",
"1M2I2M",
"1M10N1I1M",
"1M1I1D",
"11N12M1I34M12N"
};
@DataProvider(name= "UnsupportedCigarOperatorDataProvider")
public Iterator<Object[]> unsupportedOperatorDataProvider(final Method testMethod) {
final CigarOperatorTest a = resolveCigarOperatorTestAnnotation(testMethod);
final List<Object[]> result = new LinkedList<Object[]>();
for (final String cigarString : TEST_CIGARS) {
if (a == null || a.value().appliesTo(cigarString)) {
result.add(new Object[] { cigarString });
}
}
return result.iterator();
}
/**
* Gets the most specific {@link CigarOperatorTest} annotation for the
* signature of the test method provided.
* <p/>
* This in-house implementation is required due to the fact that method
* annotations do not have inheritance.
*
* @param m targeted test method.
* @return <code>null</code> if there is no {@link CigarOperatorTest}
* annotation in this or overridden methods.
*/
private CigarOperatorTest resolveCigarOperatorTestAnnotation(final Method m) {
CigarOperatorTest res = m.getAnnotation(CigarOperatorTest.class);
if (res != null) {
return res;
}
Class<?> c = this.getClass();
Class<?> p = c.getSuperclass();
while (p != null && p != Object.class) {
try {
final Method met = p.getDeclaredMethod(m.getName(),
m.getParameterTypes());
res = met.getAnnotation(CigarOperatorTest.class);
if (res != null) {
break;
}
} catch (NoSuchMethodException e) {
// Its ok; nothing to do here, just keep looking.
}
c = p;
p = c.getSuperclass();
}
return res;
}
}

View File

@ -0,0 +1,370 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.*;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import java.util.*;
/**
* Class ReadBaseTest
* <p/>
* This is the base test class for read filter test classes. All read
* filter test cases should extend from this
* class; it sets ups a header mock up to test read filtering.
*
* Feel free to override non-final method to modify the behavior
* (i.e. change how read group id are formatted, or complete a header).
*
* <p/>
* You can statically determine the number of read-group involved
* in the test by calling {@link #ReadFilterTest(int)} in you constructor.
* <p/>
*
* Notice that the same header object is shared by all test and
* it is initialized by Junit (calling {@link #beforeClass()}.
*
* @author Valentin Ruano Rubio
* @date May 23, 2013
*/
public class ReadFilterTest extends BaseTest {
private static final int DEFAULT_READ_GROUP_COUNT = 5;
private static final int DEFAULT_READER_COUNT = 1;
private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup";
private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane";
private static final String DEFAULT_SAMPLE_NAME_PREFIX = "Sample";
private static final String DEFAULT_PLATFORM_PREFIX = "Platform";
private static final int DEFAULT_CHROMOSOME_COUNT = 1;
private static final int DEFAULT_CHROMOSOME_START_INDEX = 1;
private static final int DEFAULT_CHROMOSOME_SIZE = 1000;
private static final String DEFAULT_SAM_FILE_FORMAT = "readfile-%3d.bam";
private final int groupCount;
private SAMFileHeader header;
private SAMDataSource dataSource;
/**
* Constructs a new read-filter test providing the number of read
* groups in the file.
*
* @param groupCount number of read-group in the fictional SAM file,
* must be equal or greater than 1.
*/
protected ReadFilterTest(final int groupCount) {
if (groupCount < 1) {
throw new IllegalArgumentException(
"the read group count must at least be 1");
}
this.groupCount = groupCount;
}
/**
* Gets the data source.
*
* @throws IllegalStateException if the data source was not initialized
* invoking {@link #beforeClass()}
* @return never <code>null</code>
*/
protected final SAMDataSource getDataSource() {
checkDataSourceExists();
return dataSource;
}
/**
* Returns the mock-up SAM file header for testing.
*
* @throws IllegalStateException if the header was not initialized
* invoking {@link #beforeClass()}
* @return never <code>null</code>
*/
protected final SAMFileHeader getHeader() {
checkHeaderExists();
return header;
}
/**
* Construct a read filter test with the default number of groups
* ({@link #DEFAULT_READ_GROUP_COUNT}.
*/
public ReadFilterTest() {
this(DEFAULT_READ_GROUP_COUNT);
}
/**
* Return the number of read groups involved in the test
* @return <code>1</code> or greater.
*/
protected final int getReadGroupCount() {
return groupCount;
}
/**
* Composes the Id for the read group given its index.
*
* This methods must return a unique distinct ID for each possible index and
* it must be the same value each time it is invoked.
*
* @param index the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code> and must be unique to each possible
* read group index.
*/
protected String composeReadGroupId(final int index) {
checkReadGroupIndex(index);
return DEFAULT_READ_GROUP_PREFIX + index;
}
/**
* Composes the Platform name for the read group given its index.
*
* This method must always return the same value give an index.
*
* @param index the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code>.
*/
protected String composePlatformName(final int index) {
checkReadGroupIndex(index);
return DEFAULT_PLATFORM_PREFIX + (((index-1)%2)+1);
}
/**
* Composes the Platform unit name for the read group given its index.
*
* @param index the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code>.
*/
protected String composePlatformUnitName(final int index) {
checkReadGroupIndex(index);
return DEFAULT_PLATFORM_UNIT_PREFIX + (((index-1)%3)+1);
}
/**
* Checks the correctness of a given read group index.
*
* A correct index is any value in the range [1,{@link #getReadGroupCount()}].
*
* @param index the target index.
* @throws IllegalArgumentException if the input index is not correct.
*/
protected final void checkReadGroupIndex(final int index) {
checkIndex(index,groupCount,"read group");
}
private void checkIndex(final int index, final int max, CharSequence name) {
if (index < 1 || index > max) {
throw new IllegalArgumentException(
name + " index ("
+ index
+ ") is out of bounds [1," + max + "]");
}
}
/**
* Checks whether the header was initialized.
*
* @throws IllegalStateException if the header was not yet initialized.
*/
protected final void checkHeaderExists() {
if (header == null) {
throw new IllegalArgumentException(
"header has not been initialized;"
+ " beforeClass() was not invoked");
}
}
/**
* Checks whether the data source was initialized.
*
* @throws IllegalStateException if the data source was not yet initialized.
*/
protected final void checkDataSourceExists() {
if (header == null) {
throw new IllegalArgumentException(
"data source has not been initialized;"
+ " beforeClass() was not invoked");
}
}
/**
* Returns the ID for a read group given its index.
*
* @param index the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code> and must be unique to each
* possible read group index.
*/
protected final String getReadGroupId(final int index) {
checkReadGroupIndex(index);
return getHeader().getReadGroups().get(index - 1).getReadGroupId();
}
/**
* Returns the platform name for a read group given its index.
*
* @param group the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code>.
*/
protected final String getPlatformName(final int group) {
checkReadGroupIndex(group);
return getHeader().getReadGroups().get(group - 1).getPlatform();
}
/**
* Returns the platform unit for a read group given its index.
*
* @param group the index of the targeted read group in the range
* [1,{@link #getReadGroupCount()}]
* @return never <code>null</code>.
*/
protected final String getPlatformUnit(final int group) {
checkReadGroupIndex(group);
return getHeader().getReadGroups().get(group - 1).getPlatformUnit();
}
/**
* Composes the mock up SAM file header.
*
* It must return an equivalent (equal) value each time it is invoked.
*
* @return never <code>null</code>.
*/
protected SAMFileHeader composeHeader() {
return ArtificialSAMUtils.createArtificialSamHeader(
DEFAULT_CHROMOSOME_COUNT, DEFAULT_CHROMOSOME_START_INDEX,
DEFAULT_CHROMOSOME_SIZE);
}
@BeforeClass
public void beforeClass() {
header = composeHeader();
dataSource = composeDataSource();
final List<String> readGroupIDs = new ArrayList<String>();
final List<String> sampleNames = new ArrayList<String>();
for (int i = 1; i <= getReadGroupCount(); i++) {
final String readGroupId = composeReadGroupId(i);
readGroupIDs.add(readGroupId);
sampleNames.add(readGroupId);
}
ArtificialSAMUtils.createEnumeratedReadGroups(
header, readGroupIDs, sampleNames);
for (int i = 1; i <= getReadGroupCount(); i++) {
final String readGroupId = readGroupIDs.get(i-1);
final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroupId);
groupRecord.setAttribute("PL", composePlatformName(i));
groupRecord.setAttribute("PU", composePlatformUnitName(i));
}
}
protected ValidationExclusion composeValidationExclusion() {
return new ValidationExclusion();
}
protected SAMDataSource composeDataSource() {
checkHeaderExists();
final Set<SAMReaderID> readerIDs = new HashSet<>(1);
final ThreadAllocation ta = new ThreadAllocation();
final Integer numFileHandles = 1; // I believe that any value would do but need to confirm.
final boolean useOriginalBaseQualities = true;
final SAMFileReader.ValidationStringency strictness = SAMFileReader.ValidationStringency.LENIENT;
final Integer readBufferSize = 1; // not relevant.
final DownsamplingMethod downsamplingMethod = DownsamplingMethod.NONE;
final ValidationExclusion exclusionList = composeValidationExclusion();
final Collection<ReadFilter> supplementalFilters = Collections.EMPTY_SET;
final boolean includeReadsWithDeletionAtLoci = true;
final GenomeLocParser glp = new GenomeLocParser(header.getSequenceDictionary());
final SAMDataSource res = new SAMDataSource(
readerIDs,
ta,
numFileHandles,
glp,
useOriginalBaseQualities,
strictness,
readBufferSize,
downsamplingMethod,
exclusionList,
supplementalFilters,
includeReadsWithDeletionAtLoci);
return res;
}
@AfterClass
public void afterClass() {
header = null;
dataSource = null;
}
/**
* Creates a read record.
*
* @param cigar the new record CIGAR.
* @param group the new record group index that must be in the range \
* [1,{@link #getReadGroupCount()}]
* @param reference the reference sequence index (0-based)
* @param start the start position of the read alignment in the reference
* (1-based)
* @return never <code>null</code>
*/
protected SAMRecord createRead(final Cigar cigar, final int group, final int reference, final int start) {
final SAMRecord record = ArtificialSAMUtils.createArtificialRead(cigar);
record.setHeader(getHeader());
record.setAlignmentStart(start);
record.setReferenceIndex(reference);
record.setAttribute(SAMTag.RG.toString(), getReadGroupId(group));
return record;
}
}

View File

@ -26,13 +26,10 @@
package org.broadinstitute.sting.gatk.filters;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMReadGroupRecord;
@ -40,34 +37,7 @@ import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
public class ReadGroupBlackListFilterUnitTest extends BaseTest {
private static final int READ_GROUP_COUNT = 5;
private static final String READ_GROUP_PREFIX = "ReadGroup";
private static final String SAMPLE_NAME_PREFIX = "Sample";
private static final String PLATFORM_PREFIX = "Platform";
private static final String PLATFORM_UNIT_PREFIX = "Lane";
private static SAMFileHeader header;
@BeforeClass
public void beforeClass() {
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
List<String> readGroupIDs = new ArrayList<String>();
List<String> sampleNames = new ArrayList<String>();
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
readGroupIDs.add(READ_GROUP_PREFIX + i);
sampleNames.add(SAMPLE_NAME_PREFIX + i);
}
ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroupIDs, sampleNames);
for (int i = 1; i <= READ_GROUP_COUNT; i++) {
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + i);
groupRecord.setAttribute("PL", PLATFORM_PREFIX + (((i-1)%2)+1));
groupRecord.setAttribute("PU", PLATFORM_UNIT_PREFIX + (((i-1)%3)+1));
}
}
public class ReadGroupBlackListFilterUnitTest extends ReadFilterTest {
@Test(expectedExceptions=ReviewedStingException.class)
public void testBadFilter() {
@ -88,14 +58,14 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
@Test
public void testFilterReadGroup() {
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, 1, 20);
filteredRecord.setAttribute("RG", getReadGroupId(1));
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(getHeader(), "readDos", 0, 2, 20);
unfilteredRecord.setAttribute("RG", getReadGroupId(2));
List<String> filterList = new ArrayList<String>();
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
filterList.add("RG:" + getReadGroupId(1));
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
Assert.assertTrue(filter.filterOut(filteredRecord));
@ -104,14 +74,14 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
@Test
public void testFilterPlatformUnit() {
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, 1, 20);
filteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "1");
SAMRecord filteredRecord = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, 1, 20);
filteredRecord.setAttribute("RG", getReadGroupId(1));
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(header, "readDos", 0, 2, 20);
unfilteredRecord.setAttribute("RG", READ_GROUP_PREFIX + "2");
SAMRecord unfilteredRecord = ArtificialSAMUtils.createArtificialRead(getHeader(), "readDos", 0, 2, 20);
unfilteredRecord.setAttribute("RG", getReadGroupId(2));
List<String> filterList = new ArrayList<String>();
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
filterList.add("PU:" + getPlatformUnit(1));
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
Assert.assertTrue(filter.filterOut(filteredRecord));
@ -123,18 +93,18 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
int recordsPerGroup = 3;
List<SAMRecord> records = new ArrayList<SAMRecord>();
int alignmentStart = 0;
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
for (int x = 1; x <= getReadGroupCount(); x++) {
SAMReadGroupRecord groupRecord = getHeader().getReadGroup(getReadGroupId(x));
for (int y = 1; y <= recordsPerGroup; y++) {
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
SAMRecord record = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, ++alignmentStart, 20);
record.setAttribute("RG", groupRecord.getReadGroupId());
records.add(record);
}
}
List<String> filterList = new ArrayList<String>();
filterList.add("RG:" + READ_GROUP_PREFIX + "1");
filterList.add("RG:" + READ_GROUP_PREFIX + "3");
filterList.add("RG:" + getReadGroupId(1));
filterList.add("RG:" + getReadGroupId(3));
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
int filtered = 0;
@ -153,7 +123,7 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
}
int filteredExpected = recordsPerGroup * 2;
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
int unfilteredExpected = recordsPerGroup * (getReadGroupCount() - 2);
Assert.assertEquals(filtered, filteredExpected, "Filtered");
Assert.assertEquals(unfiltered, unfilteredExpected, "Uniltered");
}
@ -163,17 +133,17 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
int recordsPerGroup = 3;
List<SAMRecord> records = new ArrayList<SAMRecord>();
int alignmentStart = 0;
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
for (int x = 1; x <= getReadGroupCount(); x++) {
SAMReadGroupRecord groupRecord = getHeader().getReadGroup(getReadGroupId(x));
for (int y = 1; y <= recordsPerGroup; y++) {
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
SAMRecord record = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, ++alignmentStart, 20);
record.setAttribute("RG", groupRecord.getReadGroupId());
records.add(record);
}
}
List<String> filterList = new ArrayList<String>();
filterList.add("PU:" + PLATFORM_UNIT_PREFIX + "1");
filterList.add("PU:" + getPlatformUnit(1));
ReadGroupBlackListFilter filter = new ReadGroupBlackListFilter(filterList);
int filtered = 0;
@ -202,10 +172,10 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
int recordsPerGroup = 3;
List<SAMRecord> records = new ArrayList<SAMRecord>();
int alignmentStart = 0;
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
for (int x = 1; x <= getReadGroupCount(); x++) {
SAMReadGroupRecord groupRecord = getHeader().getReadGroup(getReadGroupId(x));
for (int y = 1; y <= recordsPerGroup; y++) {
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
SAMRecord record = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, ++alignmentStart, 20);
record.setAttribute("RG", groupRecord.getReadGroupId());
records.add(record);
}
@ -231,7 +201,7 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
}
int filteredExpected = recordsPerGroup * 2;
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
int unfilteredExpected = recordsPerGroup * (getReadGroupCount() - 2);
Assert.assertEquals(filtered, filteredExpected, "Filtered");
Assert.assertEquals(unfiltered, unfilteredExpected, "Uniltered");
}
@ -241,10 +211,10 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
int recordsPerGroup = 3;
List<SAMRecord> records = new ArrayList<SAMRecord>();
int alignmentStart = 0;
for (int x = 1; x <= READ_GROUP_COUNT; x++) {
SAMReadGroupRecord groupRecord = header.getReadGroup(READ_GROUP_PREFIX + x);
for (int x = 1; x <= getReadGroupCount(); x++) {
SAMReadGroupRecord groupRecord = getHeader().getReadGroup(getReadGroupId(x));
for (int y = 1; y <= recordsPerGroup; y++) {
SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "readUno", 0, ++alignmentStart, 20);
SAMRecord record = ArtificialSAMUtils.createArtificialRead(getHeader(), "readUno", 0, ++alignmentStart, 20);
record.setAttribute("RG", groupRecord.getReadGroupId());
records.add(record);
}
@ -270,7 +240,7 @@ public class ReadGroupBlackListFilterUnitTest extends BaseTest {
}
int filteredExpected = recordsPerGroup * 2;
int unfilteredExpected = recordsPerGroup * (READ_GROUP_COUNT - 2);
int unfilteredExpected = recordsPerGroup * (getReadGroupCount() - 2);
Assert.assertEquals(filtered, filteredExpected, "Filtered");
Assert.assertEquals(unfiltered, unfilteredExpected, "Uniltered");
}

View File

@ -0,0 +1,50 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.filters;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import java.util.Collections;
/**
* Tests for the {@link MalformedReadFilter} when the unsafe flag
* {@link ValidationExclusion.TYPE#ALL} is set.
*
* @author Valentin Ruano-Rubio
* @since 6/6/13
*/
public class UnsafeMalformedReadFilterUnitTest extends AllowNCigarMalformedReadFilterUnitTest {
@Override
protected ValidationExclusion composeValidationExclusion() {
return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL));
}
}