Added tests for changing IUPAC bases to Ns, for failing on bad ref bases, and for the HaplotypeCaller not failing when running over a region with an IUPAC base.

Out of curiosity, why does Picard's IndexedFastaSequenceFile allow one to query for start position 0? When doing so, the first byte returned is a line feed (the byte at offset -1 relative to the first base in the contig), which is an illegal base (and which caused me no end of trouble)...
This commit is contained in:
Eric Banks 2013-01-16 14:55:33 -05:00
parent 445735a4a5
commit d18dbcbac1
4 changed files with 55 additions and 13 deletions

View File

@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
public class HaplotypeCallerIntegrationTest extends WalkerTest { public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String REF = b37KGReference; final static String REF = b37KGReference;
@ -156,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
executeTest("HCTestStructuralIndels: ", spec); executeTest("HCTestStructuralIndels: ", spec);
} }
/**
 * Smoke test: runs the HaplotypeCaller over 3:60830000-60840000 using the
 * NA12878.readsOverBadBase.chr3.bam input and only requires that the run completes.
 * The output is sent to /dev/null and the expected-result list is empty, so
 * nothing about the produced calls is compared.
 */
@Test
public void HCTestDoesNotFailOnBadRefBase() {
    final String readsOverBadBase = privateTestDir + "NA12878.readsOverBadBase.chr3.bam";

    // Engine arguments first, then the fixed walker options for this interval.
    final String engineArgs = String.format("-T HaplotypeCaller -R %s -I %s", REF, readsOverBadBase);
    final String commandLine = engineArgs + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";

    // An empty list here means there is no output to validate - the test passes as long as nothing throws.
    final WalkerTestSpec smokeSpec = new WalkerTestSpec(commandLine, Collections.<String>emptyList());
    executeTest("HCTestDoesNotFailOnBadRefBase: ", smokeSpec);
}
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// testing reduced reads // testing reduced reads

View File

@ -125,13 +125,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/ */
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException {
super(fasta); super(fasta);
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
this.cacheSize = cacheSize; this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 1000, 1); this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
this.preserveCase = preserveCase; this.preserveCase = preserveCase;
preserveIUPAC = false; this.preserveIUPAC = preserveIUPAC;
} }
/** /**
@ -168,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/ */
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
this(fasta, DEFAULT_CACHE_SIZE, preserveCase); this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false);
} }
/** /**
@ -181,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
*/ */
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
this(fasta, cacheSize, false); this(fasta, cacheSize, false, false);
} }
/** /**
@ -261,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* all of the bases in the ReferenceSequence returned by this method will be upper cased. * all of the bases in the ReferenceSequence returned by this method will be upper cased.
*/ */
@Override @Override
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) {
final ReferenceSequence result; final ReferenceSequence result;
final Cache myCache = cache.get(); final Cache myCache = cache.get();
@ -269,7 +269,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
cacheMisses++; cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop); result = super.getSubsequenceAt(contig, start, stop);
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true); if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
} else { } else {
// todo -- potential optimization is to check if contig.name == contig, as this in general will be true // todo -- potential optimization is to check if contig.name == contig, as this in general will be true
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
@ -285,7 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
// convert all of the bases in the sequence to upper case if we aren't preserving cases // convert all of the bases in the sequence to upper case if we aren't preserving cases
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true); if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0);
} else { } else {
cacheHits++; cacheHits++;
} }

View File

@ -166,9 +166,11 @@ public class BaseUtils {
return base >= 'A' && base <= 'Z'; return base >= 'A' && base <= 'Z';
} }
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) { public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) {
final int length = bases.length; final int length = bases.length;
for ( int i = 0; i < length; i++ ) { final int start = ignoreConversionOfFirstByte ? 1 : 0;
for ( int i = start; i < length; i++ ) {
final int baseIndex = baseIndexWithIupacMap[bases[i]]; final int baseIndex = baseIndexWithIupacMap[bases[i]];
if ( baseIndex == Base.N.ordinal() ) { if ( baseIndex == Base.N.ordinal() ) {
bases[i] = 'N'; bases[i] = 'N';

View File

@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMSequenceRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Priority; import org.apache.log4j.Priority;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
/** /**
* Basic unit test for GenomeLoc * Basic unit test for CachingIndexedFastaSequenceFile
*/ */
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "fastas", enabled = true && ! DEBUG) @Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "fastas", enabled = true && ! DEBUG) @Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000) @Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
for ( int iterations = 0; iterations < 1; iterations++ ) { for ( int iterations = 0; iterations < 1; iterations++ ) {
@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
else else
return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
} }
/**
 * Reads every contig of iupacFASTA.fasta through two readers - one constructed with
 * preserveIUPAC = true and one constructed with the single-argument constructor -
 * and counts the 'N' characters each reader returns for positions 0 through 15000.
 * The test passes when the second reader reports exactly four more Ns in total.
 *
 * NOTE(review): the four extra Ns presumably correspond to four IUPAC ambiguity
 * codes in the fixture file - confirm against iupacFASTA.fasta.
 */
@Test(enabled = true)
public void testIupacChanges() throws FileNotFoundException, InterruptedException {
    final File iupacFasta = new File(privateTestDir + "iupacFASTA.fasta");
    final CachingIndexedFastaSequenceFile keepsIupac =
            new CachingIndexedFastaSequenceFile(iupacFasta, CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
    final CachingIndexedFastaSequenceFile convertsIupac = new CachingIndexedFastaSequenceFile(iupacFasta);

    int nCountWhenPreserved = 0;
    int nCountWhenConverted = 0;

    for ( final SAMSequenceRecord contig : keepsIupac.getSequenceDictionary().getSequences() ) {
        final String contigName = contig.getSequenceName();

        // Same query window against both readers; only the N tally differs.
        final String preservedBases = fetchBaseString(keepsIupac, contigName, 0, 15000);
        nCountWhenPreserved += StringUtils.countMatches(preservedBases, "N");

        final String convertedBases = fetchBaseString(convertsIupac, contigName, 0, 15000);
        nCountWhenConverted += StringUtils.countMatches(convertedBases, "N");
    }

    Assert.assertEquals(nCountWhenConverted, nCountWhenPreserved + 4);
}
/**
 * Fetches the bases of every contig in problematicFASTA.fasta through a
 * CachingIndexedFastaSequenceFile and expects a UserException to be thrown
 * somewhere along the way.
 *
 * NOTE(review): the -1/-1 coordinates presumably ask fetchBaseString for the
 * whole contig - confirm against that helper.
 */
@Test(enabled = true, expectedExceptions = {UserException.class})
public void testFailOnBadBase() throws FileNotFoundException, InterruptedException {
    final File problematicFasta = new File(privateTestDir + "problematicFASTA.fasta");
    final CachingIndexedFastaSequenceFile reader = new CachingIndexedFastaSequenceFile(problematicFasta);

    for ( final SAMSequenceRecord contig : reader.getSequenceDictionary().getSequences() ) {
        final String contigName = contig.getSequenceName();
        // The returned string is irrelevant; the fetch itself is expected to throw.
        fetchBaseString(reader, contigName, -1, -1);
    }
}
} }