Added tests for changing IUPAC bases to Ns, for failing on bad ref bases, and for the HaplotypeCaller not failing when running over a region with an IUPAC base.

Out of curiosity, why does Picard's IndexedFastaSequenceFile allow one to query for start position 0?  When doing so, that base is a line feed (-1 offset to the first base in the contig) which is an illegal base (and which caused me no end of trouble)...
This commit is contained in:
Eric Banks 2013-01-16 14:55:33 -05:00
parent 445735a4a5
commit d18dbcbac1
4 changed files with 55 additions and 13 deletions

View File

@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.Collections;
public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String REF = b37KGReference;
@ -156,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
executeTest("HCTestStructuralIndels: ", spec);
}
@Test
public void HCTestDoesNotFailOnBadRefBase() {
// don't care about the output - just want to make sure it doesn't fail
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.<String>emptyList());
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads

View File

@ -125,13 +125,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException {
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException {
super(fasta);
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
this.cacheSize = cacheSize;
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
this.preserveCase = preserveCase;
preserveIUPAC = false;
this.preserveIUPAC = preserveIUPAC;
}
/**
@ -168,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
*/
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
this(fasta, DEFAULT_CACHE_SIZE, preserveCase);
this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false);
}
/**
@ -181,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
*/
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
this(fasta, cacheSize, false);
this(fasta, cacheSize, false, false);
}
/**
@ -261,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
*/
@Override
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) {
public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) {
final ReferenceSequence result;
final Cache myCache = cache.get();
@ -269,7 +269,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
cacheMisses++;
result = super.getSubsequenceAt(contig, start, stop);
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true);
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
} else {
// todo -- potential optimization is to check if contig.name == contig, as this in general will be true
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
@ -285,7 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
// convert all of the bases in the sequence to upper case if we aren't preserving cases
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true);
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0);
} else {
cacheHits++;
}

View File

@ -166,9 +166,11 @@ public class BaseUtils {
return base >= 'A' && base <= 'Z';
}
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) {
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) {
final int length = bases.length;
for ( int i = 0; i < length; i++ ) {
final int start = ignoreConversionOfFirstByte ? 1 : 0;
for ( int i = start; i < length; i++ ) {
final int baseIndex = baseIndexWithIupacMap[bases[i]];
if ( baseIndex == Base.N.ordinal() ) {
bases[i] = 'N';

View File

@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Priority;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* Basic unit test for GenomeLoc
* Basic unit test for CachingIndexedFastaSequenceFile
*/
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
@Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
for ( int iterations = 0; iterations < 1; iterations++ ) {
@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
else
return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
}
@Test(enabled = true)
public void testIupacChanges() throws FileNotFoundException, InterruptedException {
final String testFasta = privateTestDir + "iupacFASTA.fasta";
final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta));
int preservingNs = 0;
int changingNs = 0;
for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) {
final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000);
preservingNs += StringUtils.countMatches(sPreserving, "N");
final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000);
changingNs += StringUtils.countMatches(sChanging, "N");
}
Assert.assertEquals(changingNs, preservingNs + 4);
}
@Test(enabled = true, expectedExceptions = {UserException.class})
public void testFailOnBadBase() throws FileNotFoundException, InterruptedException {
final String testFasta = privateTestDir + "problematicFASTA.fasta";
final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta));
for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) {
fetchBaseString(fasta, contig.getSequenceName(), -1, -1);
}
}
}