Added tests for changing IUPAC bases to Ns, for failing on bad ref bases, and for the HaplotypeCaller not failing when running over a region with an IUPAC base.
Out of curiosity, why does Picard's IndexedFastaSequenceFile allow one to query for start position 0? When doing so, that base is a line feed (-1 offset to the first base in the contig) which is an illegal base (and which caused me no end of trouble)...
This commit is contained in:
parent
445735a4a5
commit
d18dbcbac1
|
|
@ -50,6 +50,7 @@ import org.broadinstitute.sting.WalkerTest;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
|
|
@ -156,6 +157,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestDoesNotFailOnBadRefBase() {
|
||||
// don't care about the output - just want to make sure it doesn't fail
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.<String>emptyList());
|
||||
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
|
|
|
|||
|
|
@ -125,13 +125,13 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException {
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException {
|
||||
super(fasta);
|
||||
if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0");
|
||||
this.cacheSize = cacheSize;
|
||||
this.cacheMissBackup = Math.max(cacheSize / 1000, 1);
|
||||
this.preserveCase = preserveCase;
|
||||
preserveIUPAC = false;
|
||||
this.preserveIUPAC = preserveIUPAC;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -168,7 +168,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException {
|
||||
this(fasta, DEFAULT_CACHE_SIZE, preserveCase);
|
||||
this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -181,7 +181,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0
|
||||
*/
|
||||
public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException {
|
||||
this(fasta, cacheSize, false);
|
||||
this(fasta, cacheSize, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -261,7 +261,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
* all of the bases in the ReferenceSequence returned by this method will be upper cased.
|
||||
*/
|
||||
@Override
|
||||
public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) {
|
||||
public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) {
|
||||
final ReferenceSequence result;
|
||||
final Cache myCache = cache.get();
|
||||
|
||||
|
|
@ -269,7 +269,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
cacheMisses++;
|
||||
result = super.getSubsequenceAt(contig, start, stop);
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases());
|
||||
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true);
|
||||
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1);
|
||||
} else {
|
||||
// todo -- potential optimization is to check if contig.name == contig, as this in general will be true
|
||||
SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig);
|
||||
|
|
@ -285,7 +285,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile {
|
|||
|
||||
// convert all of the bases in the sequence to upper case if we aren't preserving cases
|
||||
if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases());
|
||||
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true);
|
||||
if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0);
|
||||
} else {
|
||||
cacheHits++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -166,9 +166,11 @@ public class BaseUtils {
|
|||
return base >= 'A' && base <= 'Z';
|
||||
}
|
||||
|
||||
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase) {
|
||||
public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) {
|
||||
final int length = bases.length;
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
final int start = ignoreConversionOfFirstByte ? 1 : 0;
|
||||
|
||||
for ( int i = start; i < length; i++ ) {
|
||||
final int baseIndex = baseIndexWithIupacMap[bases[i]];
|
||||
if ( baseIndex == Base.N.ordinal() ) {
|
||||
bases[i] = 'N';
|
||||
|
|
|
|||
|
|
@ -32,8 +32,10 @@ package org.broadinstitute.sting.utils.fasta;
|
|||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.log4j.Priority;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -49,7 +51,7 @@ import java.util.concurrent.ExecutorService;
|
|||
import java.util.concurrent.Executors;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
* Basic unit test for CachingIndexedFastaSequenceFile
|
||||
*/
|
||||
public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
||||
private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta");
|
||||
|
|
@ -80,7 +82,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
|
||||
public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
|
||||
|
||||
SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0);
|
||||
logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d",
|
||||
|
|
@ -122,7 +124,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
@Test(dataProvider = "fastas", enabled = true && ! DEBUG)
|
||||
public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException {
|
||||
final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta);
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
|
||||
|
||||
SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0);
|
||||
|
||||
|
|
@ -167,7 +169,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000)
|
||||
public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException {
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true);
|
||||
final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true, false);
|
||||
|
||||
logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt));
|
||||
for ( int iterations = 0; iterations < 1; iterations++ ) {
|
||||
|
|
@ -230,4 +232,33 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest {
|
|||
else
|
||||
return new String(reader.getSubsequenceAt(contig, start, stop).getBases());
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIupacChanges() throws FileNotFoundException, InterruptedException {
|
||||
final String testFasta = privateTestDir + "iupacFASTA.fasta";
|
||||
final CachingIndexedFastaSequenceFile iupacPreserving = new CachingIndexedFastaSequenceFile(new File(testFasta), CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE, false, true);
|
||||
final CachingIndexedFastaSequenceFile makeNs = new CachingIndexedFastaSequenceFile(new File(testFasta));
|
||||
|
||||
int preservingNs = 0;
|
||||
int changingNs = 0;
|
||||
for ( SAMSequenceRecord contig : iupacPreserving.getSequenceDictionary().getSequences() ) {
|
||||
final String sPreserving = fetchBaseString(iupacPreserving, contig.getSequenceName(), 0, 15000);
|
||||
preservingNs += StringUtils.countMatches(sPreserving, "N");
|
||||
|
||||
final String sChanging = fetchBaseString(makeNs, contig.getSequenceName(), 0, 15000);
|
||||
changingNs += StringUtils.countMatches(sChanging, "N");
|
||||
}
|
||||
|
||||
Assert.assertEquals(changingNs, preservingNs + 4);
|
||||
}
|
||||
|
||||
@Test(enabled = true, expectedExceptions = {UserException.class})
|
||||
public void testFailOnBadBase() throws FileNotFoundException, InterruptedException {
|
||||
final String testFasta = privateTestDir + "problematicFASTA.fasta";
|
||||
final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta));
|
||||
|
||||
for ( SAMSequenceRecord contig : fasta.getSequenceDictionary().getSequences() ) {
|
||||
fetchBaseString(fasta, contig.getSequenceName(), -1, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue