Added index iteration for ReferenceSequenceFile interface compatibility.
Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3fda8613c3
commit
26e84d7fd6
|
|
@ -4,11 +4,10 @@ import edu.mit.broad.picard.PicardException;
|
|||
import edu.mit.broad.picard.io.IoUtil;
|
||||
|
||||
import java.util.Scanner;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
|
|
@ -20,9 +19,10 @@ import java.io.FileNotFoundException;
|
|||
*
|
||||
* Reads a fasta index file (.fai).
|
||||
*/
|
||||
public class FastaSequenceIndex {
|
||||
private Map<String,FastaSequenceIndexEntry> sequenceEntries =
|
||||
new HashMap<String,FastaSequenceIndexEntry>();
|
||||
public class FastaSequenceIndex implements Iterable {
|
||||
// Use a linked hash map to preserve the ordering of the contigs.
|
||||
private Map<String,FastaSequenceIndexEntry> sequenceEntries =
|
||||
new LinkedHashMap<String,FastaSequenceIndexEntry>();
|
||||
|
||||
/**
|
||||
* Build a sequence index from the specified file.
|
||||
|
|
@ -91,6 +91,10 @@ public class FastaSequenceIndex {
|
|||
|
||||
return sequenceEntries.get(contigName);
|
||||
}
|
||||
|
||||
public Iterator<FastaSequenceIndexEntry> iterator() {
|
||||
return sequenceEntries.values().iterator();
|
||||
}
|
||||
}
|
||||
|
||||
class FastaSequenceIndexEntry {
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import java.nio.charset.CharsetDecoder;
|
|||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.util.Scanner;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
|
||||
|
|
@ -35,8 +36,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
private FileChannel channel;
|
||||
|
||||
private final FastaSequenceIndex index;
|
||||
|
||||
private String currentContigName = null;
|
||||
private final Iterator<FastaSequenceIndexEntry> indexIterator;
|
||||
|
||||
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
|
||||
this.file = file;
|
||||
|
|
@ -46,6 +46,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
|
||||
File indexFile = new File(file.getAbsolutePath() + ".fai");
|
||||
index = new FastaSequenceIndex(indexFile);
|
||||
indexIterator = index.iterator();
|
||||
}
|
||||
|
||||
public SAMSequenceDictionary getSequenceDictionary() {
|
||||
|
|
@ -59,10 +60,14 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {
|
||||
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
|
||||
|
||||
if(pos + length - 1 > indexEntry.getSize())
|
||||
throw new PicardException("Query asks for data past end of contig");
|
||||
|
||||
final int basesPerLine = indexEntry.getBasesPerLine();
|
||||
final int bytesPerLine = indexEntry.getBytesPerLine();
|
||||
|
||||
// Start reading at the closest start-of-line to our data.
|
||||
long readStart = indexEntry.getLocation() + (pos / basesPerLine);
|
||||
long readStart = indexEntry.getLocation() + (pos / basesPerLine) * bytesPerLine;
|
||||
int dataOfInterestStart = pos % basesPerLine;
|
||||
|
||||
byte[] accumulator = new byte[length];
|
||||
|
|
@ -131,9 +136,14 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
return basesRead;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the next sequence if available, or null if not present.
|
||||
* @return next sequence if available, or null if not present.
|
||||
*/
|
||||
public ReferenceSequence nextSequence() {
|
||||
return getSubsequenceAt("chrM", 0, 20);
|
||||
if( !indexIterator.hasNext() )
|
||||
return null;
|
||||
return getSequence( indexIterator.next().getContig() );
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.junit.Test;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -135,4 +136,57 @@ public class FastaSequenceIndexTest extends BaseTest {
|
|||
sequenceIndex.getIndexEntry("invalid");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIteration() {
|
||||
logger.warn("Executing testIteration");
|
||||
|
||||
Iterator<FastaSequenceIndexEntry> sequenceIndexEntries = sequenceIndex.iterator();
|
||||
|
||||
Assert.assertEquals("Contig chrM is not present", "chrM", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr1 is not present", "chr1", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr2 is not present", "chr2", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr3 is not present", "chr3", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr4 is not present", "chr4", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr5 is not present", "chr5", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr6 is not present", "chr6", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr7 is not present", "chr7", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr8 is not present", "chr8", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr9 is not present", "chr9", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr10 is not present", "chr10", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr11 is not present", "chr11", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr12 is not present", "chr12", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr13 is not present", "chr13", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr14 is not present", "chr14", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr15 is not present", "chr15", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr16 is not present", "chr16", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr17 is not present", "chr17", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr18 is not present", "chr18", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr19 is not present", "chr19", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr20 is not present", "chr20", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr21 is not present", "chr21", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr22 is not present", "chr22", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chrX is not present", "chrX", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chrY is not present", "chrY", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr1_random is not present", "chr1_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr2_random is not present", "chr2_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr3_random is not present", "chr3_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr4_random is not present", "chr4_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr5_random is not present", "chr5_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr6_random is not present", "chr6_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr7_random is not present", "chr7_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr8_random is not present", "chr8_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr9_random is not present", "chr9_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr10_random is not present", "chr10_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr11_random is not present", "chr11_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr13_random is not present", "chr13_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr15_random is not present", "chr15_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr16_random is not present", "chr16_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr17_random is not present", "chr17_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr18_random is not present", "chr18_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr19_random is not present", "chr19_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr21_random is not present", "chr21_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chr22_random is not present", "chr22_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertEquals("Contig chrX_random is not present", "chrX_random", sequenceIndexEntries.next().getContig());
|
||||
Assert.assertFalse("Iterator still has more entries", sequenceIndexEntries.hasNext());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
/**
|
||||
|
|
@ -52,12 +53,13 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
@Test
|
||||
public void testFirstSequence() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,firstBasesOfChrM.length());
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
firstBasesOfChrM,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
|
@ -66,46 +68,190 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
public void testFirstSequenceExtended() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length());
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
extendedBasesOfChrM.substring(0,110),
|
||||
StringUtil.bytesToString( sequence.getBases(),0,110 ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
extendedBasesOfChrM,
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadStartingInCenterOfLine() {
|
||||
public void testReadStartingInCenterOfFirstLine() {
|
||||
final int bytesToChopOff = 5;
|
||||
String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff ,truncated.length() );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
truncated,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testReadStartingInCenterOfLine runtime: %dms%n", (endTime - startTime)) ;
|
||||
System.err.printf("testReadStartingInCenterOfFirstLine runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompleteContigRead() {
|
||||
public void testReadStartingInCenterOfMiddleLine() {
|
||||
final int bytesToChopOff = 120;
|
||||
String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff, truncated.length() );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
truncated,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
|
||||
System.err.printf("testReadStartingInCenterOfMiddleLine runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrM");
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals("chrM is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases(),0,4096),
|
||||
StringUtil.bytesToString(sequence.getBases(),0,4096) );
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrM");
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals("chrM is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases()),
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testFirstCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test(expected= PicardException.class)
|
||||
public void testReadThroughEndOfContig() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
try {
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16500,100);
|
||||
}
|
||||
finally {
|
||||
long endTime = System.currentTimeMillis();
|
||||
System.err.printf("testReadThroughEndOfContig runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected= PicardException.class)
|
||||
public void testReadPastEndOfContig() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
try {
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16800,100);
|
||||
}
|
||||
finally {
|
||||
long endTime = System.currentTimeMillis();
|
||||
System.err.printf("testReadPastEndOfContig runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMiddleCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
originalSequenceFile.seekToContig("chrY");
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrY");
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrY");
|
||||
Assert.assertEquals("chrY is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases()),
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testMiddleCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLastCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
originalSequenceFile.seekToContig("chrX_random");
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrX_random");
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrX_random");
|
||||
Assert.assertEquals("chrX_random is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases()),
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testLastCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testFirstOfChr1() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",0,firstBasesOfChr1.length());
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1");
|
||||
Assert.assertEquals( "First n bases of chr1 are incorrect",
|
||||
firstBasesOfChr1,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
|
||||
System.err.printf("testFirstOfChr1 runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstOfChr8() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",0,firstBasesOfChr8.length());
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8");
|
||||
Assert.assertEquals( "First n bases of chr8 are incorrect",
|
||||
firstBasesOfChr8,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
|
||||
System.err.printf("testFirstOfChr8 runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstElementOfIterator() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals("chrM is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases()),
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testFirstElementOfIterator runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNextElementOfIterator() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
// Skip past the first one and load the second one.
|
||||
originalSequenceFile.nextSequence();
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
sequenceFile.nextSequence();
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1");
|
||||
Assert.assertEquals("chr1 is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases()),
|
||||
StringUtil.bytesToString(sequence.getBases()) );
|
||||
|
||||
System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue