From 26e84d7fd69930a6f8580c2ed12262ce918a1b86 Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 15 Apr 2009 17:17:11 +0000 Subject: [PATCH] Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/fasta/FastaSequenceIndex.java | 16 +- .../utils/fasta/IndexedFastaSequenceFile.java | 20 +- .../utils/fasta/FastaSequenceIndexTest.java | 54 ++++++ .../fasta/IndexedFastaSequenceFileTest.java | 176 ++++++++++++++++-- 4 files changed, 240 insertions(+), 26 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java index 324ed6a37..7c140097c 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java @@ -4,11 +4,10 @@ import edu.mit.broad.picard.PicardException; import edu.mit.broad.picard.io.IoUtil; import java.util.Scanner; -import java.util.HashMap; import java.util.Map; +import java.util.LinkedHashMap; +import java.util.Iterator; import java.util.regex.MatchResult; -import java.util.regex.Pattern; -import java.util.regex.Matcher; import java.io.File; import java.io.FileNotFoundException; @@ -20,9 +19,10 @@ import java.io.FileNotFoundException; * * Reads a fasta index file (.fai). */ -public class FastaSequenceIndex { - private Map sequenceEntries = - new HashMap(); +public class FastaSequenceIndex implements Iterable { + // Use a linked hash map to preserve the ordering of the contigs. + private Map sequenceEntries = + new LinkedHashMap(); /** * Build a sequence index from the specified file. @@ -91,6 +91,10 @@ public class FastaSequenceIndex { return sequenceEntries.get(contigName); } + + public Iterator iterator() { + return sequenceEntries.values().iterator(); + } } class FastaSequenceIndexEntry { diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java index 291060fd5..0e983fa30 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -14,6 +14,7 @@ import java.nio.charset.CharsetDecoder; import java.nio.charset.Charset; import java.nio.charset.CharacterCodingException; import java.util.Scanner; +import java.util.Iterator; import net.sf.samtools.SAMSequenceDictionary; @@ -35,8 +36,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { private FileChannel channel; private final FastaSequenceIndex index; - - private String currentContigName = null; + private final Iterator indexIterator; public IndexedFastaSequenceFile(File file) throws FileNotFoundException { this.file = file; @@ -46,6 +46,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { File indexFile = new File(file.getAbsolutePath() + ".fai"); index = new FastaSequenceIndex(indexFile); + indexIterator = index.iterator(); } public SAMSequenceDictionary getSequenceDictionary() { @@ -59,10 +60,14 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) { FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig); + if(pos + length - 1 > indexEntry.getSize()) + throw new PicardException("Query asks for data past end of contig"); + final int basesPerLine = indexEntry.getBasesPerLine(); + final int bytesPerLine = indexEntry.getBytesPerLine(); // Start reading at the closest start-of-line to our data. - long readStart = indexEntry.getLocation() + (pos / basesPerLine); + long readStart = indexEntry.getLocation() + (pos / basesPerLine) * bytesPerLine; int dataOfInterestStart = pos % basesPerLine; byte[] accumulator = new byte[length]; @@ -131,9 +136,14 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { return basesRead; } - + /** + * Gets the next sequence if available, or null if not present. + * @return next sequence if available, or null if not present. + */ public ReferenceSequence nextSequence() { - return getSubsequenceAt("chrM", 0, 20); + if( !indexIterator.hasNext() ) + return null; + return getSequence( indexIterator.next().getContig() ); } public String toString() { diff --git a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java index 2556e07d5..c3853424e 100755 --- a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java +++ b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexTest.java @@ -9,6 +9,7 @@ import org.junit.Test; import java.io.File; import java.io.FileNotFoundException; +import java.util.Iterator; /** * Created by IntelliJ IDEA. @@ -135,4 +136,57 @@ public class FastaSequenceIndexTest extends BaseTest { sequenceIndex.getIndexEntry("invalid"); } + @Test + public void testIteration() { + logger.warn("Executing testIteration"); + + Iterator sequenceIndexEntries = sequenceIndex.iterator(); + + Assert.assertEquals("Contig chrM is not present", "chrM", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr1 is not present", "chr1", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr2 is not present", "chr2", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr3 is not present", "chr3", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr4 is not present", "chr4", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr5 is not present", "chr5", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr6 is not present", "chr6", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr7 is not present", "chr7", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr8 is not present", "chr8", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr9 is not present", "chr9", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr10 is not present", "chr10", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr11 is not present", "chr11", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr12 is not present", "chr12", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr13 is not present", "chr13", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr14 is not present", "chr14", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr15 is not present", "chr15", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr16 is not present", "chr16", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr17 is not present", "chr17", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr18 is not present", "chr18", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr19 is not present", "chr19", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr20 is not present", "chr20", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr21 is not present", "chr21", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr22 is not present", "chr22", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chrX is not present", "chrX", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chrY is not present", "chrY", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr1_random is not present", "chr1_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr2_random is not present", "chr2_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr3_random is not present", "chr3_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr4_random is not present", "chr4_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr5_random is not present", "chr5_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr6_random is not present", "chr6_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr7_random is not present", "chr7_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr8_random is not present", "chr8_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr9_random is not present", "chr9_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr10_random is not present", "chr10_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr11_random is not present", "chr11_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr13_random is not present", "chr13_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr15_random is not present", "chr15_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr16_random is not present", "chr16_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr17_random is not present", "chr17_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr18_random is not present", "chr18_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr19_random is not present", "chr19_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr21_random is not present", "chr21_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chr22_random is not present", "chr22_random", sequenceIndexEntries.next().getContig()); + Assert.assertEquals("Contig chrX_random is not present", "chrX_random", sequenceIndexEntries.next().getContig()); + Assert.assertFalse("Iterator still has more entries", sequenceIndexEntries.hasNext()); + } } diff --git a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java index 3d80628d7..8a0719402 100755 --- a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java +++ b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java @@ -10,6 +10,7 @@ import java.io.File; import java.io.FileNotFoundException; import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.PicardException; import net.sf.samtools.util.StringUtil; /** @@ -52,12 +53,13 @@ public class IndexedFastaSequenceFileTest extends BaseTest { @Test public void testFirstSequence() { long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.nextSequence(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,firstBasesOfChrM.length()); + long endTime = System.currentTimeMillis(); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); Assert.assertEquals( "First n bases of chrM are incorrect", firstBasesOfChrM, StringUtil.bytesToString( sequence.getBases() ) ); - long endTime = System.currentTimeMillis(); System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ; } @@ -66,46 +68,190 @@ public class IndexedFastaSequenceFileTest extends BaseTest { public void testFirstSequenceExtended() { long startTime = System.currentTimeMillis(); ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length()); + long endTime = System.currentTimeMillis(); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); Assert.assertEquals( "First n bases of chrM are incorrect", - extendedBasesOfChrM.substring(0,110), - StringUtil.bytesToString( sequence.getBases(),0,110 ) ); - long endTime = System.currentTimeMillis(); + extendedBasesOfChrM, + StringUtil.bytesToString(sequence.getBases()) ); System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ; } @Test - public void testReadStartingInCenterOfLine() { + public void testReadStartingInCenterOfFirstLine() { final int bytesToChopOff = 5; String truncated = extendedBasesOfChrM.substring(bytesToChopOff); long startTime = System.currentTimeMillis(); ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff ,truncated.length() ); + long endTime = System.currentTimeMillis(); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); Assert.assertEquals( "First n bases of chrM are incorrect", truncated, StringUtil.bytesToString( sequence.getBases() ) ); - long endTime = System.currentTimeMillis(); - System.err.printf("testReadStartingInCenterOfLine runtime: %dms%n", (endTime - startTime)) ; + System.err.printf("testReadStartingInCenterOfFirstLine runtime: %dms%n", (endTime - startTime)) ; } @Test - public void testCompleteContigRead() { + public void testReadStartingInCenterOfMiddleLine() { + final int bytesToChopOff = 120; + String truncated = extendedBasesOfChrM.substring(bytesToChopOff); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff, truncated.length() ); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals( "First n bases of chrM are incorrect", + truncated, + StringUtil.bytesToString( sequence.getBases() ) ); + + System.err.printf("testReadStartingInCenterOfMiddleLine runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testFirstCompleteContigRead() { FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); long startTime = System.currentTimeMillis(); - ReferenceSequence sequence = sequenceFile.getSequence("chrM"); - Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); - Assert.assertEquals("chrM is incorrect", - StringUtil.bytesToString(expectedSequence.getBases(),0,4096), - StringUtil.bytesToString(sequence.getBases(),0,4096) ); + ReferenceSequence sequence = sequenceFile.getSequence("chrM"); long endTime = System.currentTimeMillis(); - System.err.printf("testCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals("chrM is incorrect", + StringUtil.bytesToString(expectedSequence.getBases()), + StringUtil.bytesToString(sequence.getBases()) ); + + System.err.printf("testFirstCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; + } + + @Test(expected= PicardException.class) + public void testReadThroughEndOfContig() { + long startTime = System.currentTimeMillis(); + try { + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16500,100); + } + finally { + long endTime = System.currentTimeMillis(); + System.err.printf("testReadThroughEndOfContig runtime: %dms%n", (endTime - startTime)) ; + } + } + + @Test(expected= PicardException.class) + public void testReadPastEndOfContig() { + long startTime = System.currentTimeMillis(); + try { + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16800,100); + } + finally { + long endTime = System.currentTimeMillis(); + System.err.printf("testReadPastEndOfContig runtime: %dms%n", (endTime - startTime)) ; + } + } + + @Test + public void testMiddleCompleteContigRead() { + FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); + originalSequenceFile.seekToContig("chrY"); + ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSequence("chrY"); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrY"); + Assert.assertEquals("chrY is incorrect", + StringUtil.bytesToString(expectedSequence.getBases()), + StringUtil.bytesToString(sequence.getBases()) ); + + System.err.printf("testMiddleCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testLastCompleteContigRead() { + FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); + originalSequenceFile.seekToContig("chrX_random"); + ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSequence("chrX_random"); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrX_random"); + Assert.assertEquals("chrX_random is incorrect", + StringUtil.bytesToString(expectedSequence.getBases()), + StringUtil.bytesToString(sequence.getBases()) ); + + System.err.printf("testLastCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; } + @Test + public void testFirstOfChr1() { + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr1",0,firstBasesOfChr1.length()); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1"); + Assert.assertEquals( "First n bases of chr1 are incorrect", + firstBasesOfChr1, + StringUtil.bytesToString( sequence.getBases() ) ); + + System.err.printf("testFirstOfChr1 runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testFirstOfChr8() { + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr8",0,firstBasesOfChr8.length()); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8"); + Assert.assertEquals( "First n bases of chr8 are incorrect", + firstBasesOfChr8, + StringUtil.bytesToString( sequence.getBases() ) ); + + System.err.printf("testFirstOfChr8 runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testFirstElementOfIterator() { + FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); + ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.nextSequence(); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals("chrM is incorrect", + StringUtil.bytesToString(expectedSequence.getBases()), + StringUtil.bytesToString(sequence.getBases()) ); + + System.err.printf("testFirstElementOfIterator runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testNextElementOfIterator() { + FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); + // Skip past the first one and load the second one. + originalSequenceFile.nextSequence(); + ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); + + long startTime = System.currentTimeMillis(); + sequenceFile.nextSequence(); + ReferenceSequence sequence = sequenceFile.nextSequence(); + long endTime = System.currentTimeMillis(); + + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1"); + Assert.assertEquals("chr1 is incorrect", + StringUtil.bytesToString(expectedSequence.getBases()), + StringUtil.bytesToString(sequence.getBases()) ); + + System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ; + } }