diff --git a/java/src/org/broadinstitute/sting/utils/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/FastaSequenceIndex.java
new file mode 100755
index 000000000..a6221b3f8
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/utils/FastaSequenceIndex.java
@@ -0,0 +1,177 @@
+package org.broadinstitute.sting.utils;
+
+import edu.mit.broad.picard.PicardException;
+
+import java.util.Scanner;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.MatchResult;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.io.File;
+import java.io.FileNotFoundException;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: hanna
+ * Date: Apr 14, 2009
+ * Time: 10:02:10 AM
+ *
+ * Reads a fasta index file (.fai).
+ */
+public class FastaSequenceIndex {
+    /**
+     * Shape of one index line: contig name, size, location, bases per line and
+     * bytes per line, separated by whitespace.  The line is matched from its
+     * start and the name may hold any non-whitespace characters, so names such
+     * as "NC_000001.10" are kept whole.  Anything after the fifth column is
+     * ignored.
+     */
+    private static final Pattern INDEX_LINE =
+            Pattern.compile("\\s*(\\S+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)(?:\\s.*)?");
+
+    /** Index entries keyed by contig name. */
+    private final Map<String,FastaSequenceIndexEntry> sequenceEntries =
+            new HashMap<String,FastaSequenceIndexEntry>();
+
+    /**
+     * Build a sequence index from the specified file.
+     * @param indexFile File to open.
+     * @throws FileNotFoundException if the index file cannot be opened.
+     * @throws PicardException if file is of invalid format.
+     */
+    public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
+        Scanner scanner = new Scanner(indexFile);
+        try {
+            // hasNext() looks for another token, so trailing blank lines end the loop.
+            while( scanner.hasNext() ) {
+                // Take the whole line; the last one need not end in a line separator.
+                String line = scanner.nextLine();
+
+                // Tokenize and validate the index line.
+                Matcher tokens = INDEX_LINE.matcher(line);
+                if( !tokens.matches() )
+                    throw new PicardException("Found invalid line in index file:" + line);
+
+                // Parse the index line.
+                String contig = tokens.group(1);
+                long size = Long.parseLong(tokens.group(2));
+                long location = Long.parseLong(tokens.group(3));
+                int basesPerLine = Integer.parseInt(tokens.group(4));
+                int bytesPerLine = Integer.parseInt(tokens.group(5));
+
+                // Build sequence structure
+                sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) );
+            }
+        }
+        finally {
+            // Release the file handle whether or not parsing succeeded.
+            scanner.close();
+        }
+    }
+
+    /**
+     * Does the given contig name have a corresponding entry?
+     * @param contigName The contig name for which to search.
+     * @return True if contig name is present; false otherwise.
+     */
+    public boolean hasIndexEntry( String contigName ) {
+        return sequenceEntries.containsKey(contigName);
+    }
+
+    /**
+     * Retrieve the index entry associated with the given contig.
+     * @param contigName Name of the contig for which to search.
+     * @return Index entry associated with the given contig.
+     * @throws PicardException if the associated index entry can't be found.
+     */
+    public FastaSequenceIndexEntry getIndexEntry( String contigName ) {
+        if( !hasIndexEntry(contigName) )
+            throw new PicardException("Unable to find entry for contig: " + contigName);
+
+        return sequenceEntries.get(contigName);
+    }
+}
+
+/**
+ * One line of a fasta index: where a contig sits in the fasta and how its
+ * lines are laid out.
+ */
+class FastaSequenceIndexEntry {
+    private final String contig;
+    private final long location;
+    private final long size;
+    private final int basesPerLine;
+    private final int bytesPerLine;
+
+    /**
+     * Create an entry from the fields of one index line.
+     * @param contig Name of the contig.
+     * @param location Seek position of the contig within the fasta.
+     * @param size Size of the contig bases in bytes.
+     * @param basesPerLine Number of bases in a fasta line.
+     * @param bytesPerLine Number of bytes (bases + whitespace) in a fasta line.
+     */
+    public FastaSequenceIndexEntry( String contig,
+                                    long location,
+                                    long size,
+                                    int basesPerLine,
+                                    int bytesPerLine ) {
+        this.contig = contig;
+        this.location = location;
+        this.size = size;
+        this.basesPerLine = basesPerLine;
+        this.bytesPerLine = bytesPerLine;
+    }
+
+    /**
+     * Gets the contig associated with this entry.
+     * @return String representation of the contig.
+     */
+    public String getContig() {
+        return contig;
+    }
+
+    /**
+     * Gets the location of this contig within the fasta.
+     * @return seek position within the fasta.
+     */
+    public long getLocation() {
+        return location;
+    }
+
+    /**
+     * Gets the size, in bytes, of the data in the contig.
+     * @return size of the contig bases in bytes.
+     */
+    public long getSize() {
+        return size;
+    }
+
+    /**
+     * Gets the number of bases in a given line.
+     * @return Number of bases in the fasta line.
+     */
+    public int getBasesPerLine() {
+        return basesPerLine;
+    }
+
+    /**
+     * How many bytes (bases + whitespace) are consumed by the
+     * given line?
+     * @return Number of bytes in a line.
+     */
+    public int getBytesPerLine() {
+        return bytesPerLine;
+    }
+
+    /**
+     * Lists every field of the entry by name.
+     * @return Human-readable summary of the entry.
+     */
+    @Override
+    public String toString() {
+        return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d",
+                             contig, location, size, basesPerLine, bytesPerLine);
+    }
+}
\ No newline at end of file
diff --git a/java/test/org/broadinstitute/sting/utils/FastaSequenceIndexTest.java b/java/test/org/broadinstitute/sting/utils/FastaSequenceIndexTest.java
new file mode 100755
index 000000000..90375641b
--- /dev/null
+++ b/java/test/org/broadinstitute/sting/utils/FastaSequenceIndexTest.java
@@ -0,0 +1,97 @@
+package org.broadinstitute.sting.utils;
+
+import org.junit.BeforeClass;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.Assert;
+import org.apache.log4j.BasicConfigurator;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+import edu.mit.broad.picard.PicardException;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: hanna
+ * Date: Apr 14, 2009
+ * Time: 10:34:15 AM
+ *
+ * Checks FastaSequenceIndex against the Homo_sapiens_assembly18 fasta index.
+ */
+public class FastaSequenceIndexTest {
+    private final String sequenceIndexName = "/Volumes/ifs/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai";
+    private FastaSequenceIndex sequenceIndex = null;
+
+    @BeforeClass
+    public static void initialize() {
+        BasicConfigurator.configure();
+    }
+
+    @Before
+    public void doForEachTest() throws FileNotFoundException {
+        sequenceIndex = new FastaSequenceIndex( new File(sequenceIndexName) );
+    }
+
+    @Test
+    public void testInitialContig() {
+        Assert.assertTrue("Contig chrM is not present", sequenceIndex.hasIndexEntry("chrM"));
+        FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrM");
+        // JUnit takes the expected value ahead of the actual one.
+        Assert.assertEquals("Contig chrM name is incorrect","chrM",entry.getContig());
+        Assert.assertEquals("Contig chrM location is incorrect",6L,entry.getLocation());
+        Assert.assertEquals("Contig chrM size is incorrect",16571L,entry.getSize());
+        Assert.assertEquals("Contig chrM bases per line is incorrect",50,entry.getBasesPerLine());
+        Assert.assertEquals("Contig chrM bytes per line is incorrect",51,entry.getBytesPerLine());
+    }
+
+    @Test
+    public void testMiddleContig() {
+        Assert.assertTrue("Contig chr8 is not present", sequenceIndex.hasIndexEntry("chr8"));
+        FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chr8");
+        Assert.assertEquals("Contig chr8 name is incorrect","chr8",entry.getContig());
+        Assert.assertEquals("Contig chr8 location is incorrect",1419403101L,entry.getLocation());
+        Assert.assertEquals("Contig chr8 size is incorrect",146274826L,entry.getSize());
+        Assert.assertEquals("Contig chr8 bases per line is incorrect",50,entry.getBasesPerLine());
+        Assert.assertEquals("Contig chr8 bytes per line is incorrect",51,entry.getBytesPerLine());
+    }
+
+    @Test
+    public void testLastContig() {
+        Assert.assertTrue("Contig chrX_random is not present", sequenceIndex.hasIndexEntry("chrX_random"));
+        FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrX_random");
+        Assert.assertEquals("Contig chrX_random name is incorrect","chrX_random",entry.getContig());
+        Assert.assertEquals("Contig chrX_random location is incorrect",3156698441L,entry.getLocation());
+        Assert.assertEquals("Contig chrX_random size is incorrect",1719168L,entry.getSize());
+        Assert.assertEquals("Contig chrX_random bases per line is incorrect",50,entry.getBasesPerLine());
+        Assert.assertEquals("Contig chrX_random bytes per line is incorrect",51,entry.getBytesPerLine());
+    }
+
+    @Test
+    public void testAllContigsPresent() {
+        // Contigs this test expects to find in the index.
+        String[] contigs = {
+            "chrM",
+            "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
+            "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
+            "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY",
+            "chr1_random", "chr2_random", "chr3_random", "chr4_random", "chr5_random",
+            "chr6_random", "chr7_random", "chr8_random", "chr9_random", "chr10_random",
+            "chr11_random", "chr13_random", "chr15_random", "chr16_random", "chr17_random",
+            "chr18_random", "chr19_random", "chr21_random", "chr22_random", "chrX_random"
+        };
+        for( String contig : contigs )
+            Assert.assertTrue("Contig " + contig + " is not present", sequenceIndex.hasIndexEntry(contig));
+    }
+
+    @Test
+    public void testHasInvalidEntry() {
+        Assert.assertFalse("Found an invalid entry", sequenceIndex.hasIndexEntry("invalid"));
+    }
+
+    @Test(expected= PicardException.class)
+    public void testGetInvalidEntry() {
+        sequenceIndex.getIndexEntry("invalid");
+    }
+
+}