Class to read an .fai file.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@405 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-04-14 17:37:18 +00:00
parent 704f1bd634
commit 186c799ffc
2 changed files with 275 additions and 0 deletions

View File

@ -0,0 +1,148 @@
package org.broadinstitute.sting.utils;
import edu.mit.broad.picard.PicardException;
import java.util.Scanner;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 14, 2009
* Time: 10:02:10 AM
*
* Reads a fasta index file (.fai).
*/
public class FastaSequenceIndex {
    /**
     * Layout of a single index line: a contig name followed by four
     * whitespace-separated unsigned integers (size, location, bases per line,
     * bytes per line).  The name is any run of non-whitespace characters, so
     * names containing punctuation (e.g. "GL000192.1") are captured whole
     * rather than being cut down to their trailing word characters.  Anything
     * after the fifth column is ignored, provided it is separated by whitespace.
     */
    private static final Pattern INDEX_LINE =
            Pattern.compile("\\s*(\\S+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)(?:\\s.*)?");

    /** Index entries keyed by contig name; a repeated name replaces the earlier entry. */
    private final Map<String,FastaSequenceIndexEntry> sequenceEntries =
            new HashMap<String,FastaSequenceIndexEntry>();

    /**
     * Build a sequence index from the specified file.
     * @param indexFile File to open.
     * @throws FileNotFoundException if the index file cannot be opened.
     * @throws PicardException if file is of invalid format.
     */
    public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
        Scanner scanner = new Scanner(indexFile);
        try {
            // hasNext() looks for a remaining token, so trailing blank lines end
            // the loop quietly while a blank line between entries is still
            // reported as invalid by parseLine().
            while( scanner.hasNext() ) {
                // nextLine() also returns a final line that lacks a line separator.
                FastaSequenceIndexEntry entry = parseLine(scanner.nextLine());
                sequenceEntries.put( entry.getContig(), entry );
            }
        }
        finally {
            // Release the underlying file handle even if a line fails validation.
            scanner.close();
        }
    }

    /**
     * Tokenize and validate one line of the index, converting it to an entry.
     * @param line Text of the line, without its line separator.
     * @return Entry described by the line.
     * @throws PicardException if the line does not have the expected layout or
     *         one of its numeric columns is out of range.
     */
    private static FastaSequenceIndexEntry parseLine( String line ) {
        Matcher tokens = INDEX_LINE.matcher(line);
        if( !tokens.matches() )
            throw new PicardException("Found invalid line in index file:" + line);

        String contig = tokens.group(1);
        try {
            long size = Long.parseLong(tokens.group(2));
            long location = Long.parseLong(tokens.group(3));
            int basesPerLine = Integer.parseInt(tokens.group(4));
            int bytesPerLine = Integer.parseInt(tokens.group(5));
            return new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine);
        }
        catch( NumberFormatException ex ) {
            // The columns are all digits, so this can only mean a value too large
            // for its type.  Report it as the documented format error; only the
            // single-argument PicardException constructor is used in this file.
            throw new PicardException("Found invalid line in index file:" + line);
        }
    }

    /**
     * Does the given contig name have a corresponding entry?
     * @param contigName The contig name for which to search.
     * @return True if contig name is present; false otherwise.
     */
    public boolean hasIndexEntry( String contigName ) {
        return sequenceEntries.containsKey(contigName);
    }

    /**
     * Retrieve the index entry associated with the given contig.
     * @param contigName Name of the contig for which to search.
     * @return Index entry associated with the given contig.
     * @throws PicardException if the associated index entry can't be found.
     */
    public FastaSequenceIndexEntry getIndexEntry( String contigName ) {
        if( !hasIndexEntry(contigName) )
            throw new PicardException("Unable to find entry for contig: " + contigName);
        return sequenceEntries.get(contigName);
    }
}
class FastaSequenceIndexEntry {
    /** Name of the contig this entry describes. */
    private final String contig;
    /** Seek position of the contig within the fasta. */
    private final long location;
    /** Size recorded for the contig. */
    private final long size;
    /** Number of bases on each fasta line. */
    private final int basesPerLine;
    /** Number of bytes occupied by each fasta line. */
    private final int bytesPerLine;

    /**
     * Creates an immutable record of one contig's index data.
     * @param contig Name of the contig.
     * @param location Seek position of the contig within the fasta.
     * @param size Size of the contig.
     * @param basesPerLine Number of bases in a fasta line.
     * @param bytesPerLine Number of bytes in a fasta line.
     */
    public FastaSequenceIndexEntry( String contig,
                                    long location,
                                    long size,
                                    int basesPerLine,
                                    int bytesPerLine ) {
        this.contig = contig;
        this.location = location;
        this.size = size;
        this.basesPerLine = basesPerLine;
        this.bytesPerLine = bytesPerLine;
    }

    /**
     * Gets the contig associated with this entry.
     * @return String representation of the contig.
     */
    public String getContig() {
        return this.contig;
    }

    /**
     * Gets the location of this contig within the fasta.
     * @return seek position within the fasta.
     */
    public long getLocation() {
        return this.location;
    }

    /**
     * Gets the size that was supplied for this contig.
     * @return size of the contig.
     */
    public long getSize() {
        return this.size;
    }

    /**
     * Gets the number of bases in a given line.
     * @return Number of bases in the fasta line.
     */
    public int getBasesPerLine() {
        return this.basesPerLine;
    }

    /**
     * How many bytes (bases + whitespace) are consumed by the
     * given line?
     * @return Number of bytes in a line.
     */
    public int getBytesPerLine() {
        return this.bytesPerLine;
    }

    /**
     * Renders every field of the entry on one line, for debugging.
     * @return Human-readable description of this entry.
     */
    @Override
    public String toString() {
        final String layout = "contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d";
        return String.format(layout, contig, location, size, basesPerLine, bytesPerLine);
    }
}

View File

@ -0,0 +1,127 @@
package org.broadinstitute.sting.utils;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
import org.junit.Assert;
import org.apache.log4j.BasicConfigurator;
import java.io.File;
import java.io.FileNotFoundException;
import edu.mit.broad.picard.PicardException;
/**
* Created by IntelliJ IDEA.
* User: hanna
* Date: Apr 14, 2009
* Time: 10:34:15 AM
* Unit tests for FastaSequenceIndex, run against the Homo_sapiens_assembly18 fasta index.
*/
public class FastaSequenceIndexTest {
    // NOTE(review): absolute path to a mounted reference volume; these tests can
    // only run on machines where that volume is available.
    private final String sequenceIndexName = "/Volumes/ifs/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai";

    /** Index under test; rebuilt from the file before every test method. */
    private FastaSequenceIndex sequenceIndex = null;

    /** Contigs that testAllContigsPresent expects to find in the index. */
    private static final String[] EXPECTED_CONTIGS = {
            "chrM",
            "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8",
            "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
            "chr17", "chr18", "chr19", "chr20", "chr21", "chr22",
            "chrX", "chrY",
            "chr1_random", "chr2_random", "chr3_random", "chr4_random", "chr5_random",
            "chr6_random", "chr7_random", "chr8_random", "chr9_random", "chr10_random",
            "chr11_random", "chr13_random", "chr15_random", "chr16_random", "chr17_random",
            "chr18_random", "chr19_random", "chr21_random", "chr22_random",
            "chrX_random"
    };

    /** Configures log4j once for the whole test class. */
    @BeforeClass
    public static void initialize() {
        BasicConfigurator.configure();
    }

    /**
     * Loads a fresh index before each test.
     * @throws FileNotFoundException if the reference index file is not available.
     */
    @Before
    public void doForEachTest() throws FileNotFoundException {
        sequenceIndex = new FastaSequenceIndex( new File(sequenceIndexName) );
    }

    /** Checks the entry for chrM. */
    @Test
    public void testInitialContig() {
        assertEntry("chrM", 6L, 16571L, 50, 51);
    }

    /** Checks the entry for chr8, whose location exceeds the int range midpoint of the file. */
    @Test
    public void testMiddleContig() {
        assertEntry("chr8", 1419403101L, 146274826L, 50, 51);
    }

    /** Checks the entry for chrX_random, whose location does not fit in an int. */
    @Test
    public void testLastContig() {
        assertEntry("chrX_random", 3156698441L, 1719168L, 50, 51);
    }

    /** Every expected contig must be present in the index. */
    @Test
    public void testAllContigsPresent() {
        for( String contig: EXPECTED_CONTIGS )
            Assert.assertTrue("Contig " + contig + " is not present", sequenceIndex.hasIndexEntry(contig));
    }

    /** An unknown contig name must not be reported as present. */
    @Test
    public void testHasInvalidEntry() {
        Assert.assertFalse("Found an invalid entry", sequenceIndex.hasIndexEntry("invalid"));
    }

    /** Fetching an unknown contig must raise a PicardException. */
    @Test(expected= PicardException.class)
    public void testGetInvalidEntry() {
        sequenceIndex.getIndexEntry("invalid");
    }

    /**
     * Asserts that the index holds an entry for the given contig with exactly
     * the given values.  Expected values are passed to assertEquals before the
     * actual ones so that failure messages label them correctly.
     * @param contig Name of the contig to look up.
     * @param location Expected seek position within the fasta.
     * @param size Expected contig size.
     * @param basesPerLine Expected number of bases per fasta line.
     * @param bytesPerLine Expected number of bytes per fasta line.
     */
    private void assertEntry( String contig, long location, long size, int basesPerLine, int bytesPerLine ) {
        Assert.assertTrue("Contig " + contig + " is not present", sequenceIndex.hasIndexEntry(contig));
        FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry(contig);
        Assert.assertEquals("Contig " + contig + " name is incorrect",contig,entry.getContig());
        Assert.assertEquals("Contig " + contig + " location is incorrect",location,entry.getLocation());
        Assert.assertEquals("Contig " + contig + " size is incorrect",size,entry.getSize());
        Assert.assertEquals("Contig " + contig + " bases per line is incorrect",basesPerLine,entry.getBasesPerLine());
        Assert.assertEquals("Contig " + contig + " bytes per line is incorrect",bytesPerLine,entry.getBytesPerLine());
    }
}