Class to read an .fai file.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@405 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
704f1bd634
commit
186c799ffc
|
|
@ -0,0 +1,148 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
import java.util.Scanner;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 10:02:10 AM
|
||||
*
|
||||
* Reads a fasta index file (.fai).
|
||||
*/
|
||||
public class FastaSequenceIndex {
|
||||
private Map<String,FastaSequenceIndexEntry> sequenceEntries =
|
||||
new HashMap<String,FastaSequenceIndexEntry>();
|
||||
|
||||
/**
|
||||
* Build a sequence index from the specified file.
|
||||
* @param indexFile File to open.
|
||||
* @throws PicardException if file is of invalid format.
|
||||
*/
|
||||
public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
|
||||
Scanner scanner = new Scanner(indexFile);
|
||||
|
||||
while( scanner.hasNext() ) {
|
||||
// Tokenize and validate the index line.
|
||||
String result = scanner.findInLine("(\\w+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
|
||||
if( result == null )
|
||||
throw new PicardException("Found invalid line in index file:" + scanner.nextLine());
|
||||
MatchResult tokens = scanner.match();
|
||||
if( tokens.groupCount() != 5 )
|
||||
throw new PicardException("Found invalid line in index file:" + scanner.nextLine());
|
||||
|
||||
// Skip past the line separator
|
||||
scanner.nextLine();
|
||||
|
||||
// Parse the index line.
|
||||
String contig = tokens.group(1);
|
||||
long size = Long.valueOf(tokens.group(2));
|
||||
long location = Long.valueOf(tokens.group(3));
|
||||
int basesPerLine = Integer.valueOf(tokens.group(4));
|
||||
int bytesPerLine = Integer.valueOf(tokens.group(5));
|
||||
|
||||
// Build sequence structure
|
||||
sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the given contig name have a corresponding entry?
|
||||
* @param contigName The contig name for which to search.
|
||||
* @return True if contig name is present; false otherwise.
|
||||
*/
|
||||
public boolean hasIndexEntry( String contigName ) {
|
||||
return sequenceEntries.containsKey(contigName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index entry associated with the given contig.
|
||||
* @param contigName Name of the contig for which to search.
|
||||
* @return Index entry associated with the given contig.
|
||||
* @throws PicardException if the associated index entry can't be found.
|
||||
*/
|
||||
public FastaSequenceIndexEntry getIndexEntry( String contigName ) {
|
||||
if( !hasIndexEntry(contigName) )
|
||||
throw new PicardException("Unable to find entry for contig: " + contigName);
|
||||
|
||||
return sequenceEntries.get(contigName);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Immutable holder for the five values describing one contig in a fasta index:
 * contig name, location, size, bases per line and bytes per line.
 */
class FastaSequenceIndexEntry {
    private final String contig;
    private final long location;
    private final long size;
    private final int basesPerLine;
    private final int bytesPerLine;

    /**
     * Create a new entry; the arguments are stored as given, without validation.
     * @param contig Name of the contig.
     * @param location Seek position of the contig within the fasta.
     * @param size Size of the contig data.
     * @param basesPerLine Number of bases in a fasta line.
     * @param bytesPerLine Number of bytes (bases + whitespace) in a fasta line.
     */
    public FastaSequenceIndexEntry( String contig,
                                    long location,
                                    long size,
                                    int basesPerLine,
                                    int bytesPerLine ) {
        this.contig = contig;
        this.location = location;
        this.size = size;
        this.basesPerLine = basesPerLine;
        this.bytesPerLine = bytesPerLine;
    }

    /**
     * Gets the contig associated with this entry.
     * @return String representation of the contig.
     */
    public String getContig() {
        return contig;
    }

    /**
     * Gets the location of this contig within the fasta.
     * @return seek position within the fasta.
     */
    public long getLocation() {
        return location;
    }

    /**
     * Gets the size, in bytes, of the data in the contig.
     * @return size of the contig bases in bytes.
     */
    public long getSize() {
        return size;
    }

    /**
     * Gets the number of bases in a given line.
     * @return Number of bases in the fasta line.
     */
    public int getBasesPerLine() {
        return basesPerLine;
    }

    /**
     * How many bytes (bases + whitespace) are consumed by the
     * given line?
     * @return Number of bytes in a line.
     */
    public int getBytesPerLine() {
        return bytesPerLine;
    }

    /**
     * Render all five fields on a single line, for logging and debugging.
     * @return Human-readable description of this entry.
     */
    @Override
    public String toString() {
        return String.format("contig %s; location %d; size %d; basesPerLine %d; bytesPerLine %d", contig,
                                                                                                  location,
                                                                                                  size,
                                                                                                  basesPerLine,
                                                                                                  bytesPerLine );
    }
}
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import org.apache.log4j.BasicConfigurator;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 10:34:15 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class FastaSequenceIndexTest {
|
||||
private final String sequenceIndexName = "/Volumes/ifs/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai";
|
||||
private FastaSequenceIndex sequenceIndex = null;
|
||||
|
||||
@BeforeClass
|
||||
public static void initialize() {
|
||||
BasicConfigurator.configure();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
sequenceIndex = new FastaSequenceIndex( new File(sequenceIndexName) );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInitialContig() {
|
||||
Assert.assertTrue("Contig chrM is not present", sequenceIndex.hasIndexEntry("chrM"));
|
||||
FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrM");
|
||||
Assert.assertEquals("Contig chrM name is incorrect",entry.getContig(),"chrM");
|
||||
Assert.assertEquals("Contig chrM location is incorrect",entry.getLocation(),6L);
|
||||
Assert.assertEquals("Contig chrM size is incorrect",entry.getSize(),16571L);
|
||||
Assert.assertEquals("Contig chrM bases per line is incorrect",entry.getBasesPerLine(),50);
|
||||
Assert.assertEquals("Contig chrM bytes per line is incorrect",entry.getBytesPerLine(),51);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMiddleContig() {
|
||||
Assert.assertTrue("Contig chr8 is not present", sequenceIndex.hasIndexEntry("chr8"));
|
||||
FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chr8");
|
||||
Assert.assertEquals("Contig chr8 name is incorrect",entry.getContig(),"chr8");
|
||||
Assert.assertEquals("Contig chr8 location is incorrect",entry.getLocation(),1419403101L);
|
||||
Assert.assertEquals("Contig chr8 size is incorrect",entry.getSize(),146274826L);
|
||||
Assert.assertEquals("Contig chr8 bases per line is incorrect",entry.getBasesPerLine(),50);
|
||||
Assert.assertEquals("Contig chr8 bytes per line is incorrect",entry.getBytesPerLine(),51);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLastContig() {
|
||||
Assert.assertTrue("Contig chrX_random is not present", sequenceIndex.hasIndexEntry("chrX_random"));
|
||||
FastaSequenceIndexEntry entry = sequenceIndex.getIndexEntry("chrX_random");
|
||||
Assert.assertEquals("Contig chrX_random name is incorrect",entry.getContig(),"chrX_random");
|
||||
Assert.assertEquals("Contig chrX_random location is incorrect",entry.getLocation(),3156698441L);
|
||||
Assert.assertEquals("Contig chrX_random size is incorrect",entry.getSize(),1719168L);
|
||||
Assert.assertEquals("Contig chrX_random bases per line is incorrect",entry.getBasesPerLine(),50);
|
||||
Assert.assertEquals("Contig chrX_random bytes per line is incorrect",entry.getBytesPerLine(),51);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAllContigsPresent() {
|
||||
Assert.assertTrue("Contig chrM is not present", sequenceIndex.hasIndexEntry("chrM"));
|
||||
Assert.assertTrue("Contig chr1 is not present", sequenceIndex.hasIndexEntry("chr1"));
|
||||
Assert.assertTrue("Contig chr2 is not present", sequenceIndex.hasIndexEntry("chr2"));
|
||||
Assert.assertTrue("Contig chr3 is not present", sequenceIndex.hasIndexEntry("chr3"));
|
||||
Assert.assertTrue("Contig chr4 is not present", sequenceIndex.hasIndexEntry("chr4"));
|
||||
Assert.assertTrue("Contig chr5 is not present", sequenceIndex.hasIndexEntry("chr5"));
|
||||
Assert.assertTrue("Contig chr6 is not present", sequenceIndex.hasIndexEntry("chr6"));
|
||||
Assert.assertTrue("Contig chr7 is not present", sequenceIndex.hasIndexEntry("chr7"));
|
||||
Assert.assertTrue("Contig chr8 is not present", sequenceIndex.hasIndexEntry("chr8"));
|
||||
Assert.assertTrue("Contig chr9 is not present", sequenceIndex.hasIndexEntry("chr9"));
|
||||
Assert.assertTrue("Contig chr10 is not present", sequenceIndex.hasIndexEntry("chr10"));
|
||||
Assert.assertTrue("Contig chr11 is not present", sequenceIndex.hasIndexEntry("chr11"));
|
||||
Assert.assertTrue("Contig chr12 is not present", sequenceIndex.hasIndexEntry("chr12"));
|
||||
Assert.assertTrue("Contig chr13 is not present", sequenceIndex.hasIndexEntry("chr13"));
|
||||
Assert.assertTrue("Contig chr14 is not present", sequenceIndex.hasIndexEntry("chr14"));
|
||||
Assert.assertTrue("Contig chr15 is not present", sequenceIndex.hasIndexEntry("chr15"));
|
||||
Assert.assertTrue("Contig chr16 is not present", sequenceIndex.hasIndexEntry("chr16"));
|
||||
Assert.assertTrue("Contig chr17 is not present", sequenceIndex.hasIndexEntry("chr17"));
|
||||
Assert.assertTrue("Contig chr18 is not present", sequenceIndex.hasIndexEntry("chr18"));
|
||||
Assert.assertTrue("Contig chr19 is not present", sequenceIndex.hasIndexEntry("chr19"));
|
||||
Assert.assertTrue("Contig chr20 is not present", sequenceIndex.hasIndexEntry("chr20"));
|
||||
Assert.assertTrue("Contig chr21 is not present", sequenceIndex.hasIndexEntry("chr21"));
|
||||
Assert.assertTrue("Contig chr22 is not present", sequenceIndex.hasIndexEntry("chr22"));
|
||||
Assert.assertTrue("Contig chrX is not present", sequenceIndex.hasIndexEntry("chrX"));
|
||||
Assert.assertTrue("Contig chrY is not present", sequenceIndex.hasIndexEntry("chrY"));
|
||||
Assert.assertTrue("Contig chr1_random is not present", sequenceIndex.hasIndexEntry("chr1_random"));
|
||||
Assert.assertTrue("Contig chr2_random is not present", sequenceIndex.hasIndexEntry("chr2_random"));
|
||||
Assert.assertTrue("Contig chr3_random is not present", sequenceIndex.hasIndexEntry("chr3_random"));
|
||||
Assert.assertTrue("Contig chr4_random is not present", sequenceIndex.hasIndexEntry("chr4_random"));
|
||||
Assert.assertTrue("Contig chr5_random is not present", sequenceIndex.hasIndexEntry("chr5_random"));
|
||||
Assert.assertTrue("Contig chr6_random is not present", sequenceIndex.hasIndexEntry("chr6_random"));
|
||||
Assert.assertTrue("Contig chr7_random is not present", sequenceIndex.hasIndexEntry("chr7_random"));
|
||||
Assert.assertTrue("Contig chr8_random is not present", sequenceIndex.hasIndexEntry("chr8_random"));
|
||||
Assert.assertTrue("Contig chr9_random is not present", sequenceIndex.hasIndexEntry("chr9_random"));
|
||||
Assert.assertTrue("Contig chr10_random is not present", sequenceIndex.hasIndexEntry("chr10_random"));
|
||||
Assert.assertTrue("Contig chr11_random is not present", sequenceIndex.hasIndexEntry("chr11_random"));
|
||||
Assert.assertTrue("Contig chr13_random is not present", sequenceIndex.hasIndexEntry("chr13_random"));
|
||||
Assert.assertTrue("Contig chr15_random is not present", sequenceIndex.hasIndexEntry("chr15_random"));
|
||||
Assert.assertTrue("Contig chr16_random is not present", sequenceIndex.hasIndexEntry("chr16_random"));
|
||||
Assert.assertTrue("Contig chr17_random is not present", sequenceIndex.hasIndexEntry("chr17_random"));
|
||||
Assert.assertTrue("Contig chr18_random is not present", sequenceIndex.hasIndexEntry("chr18_random"));
|
||||
Assert.assertTrue("Contig chr19_random is not present", sequenceIndex.hasIndexEntry("chr19_random"));
|
||||
Assert.assertTrue("Contig chr21_random is not present", sequenceIndex.hasIndexEntry("chr21_random"));
|
||||
Assert.assertTrue("Contig chr22_random is not present", sequenceIndex.hasIndexEntry("chr22_random"));
|
||||
Assert.assertTrue("Contig chrX_random is not present", sequenceIndex.hasIndexEntry("chrX_random"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHasInvalidEntry() {
|
||||
Assert.assertFalse("Found an invalid entry", sequenceIndex.hasIndexEntry("invalid"));
|
||||
}
|
||||
|
||||
@Test(expected= PicardException.class)
|
||||
public void testGetInvalidEntry() {
|
||||
sequenceIndex.getIndexEntry("invalid");
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue