Adding new support for reference data. ReferenceDataSource is a new class that manages reference data, and allows IndexedFastaSequenceFile to be a simple reader. This checkin also includes FastaSequenceIndexBuilder, which reads a fasta file and creates an index, like samtools faidx. Right now this is not enabled, because we are still working out thread safety. So the only new UI change is that GATK can be run without a fai file. Soon, we will enable 1) GATK to be run without a dict file too, and 2) both dict and fai files will be saved on disk for future program executions. For more info, see ReferenceDataSource.java
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3527 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f55f32d4ee
commit
99b684ea89
|
|
@ -52,10 +52,8 @@ import org.broadinstitute.sting.gatk.walkers.*;
|
|||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.commandline.ArgumentException;
|
||||
import org.broadinstitute.sting.commandline.ArgumentSource;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public class GenomeAnalysisEngine {
|
||||
|
|
@ -72,7 +70,7 @@ public class GenomeAnalysisEngine {
|
|||
/**
|
||||
* Accessor for sharded reference data.
|
||||
*/
|
||||
private IndexedFastaSequenceFile referenceDataSource = null;
|
||||
private ReferenceDataSource referenceDataSource = null;
|
||||
|
||||
/**
|
||||
* Accessor for sharded reference-ordered data.
|
||||
|
|
@ -192,7 +190,7 @@ public class GenomeAnalysisEngine {
|
|||
else {
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) :
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
|
||||
loadIntervals(argCollection.intervals,
|
||||
argCollection.intervalMerging,
|
||||
GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging)));
|
||||
|
|
@ -352,7 +350,7 @@ public class GenomeAnalysisEngine {
|
|||
validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks);
|
||||
|
||||
// validate all the sequence dictionaries against the reference
|
||||
validateSourcesAgainstReference(readsDataSource, referenceDataSource, tracks);
|
||||
validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks);
|
||||
|
||||
rodDataSources = getReferenceOrderedDataSources(my_walker, tracks);
|
||||
}
|
||||
|
|
@ -373,7 +371,7 @@ public class GenomeAnalysisEngine {
|
|||
Utils.scareUser(String.format("Read-based traversals require a reference file but none was given"));
|
||||
}
|
||||
|
||||
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads);
|
||||
return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource.getReference(),rodDataSources,argCollection.numberOfThreads);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -762,14 +760,14 @@ public class GenomeAnalysisEngine {
|
|||
ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL :
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.LINEAR;
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
!argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
} else
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
!argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : ShardStrategyFactory.SHATTER_STRATEGY.LINEAR,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
|
|
@ -782,14 +780,14 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
if (intervals != null && !intervals.isEmpty()) {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE,
|
||||
intervals, maxIterations);
|
||||
} else {
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
shardType,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
|
|
@ -803,7 +801,7 @@ public class GenomeAnalysisEngine {
|
|||
Utils.scareUser("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
|
||||
shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
|
||||
referenceDataSource,
|
||||
referenceDataSource.getReference(),
|
||||
ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
|
||||
drivingDataSource.getSequenceDictionary(),
|
||||
SHARD_SIZE, maxIterations);
|
||||
|
|
@ -839,15 +837,9 @@ public class GenomeAnalysisEngine {
|
|||
* @param refFile Handle to a reference sequence file. Non-null.
|
||||
* @return A thread-safe file wrapper.
|
||||
*/
|
||||
private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) {
|
||||
IndexedFastaSequenceFile ref = null;
|
||||
try {
|
||||
ref = new IndexedFastaSequenceFile(refFile);
|
||||
}
|
||||
catch (FileNotFoundException ex) {
|
||||
throw new StingException("I/O error while opening fasta file: " + ex.getMessage(), ex);
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(ref);
|
||||
private ReferenceDataSource openReferenceSequenceFile(File refFile) {
|
||||
ReferenceDataSource ref = new ReferenceDataSource(refFile);
|
||||
GenomeLocParser.setupRefContigOrdering(ref.getReference());
|
||||
return ref;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceIndex;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.sam.CreateSequenceDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
/**
|
||||
* Loads reference data from fasta file
|
||||
* Looks for fai and dict files, and tries to create them if they don't exist
|
||||
*/
|
||||
public class ReferenceDataSource implements ReferenceDataSourceProgressListener {
|
||||
private IndexedFastaSequenceFile index;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class);
|
||||
|
||||
/**
|
||||
* Create reference data source from fasta file
|
||||
* @param fastaFile Fasta file to be used as reference
|
||||
*
|
||||
* Please note: This should be re-done when PIC-370 is fixed. (See below.) At that time, we may consider loading
|
||||
* sequenceIndex and sequenceDict here, instead of in IndexedFastaSequenceFile.
|
||||
*/
|
||||
public ReferenceDataSource(File fastaFile) {
|
||||
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
|
||||
File dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict"));
|
||||
FastaSequenceIndex sequenceIndex; // stores FastaSequenceIndex if file doesn't exist
|
||||
|
||||
/*
|
||||
Note: this code is temporary. See commented code below, which will be used when thread safety is resolved.
|
||||
*/
|
||||
if (!indexFile.exists()) {
|
||||
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this);
|
||||
sequenceIndex = faiBuilder.sequenceIndex;
|
||||
try {
|
||||
index = new IndexedFastaSequenceFile(fastaFile, sequenceIndex);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException("Could not load fasta file. Stack trace below. ", e);
|
||||
}
|
||||
}
|
||||
else {
|
||||
try {
|
||||
index = new IndexedFastaSequenceFile(fastaFile);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException("Could not load fasta file. Stack trace below.", e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If dict file doesn't exist, try to create it using Picard's CreateSequenceDictionary
|
||||
* Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the
|
||||
* argument string
|
||||
* This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary
|
||||
*
|
||||
* Todo: Currently working on making the creation of dictionary and index thread-safe
|
||||
*
|
||||
*/
|
||||
/*if (!dictFile.exists()) {
|
||||
logger.info(String.format("Fasta dictionary file %s does not exist. Trying to create it now.",
|
||||
dictFile.getAbsolutePath()));
|
||||
String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()),
|
||||
String.format("o=%s", dictFile.getAbsolutePath())};
|
||||
new CreateSequenceDictionary().instanceMain(args);
|
||||
logger.info(String.format("Dictionary file created successfully!"));
|
||||
}*/
|
||||
|
||||
/**
|
||||
* Create fai file if it doesn't exist and throw exception if unsuccessful
|
||||
* Note that this implies walkers cannot be run if a fai file is not provided and GATK cannot write to disk
|
||||
*
|
||||
* Todo: currently working on making this thread safe.
|
||||
* Rather than creating a new fai file, structure is created in memory. This whole block will be fixed
|
||||
* in a couple days when we figure out the thread stuff.
|
||||
*
|
||||
*/
|
||||
/*if (!indexFile.exists()) {
|
||||
logger.info(String.format("Fasta index file %s does not exist. Trying to create it now.",
|
||||
indexFile.getAbsolutePath()));
|
||||
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this);
|
||||
|
||||
try {
|
||||
faiBuilder.saveAsFaiFile();
|
||||
logger.info(String.format("Index file created successfully!"));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException("Index file could not be saved", e);
|
||||
}
|
||||
}
|
||||
|
||||
// now create IndexedFastaSequenceFile
|
||||
try {
|
||||
index = indexFile.exists() ? new IndexedFastaSequenceFile(fastaFile) :
|
||||
new IndexedFastaSequenceFile(fastaFile, sequenceIndex);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new StingException("An error occurred while loading the fasta file and its .fasta.fai and " +
|
||||
".dict counterparts.", e);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException("An error occurred while processing the fasta file and its .fasta.fai and " +
|
||||
".dict counterparts. If the error could have been caused by the .fasta.fai or .dict files, " +
|
||||
"you can re-create them by removing them from the folder that the fasta file is in and " +
|
||||
"running GATK again.", e);
|
||||
}*/
|
||||
}
|
||||
|
||||
/**
|
||||
* Get indexed fasta file
|
||||
* @return IndexedFastaSequenceFile that was created from file
|
||||
*/
|
||||
public IndexedFastaSequenceFile getReference() {
|
||||
return this.index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Notify user of progress in creating fai file
|
||||
* @param percent Percent of fasta file read as a percent
|
||||
*/
|
||||
public void percentProgress(int percent) {
|
||||
System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percent));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||
|
||||
public interface ReferenceDataSourceProgressListener {
|
||||
public void percentProgress(int percent);
|
||||
}
|
||||
|
|
@ -1,3 +1,28 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
|
|
@ -18,19 +43,26 @@ public class FastaSequenceIndex implements Iterable {
|
|||
/**
|
||||
* Store the entries. Use a LinkedHashMap for consistent iteration in insertion order.
|
||||
*/
|
||||
private Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>();
|
||||
private final Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>();
|
||||
|
||||
/**
|
||||
* Build a sequence index from the specified file.
|
||||
* @param indexFile File to open.
|
||||
* @throws FileNotFoundException if the index file cannot be found.
|
||||
*/
|
||||
public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
|
||||
protected FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
|
||||
if(!indexFile.exists())
|
||||
throw new FileNotFoundException(String.format("Fasta index file is missing: %s",indexFile.getAbsolutePath()));
|
||||
|
||||
IoUtil.assertFileIsReadable(indexFile);
|
||||
parseIndexFile(indexFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build an empty sequence index. Entries can be added later.
|
||||
*/
|
||||
protected FastaSequenceIndex() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -102,17 +134,46 @@ public class FastaSequenceIndex implements Iterable {
|
|||
public int size() {
|
||||
return sequenceEntries.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds entry to index. Used by Fai file generator to create index entry on the fly.
|
||||
* @param contig The name of the contig
|
||||
* @param location Byte-referenced location of contig in file
|
||||
* @param size Number of bases in contig
|
||||
* @param basesPerLine Number of bases in each line. Must be uniform.
|
||||
* @param bytesPerLine Number of bytes in each line. Must be uniform.
|
||||
*/
|
||||
public void addIndexEntry(String contig, long location, long size, int basesPerLine, int bytesPerLine) {
|
||||
sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two FastaSequenceIndex objects. Built for use in testing. No hash function has been created.
|
||||
* @param other Another FastaSequenceIndex to compare
|
||||
* @return True if index has the same entries as other instance, in the same order
|
||||
*/
|
||||
public boolean equals(FastaSequenceIndex other) {
|
||||
Iterator<FastaSequenceIndexEntry> iter = this.iterator();
|
||||
Iterator<FastaSequenceIndexEntry> otherIter = other.iterator();
|
||||
while (iter.hasNext()) {
|
||||
if (!otherIter.hasNext())
|
||||
return false;
|
||||
if (!iter.next().equals(otherIter.next()))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Hold an individual entry in a fasta sequence index file.
|
||||
*/
|
||||
class FastaSequenceIndexEntry {
|
||||
private String contig;
|
||||
private long location;
|
||||
private long size;
|
||||
private int basesPerLine;
|
||||
private int bytesPerLine;
|
||||
private final String contig;
|
||||
private final long location;
|
||||
private final long size;
|
||||
private final int basesPerLine;
|
||||
private final int bytesPerLine;
|
||||
|
||||
/**
|
||||
* Create a new entry with the given parameters.
|
||||
|
|
@ -186,4 +247,22 @@ class FastaSequenceIndexEntry {
|
|||
basesPerLine,
|
||||
bytesPerLine );
|
||||
}
|
||||
|
||||
/**
|
||||
* Print string in format of fai file line
|
||||
* @return Contig as one line in a fai file
|
||||
*/
|
||||
public String toIndexFileLine() {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d", contig, size, location, basesPerLine, bytesPerLine);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare entry to another instance
|
||||
* @param other another FastaSequenceIndexEntry
|
||||
* @return True if each has the same name, location, size, basesPerLine and bytesPerLine
|
||||
*/
|
||||
public boolean equals(FastaSequenceIndexEntry other) {
|
||||
return (contig.equals(other.contig) && size == other.size && location == other.location
|
||||
&& basesPerLine == other.basesPerLine && bytesPerLine == other.bytesPerLine);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,290 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import static org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder.Status.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Builds FastaSequenceIndex from fasta file.
|
||||
* Produces fai file with same output as samtools faidx
|
||||
*/
|
||||
public class FastaSequenceIndexBuilder {
|
||||
public File fastaFile;
|
||||
ReferenceDataSourceProgressListener progress; // interface that provides a method for updating user on progress of reading file
|
||||
public FastaSequenceIndex sequenceIndex = new FastaSequenceIndex();
|
||||
|
||||
// keep track of location in file
|
||||
long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file;
|
||||
|
||||
// vars that store information about the contig that is currently being read
|
||||
String contig;
|
||||
long location, size, bytesPerLine, basesPerLine, basesThisLine;
|
||||
|
||||
// vars that keep loop state
|
||||
byte lastByte = 0, currentByte = 0, nextByte = 0;
|
||||
public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT }
|
||||
Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum?
|
||||
|
||||
public FastaSequenceIndexBuilder(File fastaFile, ReferenceDataSourceProgressListener progress) {
|
||||
this.progress = progress;
|
||||
this.fastaFile = fastaFile;
|
||||
fileLength = fastaFile.length();
|
||||
parseFastaFile();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates fasta sequence index from fasta file
|
||||
*/
|
||||
private void parseFastaFile() {
|
||||
bytesRead = -1;
|
||||
endOfLastLine = -1;
|
||||
contig = "";
|
||||
location = 0;
|
||||
size = 0;
|
||||
bytesPerLine = 0;
|
||||
basesPerLine = 0;
|
||||
basesThisLine = 0;
|
||||
lastTimestamp = System.currentTimeMillis();
|
||||
|
||||
// initialize input stream
|
||||
DataInputStream in;
|
||||
try {
|
||||
in = new DataInputStream(new BufferedInputStream(new FileInputStream(fastaFile)));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException(String.format("Could not read fasta file %s", fastaFile.getAbsolutePath()));
|
||||
}
|
||||
|
||||
/*
|
||||
* iterate through each character in file one at a time, but must account for variance in line terminators
|
||||
* strategy is to check if current character is a line terminator (\n, \r), then check next character
|
||||
* only treat as end of line if next character is NOT a line terminator
|
||||
*/
|
||||
try {
|
||||
// intialize iterators
|
||||
nextByte = in.readByte();
|
||||
currentByte = '\n';
|
||||
while(currentByte != -1) {
|
||||
|
||||
bytesRead ++; // update position in file
|
||||
lastByte = currentByte;
|
||||
currentByte = nextByte;
|
||||
try {
|
||||
nextByte = in.readByte();
|
||||
}
|
||||
catch (EOFException e) {
|
||||
nextByte = -1;
|
||||
}
|
||||
|
||||
switch(status) {
|
||||
|
||||
// if not currently reading anything
|
||||
// only thing that can trigger action is '>' (start of contig) or ';' (comment)
|
||||
case NONE:
|
||||
if (currentByte == '>')
|
||||
status = CONTIG;
|
||||
else if (currentByte == ';')
|
||||
status = COMMENT;
|
||||
break;
|
||||
|
||||
// if reading a comment, just ignore everything until end of line
|
||||
case COMMENT:
|
||||
if (isEol(currentByte)) {
|
||||
if (!isEol(nextByte))
|
||||
status = Status.NONE;
|
||||
}
|
||||
break;
|
||||
|
||||
// if reading a contig, add char to contig string
|
||||
// contig string can be anything, including spaces
|
||||
case CONTIG:
|
||||
if (isEol(currentByte)) {
|
||||
if (!isEol(nextByte)) {
|
||||
status = Status.FIRST_SEQ_LINE;
|
||||
location = bytesRead + 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
contig += (char) currentByte;
|
||||
break;
|
||||
|
||||
// record bases and bytes of first sequence line, to validate against later lines
|
||||
case FIRST_SEQ_LINE:
|
||||
|
||||
if (isEol(currentByte)) {
|
||||
|
||||
// record bases per line if last character was a base
|
||||
if (!isEol(lastByte)) {
|
||||
basesPerLine = bytesRead - location;
|
||||
basesThisLine = basesPerLine;
|
||||
size += basesPerLine;
|
||||
}
|
||||
|
||||
// next character is start of next line, now know bytes per line
|
||||
if (!isEol(nextByte)) { // figure out what to do if there is only one data line
|
||||
bytesPerLine = bytesRead - location + 1;
|
||||
status = Status.SEQ_LINE;
|
||||
endOfLastLine = bytesRead;
|
||||
|
||||
// if next char is ';' or '>', then there is only one contig =>
|
||||
if (nextByte == ';' || nextByte == '>')
|
||||
finishReadingContig();
|
||||
}
|
||||
}
|
||||
|
||||
// validate base character
|
||||
else {
|
||||
if (!isValidBase(currentByte))
|
||||
throw new StingException(String.format("An invalid base was found in the contig: %s", contig));
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case SEQ_LINE:
|
||||
|
||||
if (isEol(currentByte)) {
|
||||
|
||||
// record bases per line if last character was a base
|
||||
if (!isEol(lastByte)) {
|
||||
basesThisLine = bytesRead - endOfLastLine - 1;
|
||||
size += basesThisLine;
|
||||
}
|
||||
|
||||
// reached end of line - check if end of contig
|
||||
if (!isEol(nextByte)) {
|
||||
|
||||
// if comment or new contig, definitely end of sequence
|
||||
if (nextByte == ';' || nextByte == '>')
|
||||
finishReadingContig();
|
||||
|
||||
// if this line has different # of bases OR same # of bases and different # of bytes:
|
||||
// error if next char is a valid base, end of contig otherwise
|
||||
else if (basesThisLine != basesPerLine || bytesPerLine != bytesRead - endOfLastLine) {
|
||||
if (isValidBase(nextByte) && nextByte != -1) {
|
||||
throw new StingException(String.format("An invalid line was found in the contig: %s", contig));
|
||||
}
|
||||
else
|
||||
finishReadingContig();
|
||||
}
|
||||
endOfLastLine = bytesRead;
|
||||
}
|
||||
}
|
||||
|
||||
// validate base character
|
||||
else {
|
||||
if (!isValidBase(currentByte))
|
||||
throw new StingException(String.format("An invalid base was found in the contig: %s", contig));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new StingException(String.format("Could not read fasta file %s", fastaFile.getAbsolutePath()), e); }
|
||||
catch (Exception e) {
|
||||
throw new StingException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if character is an end of line character, \n or \r
|
||||
* @param currentByte Character to check, as a byte
|
||||
* @return True if current character is \n or \r' false otherwise
|
||||
*/
|
||||
private boolean isEol(byte currentByte) {
|
||||
return (currentByte == '\n' || currentByte == '\r');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* checks that character is valid base
|
||||
* only checks that the base isn't whitespace, like picard does
|
||||
* could add check that character is A/C/G/T/U if wanted
|
||||
* @param currentByte Character to check, as a byte
|
||||
* @return True if character is not whitespace, false otherwise
|
||||
*/
|
||||
private boolean isValidBase(byte currentByte) {
|
||||
return (!Character.isWhitespace(currentByte) && currentByte != ';' && currentByte != '>');
|
||||
}
|
||||
|
||||
/*
|
||||
* When reader reaches the end of a contig
|
||||
* Reset iterators and add contig to sequence index
|
||||
*/
|
||||
private void finishReadingContig() {
|
||||
sequenceIndex.addIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine);
|
||||
status = Status.NONE;
|
||||
contig = "";
|
||||
size = 0;
|
||||
|
||||
if (System.currentTimeMillis() - lastTimestamp > 10000) {
|
||||
int percentProgress = (int) (100*bytesRead/fileLength);
|
||||
if (progress != null)
|
||||
progress.percentProgress(percentProgress);
|
||||
lastTimestamp = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
|
||||
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
|
||||
*/
|
||||
public void saveAsFaiFile() {
|
||||
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
|
||||
|
||||
if (indexFile.exists()) {
|
||||
throw new StingException(String.format("Fai file could not be created, because a file with name %s already exists." +
|
||||
"Please delete or rename this file and try again.", indexFile.getAbsolutePath()));
|
||||
}
|
||||
|
||||
BufferedWriter out;
|
||||
try {
|
||||
out = new BufferedWriter(new FileWriter(indexFile));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException(String.format("Could not open file %s for writing. Check that GATK is permitted to write to disk.",
|
||||
indexFile.getAbsolutePath()), e);
|
||||
}
|
||||
|
||||
Iterator<FastaSequenceIndexEntry> iter = sequenceIndex.iterator();
|
||||
|
||||
try {
|
||||
while (iter.hasNext()) {
|
||||
out.write(iter.next().toIndexFileLine());
|
||||
out.newLine();
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new StingException(String.format("An error occurred while writing file %s", e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -64,6 +64,24 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
sanityCheckDictionaryAgainstIndex();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
||||
* @param file The file to open.
|
||||
* @param sequenceIndex FastaSequenceIndex that was previously created
|
||||
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
||||
*/
|
||||
public IndexedFastaSequenceFile(File file, FastaSequenceIndex sequenceIndex) throws FileNotFoundException {
|
||||
this.file = file;
|
||||
FileInputStream in = new FileInputStream(file);
|
||||
channel = in.getChannel();
|
||||
|
||||
loadDictionary(file);
|
||||
// Temporary change: sequenceIndex is passed in directly. See comments in ReferenceDataSource.
|
||||
index = sequenceIndex;
|
||||
sanityCheckDictionaryAgainstIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary, if available.
|
||||
* @param fastaFile File to check for a match.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,116 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Test the fasta sequence index reader.
|
||||
*/
|
||||
public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
|
||||
|
||||
private FastaSequenceIndexBuilder builder;
|
||||
private ReferenceDataSourceProgressListener progress;
|
||||
private File fastaFile;
|
||||
private FastaSequenceIndex sequenceIndex;
|
||||
|
||||
@Before
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
sequenceIndex = new FastaSequenceIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests basic unix file with one contig
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void unixFileTest() {
|
||||
logger.warn("Executing unixFileTest");
|
||||
|
||||
fastaFile = new File(validationDataLocation + "exampleFASTA.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
|
||||
sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
|
||||
|
||||
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Tests basic windows file with one contig
|
||||
* File is a simple fasta file
|
||||
*/
|
||||
@Test
|
||||
public void windowsFileTest() {
|
||||
logger.warn("Executing windowsFileTest");
|
||||
|
||||
fastaFile = new File(validationDataLocation + "exampleFASTA-windows.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
|
||||
sequenceIndex.addIndexEntry("chr2", 7, 29, 7, 9);
|
||||
|
||||
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests fasta with the two contigs from above combined
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void combinedWindowsUnix() {
|
||||
logger.warn("Executing combinedWindowsUnix");
|
||||
|
||||
fastaFile = new File(validationDataLocation + "exampleFASTA-combined.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
|
||||
sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
|
||||
sequenceIndex.addIndexEntry("chr2", 101680, 29, 7, 9);
|
||||
|
||||
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests fasta with the two contigs from above combined
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void threeVariableLengthContigs() {
|
||||
logger.warn("Executing threeVariableLengthContigs");
|
||||
|
||||
fastaFile = new File(validationDataLocation + "exampleFASTA-3contigs.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
|
||||
sequenceIndex.addIndexEntry("chr1", 6, 17, 5, 6);
|
||||
sequenceIndex.addIndexEntry("chr2", 35, 21, 7, 8);
|
||||
sequenceIndex.addIndexEntry("chr3", 66, 100, 10, 11);
|
||||
|
||||
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue