Updating the way reference data is processed, so GATK creates the .fasta.fai and .dict files automatically. If either (or both) don't exist, GATK will create them in the same folder as the fasta file. If it can't write the file, GATK will fail with a message to create them manually.

Note that this functionality will only work if the directory with the fasta is writeable. GATK will fail if directory is read only and and either the .fasta.fai or .dict files don't exist. In the future, we could have these references be created in memory, but we decided against it this time. 

Locking was also added to ReferenceDataSource so no issues come up while running multiple GATKs on the same reference: we don't want one process to be half-finished and another try to read it. So, you could see error messages related to locking. See ReferenceDataSource.java for explanation of the locking strategy. 



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3601 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
bthomas 2010-06-21 21:42:42 +00:00
parent df1cadc4c9
commit 300a18b85f
3 changed files with 133 additions and 88 deletions

View File

@ -25,15 +25,21 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.FastaSequenceIndex;
import org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
import org.broadinstitute.sting.utils.file.FSLock;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
/**
* Loads reference data from fasta file
* Looks for fai and dict files, and tries to create them if they don't exist
@ -47,34 +53,41 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener
/**
* Create reference data source from fasta file
* @param fastaFile Fasta file to be used as reference
*
* Please note: This should be re-done when PIC-370 is fixed. (See below.) At that time, we may consider loading
* sequenceIndex and sequenceDict here, instead of in IndexedFastaSequenceFile.
*/
public ReferenceDataSource(File fastaFile) {
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
File dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict"));
FastaSequenceIndex sequenceIndex; // stores FastaSequenceIndex if file doesn't exist
/*
Note: this code is temporary. See commented code below, which will be used when thread safety is resolved.
*/
if index file does not exist, create it manually
*/
if (!indexFile.exists()) {
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this);
sequenceIndex = faiBuilder.sequenceIndex;
logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath()));
FileChannel indexChannel;
FileLock indexLock;
try {
index = new IndexedFastaSequenceFile(fastaFile, sequenceIndex);
// get exclusive lock
indexChannel = new RandomAccessFile(indexFile, "rw").getChannel();
if ((indexLock = indexChannel.tryLock(0, Long.MAX_VALUE, false)) == null)
throw new StingException("Index file could not be written because a lock could not be obtained." +
"If you are running multiple instances of GATK, another process is probably creating this " +
"file now. Please wait until it is finished and try again.");
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this);
FastaSequenceIndex sequenceIndex = faiBuilder.createIndex();
FastaSequenceIndexBuilder.saveAsFaiFile(sequenceIndex, indexFile);
// unlock
try {
if (indexLock != null)
indexLock.release();
if (indexChannel != null)
indexChannel.close();
}
catch (Exception e) {
throw new StingException("An error occurred while unlocking file:" + indexFile.getAbsolutePath(), e);
}
}
catch (Exception e) {
throw new StingException("Could not load fasta file. Stack trace below. ", e);
}
}
else {
try {
index = new IndexedFastaSequenceFile(fastaFile);
}
catch (Exception e) {
throw new StingException("Could not load fasta file. Stack trace below.", e);
throw new StingException("Index file does not exist and could not be created. See error below.", e);
}
}
@ -83,57 +96,88 @@ public class ReferenceDataSource implements ReferenceDataSourceProgressListener
* Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the
* argument string
* This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary
*
* Todo: Currently working on making the creation of dictionary and index thread-safe
*
*/
/*if (!dictFile.exists()) {
logger.info(String.format("Fasta dictionary file %s does not exist. Trying to create it now.",
dictFile.getAbsolutePath()));
String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()),
String.format("o=%s", dictFile.getAbsolutePath())};
new CreateSequenceDictionary().instanceMain(args);
logger.info(String.format("Dictionary file created successfully!"));
}*/
/**
* Create fai file if it doesn't exist and throw exception if unsuccessful
* Note that this implies walkers cannot be run if a fai file is not provided and GATK cannot write to disk
*
* Todo: currently working on making this thread safe.
* Rather than creating a new fai file, structure is created in memory. This whole block will be fixed
* in a couple days when we figure out the thread stuff.
*
*/
/*if (!indexFile.exists()) {
logger.info(String.format("Fasta index file %s does not exist. Trying to create it now.",
indexFile.getAbsolutePath()));
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this);
if (!dictFile.exists()) {
logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath()));
FileChannel dictChannel;
FileLock dictLock;
try {
faiBuilder.saveAsFaiFile();
logger.info(String.format("Index file created successfully!"));
// get exclusive lock
dictChannel = new RandomAccessFile(indexFile, "rw").getChannel();
if ((dictLock = dictChannel.tryLock(0, Long.MAX_VALUE, false)) == null)
throw new StingException("Dictionary file could not be written because a lock could not be obtained." +
"If you are running multiple instances of GATK, another process is probably creating this " +
"file now. Please wait until it is finished and try again.");
// create dictionary by calling main routine. Temporary fix - see comment above.
String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()),
String.format("o=%s", dictFile.getAbsolutePath())};
new CreateSequenceDictionary().instanceMain(args);
// unlock
try {
if (dictLock != null)
dictLock.release();
if (dictChannel != null)
dictChannel.close();
}
catch (Exception e) {
throw new StingException("An error occurred while unlocking file:" + indexFile.getAbsolutePath(), e);
}
}
catch (Exception e) {
throw new StingException("Index file could not be saved", e);
throw new StingException("Dictionary file does not exist and could not be created. See error below.", e);
}
}
// now create IndexedFastaSequenceFile
/*
* Read reference data by creating an IndexedFastaSequenceFile.
* A note about thread safety: IndexFastaSequenceFile reads the fasta using dictionary and index files. It will
* fail if either does not exist, but not if either is currently being written (in which case it exists
* but is incomplete). To avoid this, obtain shared locks on both files before creating IndexedFastaSequenceFile.
*/
FileChannel dictChannel;
FileChannel indexChannel;
FileLock dictLock;
FileLock indexLock;
try {
index = indexFile.exists() ? new IndexedFastaSequenceFile(fastaFile) :
new IndexedFastaSequenceFile(fastaFile, sequenceIndex);
}
catch (IOException e) {
throw new StingException("An error occurred while loading the fasta file and its .fasta.fai and " +
".dict counterparts.", e);
// set up dictionary and index locks
// channel is read only and lock is shared (third argument is true)
dictChannel = new RandomAccessFile(dictFile, "r").getChannel();
if ((dictLock = dictChannel.tryLock(0, Long.MAX_VALUE, true)) == null) {
throw new StingException("Could not open dictionary file because a lock could not be obtained.");
}
indexChannel = new RandomAccessFile(indexFile, "r").getChannel();
if ((indexLock = indexChannel.tryLock(0, Long.MAX_VALUE, true)) == null) {
throw new StingException("Could not open dictionary file because a lock could not be obtained.");
}
index = new IndexedFastaSequenceFile(fastaFile);
// unlock/close
try {
if (dictLock != null)
dictLock.release();
if (dictChannel != null)
dictChannel.close();
}
catch (Exception e) {
throw new StingException("An error occurred while unlocking file:" + dictFile.getAbsolutePath(), e);
}
try {
if (indexLock != null)
indexLock.release();
if (indexChannel != null)
indexChannel.close();
}
catch (Exception e) {
throw new StingException("An error occurred while unlocking file:" + indexFile.getAbsolutePath(), e);
}
}
catch (Exception e) {
throw new StingException("An error occurred while processing the fasta file and its .fasta.fai and " +
".dict counterparts. If the error could have been caused by the .fasta.fai or .dict files, " +
"you can re-create them by removing them from the folder that the fasta file is in and " +
"running GATK again.", e);
}*/
throw new StingException(String.format("Error reading fasta file %s. See stack trace below.", fastaFile.getAbsolutePath()), e);
}
}
/**

View File

@ -39,7 +39,6 @@ import java.util.Iterator;
public class FastaSequenceIndexBuilder {
public File fastaFile;
ReferenceDataSourceProgressListener progress; // interface that provides a method for updating user on progress of reading file
public FastaSequenceIndex sequenceIndex = new FastaSequenceIndex();
// keep track of location in file
long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file;
@ -57,13 +56,13 @@ public class FastaSequenceIndexBuilder {
this.progress = progress;
this.fastaFile = fastaFile;
fileLength = fastaFile.length();
parseFastaFile();
}
/**
* Creates fasta sequence index from fasta file
* @return FastaSequenceIndex that is read from file
*/
private void parseFastaFile() {
public FastaSequenceIndex createIndex() { // should this be static?
bytesRead = -1;
endOfLastLine = -1;
contig = "";
@ -73,6 +72,7 @@ public class FastaSequenceIndexBuilder {
basesPerLine = 0;
basesThisLine = 0;
lastTimestamp = System.currentTimeMillis();
FastaSequenceIndex sequenceIndex = new FastaSequenceIndex();
// initialize input stream
DataInputStream in;
@ -156,7 +156,7 @@ public class FastaSequenceIndexBuilder {
// if next char is ';' or '>', then there is only one contig =>
if (nextByte == ';' || nextByte == '>')
finishReadingContig();
finishReadingContig(sequenceIndex);
}
}
@ -183,7 +183,7 @@ public class FastaSequenceIndexBuilder {
// if comment or new contig, definitely end of sequence
if (nextByte == ';' || nextByte == '>')
finishReadingContig();
finishReadingContig(sequenceIndex);
// if this line has different # of bases OR same # of bases and different # of bytes:
// error if next char is a valid base, end of contig otherwise
@ -192,7 +192,7 @@ public class FastaSequenceIndexBuilder {
throw new StingException(String.format("An invalid line was found in the contig: %s", contig));
}
else
finishReadingContig();
finishReadingContig(sequenceIndex);
}
endOfLastLine = bytesRead;
}
@ -206,6 +206,7 @@ public class FastaSequenceIndexBuilder {
break;
}
}
return sequenceIndex;
}
catch (IOException e) {
throw new StingException(String.format("Could not read fasta file %s", fastaFile.getAbsolutePath()), e); }
@ -239,7 +240,7 @@ public class FastaSequenceIndexBuilder {
* When reader reaches the end of a contig
* Reset iterators and add contig to sequence index
*/
private void finishReadingContig() {
private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
sequenceIndex.addIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine);
status = Status.NONE;
contig = "";
@ -256,22 +257,18 @@ public class FastaSequenceIndexBuilder {
/**
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
* @param sequenceIndex sequenceIndex to be saved
* @param faiFile file where we should store index
*/
public void saveAsFaiFile() {
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
if (indexFile.exists()) {
throw new StingException(String.format("Fai file could not be created, because a file with name %s already exists." +
"Please delete or rename this file and try again.", indexFile.getAbsolutePath()));
}
public static void saveAsFaiFile(FastaSequenceIndex sequenceIndex, File faiFile) {
BufferedWriter out;
try {
out = new BufferedWriter(new FileWriter(indexFile));
out = new BufferedWriter(new FileWriter(faiFile));
}
catch (Exception e) {
throw new StingException(String.format("Could not open file %s for writing. Check that GATK is permitted to write to disk.",
indexFile.getAbsolutePath()), e);
faiFile.getAbsolutePath()), e);
}
Iterator<FastaSequenceIndexEntry> iter = sequenceIndex.iterator();

View File

@ -43,11 +43,11 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
private FastaSequenceIndexBuilder builder;
private ReferenceDataSourceProgressListener progress;
private File fastaFile;
private FastaSequenceIndex sequenceIndex;
private FastaSequenceIndex controlIndex;
@Before
public void doForEachTest() throws FileNotFoundException {
sequenceIndex = new FastaSequenceIndex();
controlIndex = new FastaSequenceIndex();
}
/**
@ -60,9 +60,10 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
fastaFile = new File(validationDataLocation + "exampleFASTA.fasta");
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
FastaSequenceIndex index = builder.createIndex();
controlIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
Assert.assertTrue(index.equals(controlIndex));
}
@ -76,9 +77,10 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
fastaFile = new File(validationDataLocation + "exampleFASTA-windows.fasta");
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
sequenceIndex.addIndexEntry("chr2", 7, 29, 7, 9);
FastaSequenceIndex index = builder.createIndex();
controlIndex.addIndexEntry("chr2", 7, 29, 7, 9);
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
Assert.assertTrue(index.equals(controlIndex));
}
/**
@ -91,10 +93,11 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
fastaFile = new File(validationDataLocation + "exampleFASTA-combined.fasta");
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
sequenceIndex.addIndexEntry("chr2", 101680, 29, 7, 9);
FastaSequenceIndex index = builder.createIndex();
controlIndex.addIndexEntry("chr1", 6, 100000, 60, 61);
controlIndex.addIndexEntry("chr2", 101680, 29, 7, 9);
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
Assert.assertTrue(index.equals(controlIndex));
}
/**
@ -107,10 +110,11 @@ public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
fastaFile = new File(validationDataLocation + "exampleFASTA-3contigs.fasta");
builder = new FastaSequenceIndexBuilder(fastaFile, progress);
sequenceIndex.addIndexEntry("chr1", 6, 17, 5, 6);
sequenceIndex.addIndexEntry("chr2", 35, 21, 7, 8);
sequenceIndex.addIndexEntry("chr3", 66, 100, 10, 11);
FastaSequenceIndex index = builder.createIndex();
controlIndex.addIndexEntry("chr1", 6, 17, 5, 6);
controlIndex.addIndexEntry("chr2", 35, 21, 7, 8);
controlIndex.addIndexEntry("chr3", 66, 100, 10, 11);
Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex));
Assert.assertTrue(index.equals(controlIndex));
}
}