Remove auto-creation of fai/dict files for fasta references

-A UserException is now thrown if either the fai or dict file for the
 reference does not exist, with pointers to instructions for creating
 these files.

-Gets rid of problematic file locking that was causing intermittent
 errors on our farm.

-Integration tests to verify that correct exceptions are thrown in
 the case of a missing fai / dict file.

GSA-866 #resolve
This commit is contained in:
David Roazen 2013-04-02 17:04:44 -04:00
parent e7a8e6e8ee
commit 2eac97a76c
3 changed files with 98 additions and 136 deletions

View File

@ -25,10 +25,7 @@
package org.broadinstitute.sting.gatk.datasources.reference;
import net.sf.picard.reference.FastaSequenceIndex;
import net.sf.picard.reference.FastaSequenceIndexBuilder;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.CreateSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.datasources.reads.LocusShard;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
@ -36,11 +33,8 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.file.FSLockWithShared;
import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException;
import java.io.File;
import java.util.ArrayList;
@ -77,128 +71,25 @@ public class ReferenceDataSource {
final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? ".fa" : ".fasta";
final File dictFile = new File(fastaFile.getAbsolutePath().replace(fastaExt, ".dict"));
/*
* if index file does not exist, create it manually
*/
// It's an error if either the fai or dict file does not exist. The user is now responsible
// for creating these files.
if (!indexFile.exists()) {
logger.info(String.format("Index file %s does not exist. Trying to create it now.", indexFile.getAbsolutePath()));
FSLockWithShared indexLock = new FSLockWithShared(indexFile,true);
try {
// get exclusive lock
if (!indexLock.exclusiveLock())
throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile);
FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, true);
FastaSequenceIndex sequenceIndex = faiBuilder.createIndex();
FastaSequenceIndexBuilder.saveAsFaiFile(sequenceIndex, indexFile);
}
catch(FileSystemInabilityToLockException ex) {
logger.info("Unable to create write lock: " + ex.getMessage());
logger.info("Skipping index creation.");
}
catch(UserException e) {
// Rethrow all user exceptions as-is; there should be more details in the UserException itself.
throw e;
}
catch (Exception e) {
// If lock creation succeeded, the failure must have been generating the index.
// If lock creation failed, just skip over index creation entirely.
throw new UserException.CouldNotCreateReferenceIndexFile(indexFile, e);
}
finally {
indexLock.unlock();
}
throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile);
}
/*
* If dict file doesn't exist, try to create it using Picard's CreateSequenceDictionary
* Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the
* argument string
* This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary
*/
if (!dictFile.exists()) {
logger.info(String.format("Dict file %s does not exist. Trying to create it now.", dictFile.getAbsolutePath()));
/*
* Please note another hack here: we have to create a temporary file b/c CreateSequenceDictionary cannot
* create a dictionary file if that file is locked.
*/
// get read lock on dict file so nobody else can read it
FSLockWithShared dictLock = new FSLockWithShared(dictFile,true);
try {
// get shared lock on dict file so nobody else can start creating it
if (!dictLock.exclusiveLock())
throw new UserException.CouldNotCreateReferenceIndexFileBecauseOfLock(dictFile);
// dict will be written to random temporary file in same directory (see note above)
File tempFile = File.createTempFile("dict", null, dictFile.getParentFile());
tempFile.deleteOnExit();
// create dictionary by calling main routine. Temporary fix - see comment above.
String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()),
String.format("o=%s", tempFile.getAbsolutePath())};
new CreateSequenceDictionary().instanceMain(args);
if (!tempFile.renameTo(dictFile))
throw new UserException("Error transferring temp file " + tempFile + " to dict file " + dictFile);
}
catch(FileSystemInabilityToLockException ex) {
logger.info("Unable to create write lock: " + ex.getMessage());
logger.info("Skipping dictionary creation.");
}
catch (Exception e) {
// If lock creation succeeded, the failure must have been generating the index.
// If lock creation failed, just skip over index creation entirely.
throw new UserException.CouldNotCreateReferenceIndexFile(dictFile, e);
}
finally {
dictLock.unlock();
}
throw new UserException.MissingReferenceDictFile(dictFile, fastaFile);
}
/*
* Read reference data by creating an IndexedFastaSequenceFile.
* A note about thread safety: IndexFastaSequenceFile reads the fasta using dictionary and index files. It will
* fail if either does not exist, but not if either is currently being written (in which case it exists
* but is incomplete). To avoid this, obtain shared locks on both files before creating IndexedFastaSequenceFile.
*/
FSLockWithShared dictLock = new FSLockWithShared(dictFile,true);
FSLockWithShared indexLock = new FSLockWithShared(indexFile,true);
// Read reference data by creating an IndexedFastaSequenceFile.
try {
try {
if (!dictLock.sharedLock()) {
throw new ReviewedStingException("Could not open dictionary file because a lock could not be obtained.");
}
}
catch(FileSystemInabilityToLockException ex) {
logger.info(String.format("Unable to create a lock on dictionary file: %s",ex.getMessage()));
logger.info("Treating existing dictionary file as complete.");
}
try {
if (!indexLock.sharedLock()) {
throw new ReviewedStingException("Could not open index file because a lock could not be obtained.");
}
}
catch(FileSystemInabilityToLockException ex) {
logger.info(String.format("Unable to create a lock on index file: %s",ex.getMessage()));
logger.info("Treating existing index file as complete.");
}
reference = new CachingIndexedFastaSequenceFile(fastaFile);
} catch (IllegalArgumentException e) {
}
catch (IllegalArgumentException e) {
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e);
}
catch (Exception e) {
throw new UserException.CouldNotReadInputFile(fastaFile, e);
}
finally {
dictLock.unlock();
indexLock.unlock();
}
}
/**

View File

@ -392,29 +392,25 @@ public class UserException extends ReviewedStingException {
}
}
public static class CouldNotCreateReferenceIndexFile extends UserException {
public CouldNotCreateReferenceIndexFile(File f, Exception e) {
this(f, "", e);
}
public CouldNotCreateReferenceIndexFile(File f, String message, Exception e) {
super(String.format("Index file %s does not exist but could not be created because: %s. ", f, message)
+ (e == null ? "" : getMessage(e)));
}
}
public static class CannotHandleGzippedRef extends UserException {
public CannotHandleGzippedRef() {
super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience.");
}
public CannotHandleGzippedRef() {
super("The GATK cannot process compressed (.gz) reference sequences. Please unzip the file and try again. Sorry for the inconvenience.");
}
}
public static class CouldNotCreateReferenceIndexFileBecauseOfLock extends UserException.CouldNotCreateReferenceIndexFile {
public CouldNotCreateReferenceIndexFileBecauseOfLock(File f) {
super(f, "could not be written because an exclusive file lock could not be obtained. " +
"If you are running multiple instances of GATK, another GATK process is " +
"probably creating this file now, and has locked it. Please wait until this process finishes " +
"and try again.", null);
public static class MissingReferenceFaiFile extends UserException {
public MissingReferenceFaiFile( final File indexFile, final File fastaFile ) {
super(String.format("Fasta index file %s for reference %s does not exist. Please see %s for help creating it.",
indexFile.getAbsolutePath(), fastaFile.getAbsolutePath(),
HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference")));
}
}
public static class MissingReferenceDictFile extends UserException {
public MissingReferenceDictFile( final File dictFile, final File fastaFile ) {
super(String.format("Fasta dict file %s for reference %s does not exist. Please see %s for help creating it.",
dictFile.getAbsolutePath(), fastaFile.getAbsolutePath(),
HelpConstants.forumPost("discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference")));
}
}

View File

@ -0,0 +1,75 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reference;
import junit.framework.Assert;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
public class ReferenceDataSourceIntegrationTest extends WalkerTest {
@Test
public void testReferenceWithMissingFaiFile() throws IOException {
final File dummyReference = createTempFile("dummy", ".fasta");
final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict"));
dictFile.deleteOnExit();
Assert.assertTrue(dictFile.createNewFile());
final WalkerTestSpec spec = new WalkerTestSpec(
" -T PrintReads" +
" -R " + dummyReference.getAbsolutePath() +
" -I " + privateTestDir + "NA12878.4.snippet.bam" +
" -o %s",
1,
UserException.MissingReferenceFaiFile.class
);
executeTest("testReferenceWithMissingFaiFile", spec);
}
@Test
public void testReferenceWithMissingDictFile() throws IOException {
final File dummyReference = createTempFile("dummy", ".fasta");
final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai");
faiFile.deleteOnExit();
Assert.assertTrue(faiFile.createNewFile());
final WalkerTestSpec spec = new WalkerTestSpec(
" -T PrintReads" +
" -R " + dummyReference.getAbsolutePath() +
" -I " + privateTestDir + "NA12878.4.snippet.bam" +
" -o %s",
1,
UserException.MissingReferenceDictFile.class
);
executeTest("testReferenceWithMissingDictFile", spec);
}
}