The tribble indexes are now updated with correct sequence lengths for each contig they have in their sequence dictionary. Also clean-up in the RMD track builder.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4321 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-09-21 18:21:22 +00:00
parent 2586f0a1ca
commit b968af5db5
4 changed files with 137 additions and 104 deletions

View File

@ -386,6 +386,10 @@ public class GenomeAnalysisEngine {
if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile);
RMDTrackBuilder manager = new RMDTrackBuilder();
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary());
List<RMDTrack> tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings);
validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks);

View File

@ -59,23 +59,37 @@ import java.util.*;
*/
public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
/**
* our log, which we want to capture anything from this class
* our log, which we use to capture anything from this class
*/
private static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
// a constant we use for marking sequence dictionary entries in the Tribble index property list
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
// the input strings we use to create RODs from
List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
private final List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
// the linear index extension
public static final String indexExtension = ".idx";
private Map<String, Class> classes = null;
// private sequence dictionary we use to set our tracks with
private SAMSequenceDictionary dict = null;
/** Create a new plugin manager. */
public RMDTrackBuilder() {
super(FeatureCodec.class, "Codecs", "Codec");
}
/**
*
* @param dict the sequence dictionary to use as a reference for Tribble track contig length lookups
*/
public void setSequenceDictionary(SAMSequenceDictionary dict) {
this.dict = dict;
}
/** @return a list of all available track types we currently have access to create */
public Map<String, Class> getAvailableTrackNamesAndTypes() {
classes = new HashMap<String, Class>();
@ -135,7 +149,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
if (inputFile.getAbsolutePath().endsWith(".gz"))
pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile);
else
pair = getLinearFeatureReader(targetClass, name, inputFile);
pair = getFeatureSource(targetClass, name, inputFile);
return pair;
}
@ -173,20 +187,29 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
}
/**
* create a linear feature reader, where we create the index ahead of time
* create a feature source object given:
* @param targetClass the target class
* @param name the name of the codec
* @param inputFile the tribble file to parse
* @return the input file as a FeatureReader
*/
private Pair<BasicFeatureSource, SAMSequenceDictionary> getLinearFeatureReader(Class targetClass, String name, File inputFile) {
private Pair<BasicFeatureSource, SAMSequenceDictionary> getFeatureSource(Class targetClass, String name, File inputFile) {
Pair<BasicFeatureSource, SAMSequenceDictionary> reader;
try {
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
Index index = loadIndex(inputFile, createCodec(targetClass, name));
SAMSequenceDictionary dictFromIndex = getSequenceDictionaryFromProperties(index);
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (dictFromIndex.size() == 0 && dict != null) {
File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
setIndexSequenceDictionary(index,dict,indexFile,true);
dictFromIndex = getSequenceDictionaryFromProperties(index);
}
reader = new Pair<BasicFeatureSource, SAMSequenceDictionary>(new BasicFeatureSource(inputFile.getAbsolutePath(),
index,
createCodec(targetClass, name)),
sequenceSetToDictionary(index.getSequenceNames()));
dictFromIndex);
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e);
} catch (IOException e) {
@ -199,11 +222,10 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
* create an index for the input file
* @param inputFile the input file
* @param codec the codec to use
* @param onDisk write the index to disk?
* @return a linear index for the specified type
* @throws IOException if we cannot write the index file
*/
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException {
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException {
// create the index file name, locking on the index file name
File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
@ -218,7 +240,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
if (idx != null) return idx;
// we couldn't read the file, or we fell out of the conditions above, continue on to making a new index
return createNewIndex(inputFile, codec, onDisk, indexFile, lock);
return writeIndexToDisk(createIndexInMemory(inputFile, codec), indexFile, lock);
}
/**
@ -277,33 +299,29 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
/**
* attempt to create the index, and write it to disk
* @param inputFile the input file
* @param codec the codec to use
* @param onDisk if they asked for disk storage or not
* attempt to write the index to disk
* @param index the index to write to disk
* @param indexFile the index file location
* @param lock the locking object
* @return the index object
* @throws IOException when unable to create the new index
*/
private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException {
Index index = createIndexInMemory(inputFile, codec);
private static Index writeIndexToDisk(Index index, File indexFile, FSLockWithShared lock) throws IOException {
boolean locked = false; // could we exclusive lock the file?
try {
locked = lock.exclusiveLock();
if (locked) {
logger.info("Writing Tribble index to disk for file " + inputFile);
logger.info("Writing Tribble index to disk for file " + indexFile);
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
index.write(stream);
stream.close();
}
else // we can't write it to disk, just store it in memory, tell them this
if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only");
logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only");
return index;
}
catch(FileSystemInabilityToLockException ex) {
throw new UserException.MissortedFile(inputFile,"Unexpected inability to lock exception", ex);
throw new UserException.CouldNotCreateOutputFile(indexFile,"Unexpected inability to lock exception", ex);
}
finally {
if (locked) lock.unlock();
@ -318,28 +336,12 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
* @return a LinearIndex, given the file location
* @throws IOException when unable to create the index in memory
*/
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException {
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) {
// this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile);
return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
}
/**
* convert a list of Strings into a sequence dictionary
* @param contigList the contig list, in coordinate order, this is allowed to be null
* @return a SAMSequenceDictionary, WITHOUT contig sizes
*/
private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet<String> contigList) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
if (contigList == null) return dict;
for (String name : contigList) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
/**
* Returns a collection of track names that match the record type.
* @param trackRecordType the record type specified in the @RMD annotation
@ -361,10 +363,11 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
* find the associated reference meta data
*
* @param bindings the bindings of strings from the -B command line option
* @param engine the GATK engine to bind the tracks to
*
* @return a list of RMDTracks, one for each -B option
*/
public List<RMDTrack> getReferenceMetaDataSources(GenomeAnalysisEngine engine,List<String> bindings) {
public List<RMDTrack> getReferenceMetaDataSources(GenomeAnalysisEngine engine, List<String> bindings) {
initializeBindings(engine,bindings);
// try and make the tracks given their requests
return createRequestedTrackObjects();
@ -416,4 +419,74 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
}
return tracks;
}
// ---------------------------------------------------------------------------------------------------------
// static functions to work with the sequence dictionaries of indexes
// ---------------------------------------------------------------------------------------------------------
/**
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
* @param index the index file to use
* @return a SAMSequenceDictionary built from the index properties; empty if no dictionary entries are present
*/
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
SAMSequenceDictionary dict = new SAMSequenceDictionary();
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
Integer.valueOf(entry.getValue())));
}
return dict;
}
/**
* create the sequence dictionary with the contig list; a backup approach
* @param index the index file to use
* @param dict the sequence dictionary to add contigs to
* @return the filled-in sequence dictionary
*/
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
LinkedHashSet<String> seqNames = index.getSequenceNames();
if (seqNames == null) {
return dict;
}
for (String name : seqNames) {
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
dict.addSequence(seq);
}
return dict;
}
/**
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
* (that each contig in the index is in the sequence dictionary).
* @param dict the sequence dictionary
* @param index the index file
* @param indexFile the index file
* @param rewriteIndex should we rewrite the index when we're done?
*
*/
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
throw new UserException.IncompatibleSequenceDictionaries("The sequence dictionary from the reference the GATK is running with is not compatible with the sequence " +
"dictionary in the Tribble file " + indexFile + ". It doesn't contain the contig: " + seq.getSequenceName(),
"RMD Sequence Dictionary",
currentDict,
"Reference Sequence Dictionary",
dict);
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
}
// re-write the index
if (rewriteIndex) try {
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
} catch (IOException e) {
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
}
}
}

View File

@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.index.Index;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.BaseTest;
@ -48,11 +50,12 @@ import java.util.Map;
*/
public class RMDTrackBuilderUnitTest extends BaseTest {
private RMDTrackBuilder builder;
private IndexedFastaSequenceFile seq;
@Before
public void setup() {
builder = new RMDTrackBuilder();
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(b36KGReference));
seq = new IndexedFastaSequenceFile(new File(b36KGReference));
GenomeLocParser.setupRefContigOrdering(seq);
}
@ -67,7 +70,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
public void testBuilderIndexUnwriteable() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/relic.vcf");
try {
builder.loadIndex(vcfFile, new VCFCodec(), true);
builder.loadIndex(vcfFile, new VCFCodec());
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
@ -103,7 +106,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
Index ind = null;
try {
ind = builder.loadIndex(vcfFile, new VCFCodec(), true);
ind = builder.loadIndex(vcfFile, new VCFCodec());
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
@ -130,7 +133,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
if (vcfFileIndex.exists()) vcfFileIndex.delete();
try {
builder.loadIndex(vcfFile, new VCFCodec(), true);
builder.loadIndex(vcfFile, new VCFCodec());
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
@ -140,30 +143,19 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
}
// test to make sure we delete the index and regenerate if it's out of date
//@Test
public void testBuilderIndexOutOfDate() {
File vcfFile = createOutofDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf"));
try {
builder.loadIndex(vcfFile, new VCFCodec(), true);
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
}
//System.err.println("index : " + new File(vcfFile + ".idx").lastModified());
// System.err.println("vcf : " + vcfFile.lastModified());
// make sure that we removed and updated the index
Assert.assertTrue("VCF file index wasn't updated", new File(vcfFile + ".idx").lastModified() > vcfFile.lastModified());
}
// test to make sure we delete the index and regenerate if it's out of date
// @Test
public void testBuilderIndexGoodDate() {
// test to make sure we get a full sequence dictionary from the VCF (when we set the dictionary in the builder)
@Test
public void testBuilderIndexSequenceDictionary() {
File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf"));
Long indexTimeStamp = new File(vcfFile.getAbsolutePath() + ".idx").lastModified();
try {
builder.loadIndex(vcfFile, new VCFCodec(), true);
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false);
SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx);
for (SAMSequenceRecord ent : seq.getSequenceDictionary().getSequences()) {
Assert.assertNotNull("Sequence missing from set dictionary: " + ent.getSequenceName(),dict.getSequence(ent.getSequenceName()));
Assert.assertEquals(ent.getSequenceLength(),dict.getSequence(ent.getSequenceName()).getSequenceLength());
}
} catch (IOException e) {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
@ -175,42 +167,6 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
Assert.assertTrue("Fail: index file was modified", new File(vcfFile + ".idx").lastModified() == indexTimeStamp);
}
/**
* create a temporary file and an associated out of date index file
*
* @param tribbleFile the tribble file
* @return a file pointing to the new tmp location, with out of date index
*/
private File createOutofDateIndexFile(File tribbleFile) {
try {
// first copy the tribble file to a temporary file
File tmpFile = File.createTempFile("TribbleUnitTestFile", "");
tmpFile.deleteOnExit();
logger.info("creating temp file " + tmpFile);
// create a fake index, before we copy so it's out of date
File tmpIndex = new File(tmpFile.getAbsolutePath() + ".idx");
tmpIndex.deleteOnExit();
// sleep, to make sure the timestamps are different
Thread.sleep(2000);
// copy the vcf (tribble) file to the tmp file location
copyFile(tribbleFile, tmpFile);
// sleep again, to make sure the timestamps are different (vcf vs. the updated index file)
Thread.sleep(2000);
return tmpFile;
} catch (IOException e) {
Assert.fail("Fail: Unable to create temperary file");
} catch (InterruptedException e) {
Assert.fail("Fail: Somehow our thread got interupted");
}
return null;
}
/**
* create a temporary file and an associated out of date index file
*

View File

@ -13,15 +13,15 @@ public class ValidateRODForReadsIntegrationTest extends WalkerTest {
private final String vcfFile = validationDataLocation + "rodForReadsVCFCheck.vcf";
private final String dbSNPFile = GATKDataLocation + "dbsnp_129_hg18.rod";
public static String baseTestString1KG() {
return "-T ValidateRODForReads -o %s -R testdata/exampleFASTA.fasta -U ALLOW_SEQ_DICT_INCOMPATIBILITY -I testdata/exampleBAM.bam";
public static String baseTestString() {
return "-T ValidateRODForReads -o %s -R " + hg18Reference + " -I " + validationDataLocation + "small_bam_for_rods_for_reads.bam";
}
@Test
public void testSimpleVCFPileup() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString1KG() + " -B:vcf,vcf " + vcfFile, 1,
baseTestString() + " -B:vcf,vcf " + vcfFile, 1,
Arrays.asList("f7919e9dc156fb5d3ad0541666864ea5"));
executeTest("testSimpleVCFPileup", spec);
}
@ -29,7 +29,7 @@ public class ValidateRODForReadsIntegrationTest extends WalkerTest {
@Test
public void testSimpleDbSNPPileup() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString1KG() + " -B:dbsnp,dbsnp " + dbSNPFile, 1,
baseTestString() + " -B:dbsnp,dbsnp " + dbSNPFile, 1,
Arrays.asList("c63b8ef9291a450f0519c73ac9cae189"));
executeTest("testSimpleDbSNPPileup", spec);
}