diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b0cb761fc..23520c3dd 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -386,6 +386,10 @@ public class GenomeAnalysisEngine { if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile); RMDTrackBuilder manager = new RMDTrackBuilder(); + + // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference + manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); + List tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings); validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index 8ed3d2c7f..e57e2b18b 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -59,23 +59,37 @@ import java.util.*; */ public class RMDTrackBuilder extends PluginManager { /** - * our log, which we want to capture anything from this class + * our log, which we use to capture anything from this class */ - private static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; // the input strings we use to create RODs from - List inputs = new ArrayList(); + private final List inputs = new ArrayList(); // the linear index extension public static final String indexExtension = ".idx"; private Map classes = null; + // private sequence dictionary we use to set our tracks with + private SAMSequenceDictionary dict = null; + /** Create a new plugin manager. */ public RMDTrackBuilder() { super(FeatureCodec.class, "Codecs", "Codec"); } + /** + * + * @param dict the sequence dictionary to use as a reference for Tribble track contig length lookups + */ + public void setSequenceDictionary(SAMSequenceDictionary dict) { + this.dict = dict; + } + /** @return a list of all available track types we currently have access to create */ public Map getAvailableTrackNamesAndTypes() { classes = new HashMap(); @@ -135,7 +149,7 @@ public class RMDTrackBuilder extends PluginManager { if (inputFile.getAbsolutePath().endsWith(".gz")) pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile); else - pair = getLinearFeatureReader(targetClass, name, inputFile); + pair = getFeatureSource(targetClass, name, inputFile); return pair; } @@ -173,20 +187,29 @@ public class RMDTrackBuilder extends PluginManager { } /** - * create a linear feature reader, where we create the index ahead of time + * create a feature source object given: * @param targetClass the target class * @param name the name of the codec * @param inputFile the tribble file to parse * @return the input file as a FeatureReader */ - private Pair getLinearFeatureReader(Class targetClass, String name, File inputFile) { + private Pair getFeatureSource(Class targetClass, String name, File inputFile) { Pair reader; try { - Index index = loadIndex(inputFile, createCodec(targetClass, name), true); + Index index = loadIndex(inputFile, createCodec(targetClass, name)); + SAMSequenceDictionary dictFromIndex = getSequenceDictionaryFromProperties(index); + + // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match + if (dictFromIndex.size() == 0 && dict != null) { + File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension); + setIndexSequenceDictionary(index,dict,indexFile,true); + dictFromIndex = getSequenceDictionaryFromProperties(index); + } + reader = new Pair(new BasicFeatureSource(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)), - sequenceSetToDictionary(index.getSequenceNames())); + dictFromIndex); } catch (FileNotFoundException e) { throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e); } catch (IOException e) { @@ -199,11 +222,10 @@ public class RMDTrackBuilder extends PluginManager { * create an index for the input file * @param inputFile the input file * @param codec the codec to use - * @param onDisk write the index to disk? * @return a linear index for the specified type * @throws IOException if we cannot write the index file */ - public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { + public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException { // create the index file name, locking on the index file name File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension); @@ -218,7 +240,7 @@ public class RMDTrackBuilder extends PluginManager { if (idx != null) return idx; // we couldn't read the file, or we fell out of the conditions above, continue on to making a new index - return createNewIndex(inputFile, codec, onDisk, indexFile, lock); + return writeIndexToDisk(createIndexInMemory(inputFile, codec), indexFile, lock); } /** @@ -277,33 +299,29 @@ public class RMDTrackBuilder extends PluginManager { /** - * attempt to create the index, and write it to disk - * @param inputFile the input file - * @param codec the codec to use - * @param onDisk if they asked for disk storage or now + * attempt to write the index to disk + * @param index the index to write to disk * @param indexFile the index file location * @param lock the locking object * @return the index object * @throws IOException when unable to create the new index */ - private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException { - Index index = createIndexInMemory(inputFile, codec); - + private static Index writeIndexToDisk(Index index, File indexFile, FSLockWithShared lock) throws IOException { boolean locked = false; // could we exclusive lock the file? try { locked = lock.exclusiveLock(); if (locked) { - logger.info("Writing Tribble index to disk for file " + inputFile); + logger.info("Writing Tribble index to disk for file " + indexFile); LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); index.write(stream); stream.close(); } else // we can't write it to disk, just store it in memory, tell them this - if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only"); + logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only"); return index; } catch(FileSystemInabilityToLockException ex) { - throw new UserException.MissortedFile(inputFile,"Unexpected inability to lock exception", ex); + throw new UserException.CouldNotCreateOutputFile(indexFile,"Unexpected inability to lock exception", ex); } finally { if (locked) lock.unlock(); @@ -318,28 +336,12 @@ public class RMDTrackBuilder extends PluginManager { * @return a LinearIndex, given the file location * @throws IOException when unable to create the index in memory */ - private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException { + private static Index createIndexInMemory(File inputFile, FeatureCodec codec) { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); } - /** - * convert a list of Strings into a sequence dictionary - * @param contigList the contig list, in coordinate order, this is allowed to be null - * @return a SAMSequenceDictionary, WITHOUT contig sizes - */ - private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet contigList) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - if (contigList == null) return dict; - - for (String name : contigList) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - /** * Returns a collection of track names that match the record type. * @param trackRecordType the record type specified in the @RMD annotation @@ -361,10 +363,11 @@ public class RMDTrackBuilder extends PluginManager { * find the associated reference meta data * * @param bindings the bindings of strings from the -B command line option + * @param engine the GATK engine to bind the tracks to * * @return a list of RMDTracks, one for each -B option */ - public List getReferenceMetaDataSources(GenomeAnalysisEngine engine,List bindings) { + public List getReferenceMetaDataSources(GenomeAnalysisEngine engine, List bindings) { initializeBindings(engine,bindings); // try and make the tracks given their requests return createRequestedTrackObjects(); @@ -416,4 +419,74 @@ public class RMDTrackBuilder extends PluginManager { } return tracks; } + + + // --------------------------------------------------------------------------------------------------------- + // static functions to work with the sequence dictionaries of indexes + // --------------------------------------------------------------------------------------------------------- + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + /** + * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. + * (that each contig in the index is in the sequence dictionary). + * @param dict the sequence dictionary + * @param index the index file + * @param indexFile the index file + * @param rewriteIndex should we rewrite the index when we're done? + * + */ + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { + if (dict == null) return; + + SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set + for (SAMSequenceRecord seq : currentDict.getSequences()) { + if (dict.getSequence(seq.getSequenceName()) == null) + throw new UserException.IncompatibleSequenceDictionaries("The sequence dictionary from the reference the GATK is running with is not compatible with the sequence " + + "dictionary in the Tribble file " + indexFile + ". It doesn't contain the contig: " + seq.getSequenceName(), + "RMD Sequence Dictionary", + currentDict, + "Reference Sequence Dictionary", + dict); + index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); + } + // re-write the index + if (rewriteIndex) try { + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + } + } } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java index 873455df7..41f54a7c7 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders; import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.broad.tribble.index.Index; import org.broad.tribble.vcf.VCFCodec; import org.broadinstitute.sting.BaseTest; @@ -48,11 +50,12 @@ import java.util.Map; */ public class RMDTrackBuilderUnitTest extends BaseTest { private RMDTrackBuilder builder; + private IndexedFastaSequenceFile seq; @Before public void setup() { builder = new RMDTrackBuilder(); - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(b36KGReference)); + seq = new IndexedFastaSequenceFile(new File(b36KGReference)); GenomeLocParser.setupRefContigOrdering(seq); } @@ -67,7 +70,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { public void testBuilderIndexUnwriteable() { File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/relic.vcf"); try { - builder.loadIndex(vcfFile, new VCFCodec(), true); + builder.loadIndex(vcfFile, new VCFCodec()); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); @@ -103,7 +106,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { Index ind = null; try { - ind = builder.loadIndex(vcfFile, new VCFCodec(), true); + ind = builder.loadIndex(vcfFile, new VCFCodec()); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); @@ -130,7 +133,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { if (vcfFileIndex.exists()) vcfFileIndex.delete(); try { - builder.loadIndex(vcfFile, new VCFCodec(), true); + builder.loadIndex(vcfFile, new VCFCodec()); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); @@ -140,30 +143,19 @@ public class RMDTrackBuilderUnitTest extends BaseTest { } - // test to make sure we delete the index and regenerate if it's out of date - //@Test - public void testBuilderIndexOutOfDate() { - File vcfFile = createOutofDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); - try { - builder.loadIndex(vcfFile, new VCFCodec(), true); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("IO exception unexpected" + e.getMessage()); - } - //System.err.println("index : " + new File(vcfFile + ".idx").lastModified()); - // System.err.println("vcf : " + vcfFile.lastModified()); - - // make sure that we removed and updated the index - Assert.assertTrue("VCF file index wasn't updated", new File(vcfFile + ".idx").lastModified() > vcfFile.lastModified()); - } - - // test to make sure we delete the index and regenerate if it's out of date - // @Test - public void testBuilderIndexGoodDate() { + // test to make sure we get a full sequence dictionary from the VCF (when we set the dictionary in the builder) + @Test + public void testBuilderIndexSequenceDictionary() { File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); Long indexTimeStamp = new File(vcfFile.getAbsolutePath() + ".idx").lastModified(); try { - builder.loadIndex(vcfFile, new VCFCodec(), true); + Index idx = builder.loadIndex(vcfFile, new VCFCodec()); + RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false); + SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); + for (SAMSequenceRecord ent : seq.getSequenceDictionary().getSequences()) { + Assert.assertNotNull("Sequence missing from set dictionary: " + ent.getSequenceName(),dict.getSequence(ent.getSequenceName())); + Assert.assertEquals(ent.getSequenceLength(),dict.getSequence(ent.getSequenceName()).getSequenceLength()); + } } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); @@ -175,42 +167,6 @@ public class RMDTrackBuilderUnitTest extends BaseTest { Assert.assertTrue("Fail: index file was modified", new File(vcfFile + ".idx").lastModified() == indexTimeStamp); } - /** - * create a temporary file and an associated out of date index file - * - * @param tribbleFile the tribble file - * @return a file pointing to the new tmp location, with out of date index - */ - private File createOutofDateIndexFile(File tribbleFile) { - try { - // first copy the tribble file to a temperary file - File tmpFile = File.createTempFile("TribbleUnitTestFile", ""); - tmpFile.deleteOnExit(); - - logger.info("creating temp file " + tmpFile); - // create a fake index, before we copy so it's out of date - File tmpIndex = new File(tmpFile.getAbsolutePath() + ".idx"); - tmpIndex.deleteOnExit(); - - // sleep, to make sure the timestamps are different - Thread.sleep(2000); - - // copy the vcf (tribble) file to the tmp file location - copyFile(tribbleFile, tmpFile); - - // sleep again, to make sure the timestamps are different (vcf vrs updated index file) - Thread.sleep(2000); - - return tmpFile; - - } catch (IOException e) { - Assert.fail("Fail: Unable to create temperary file"); - } catch (InterruptedException e) { - Assert.fail("Fail: Somehow our thread got interupted"); - } - return null; - } - /** * create a temporary file and an associated out of date index file * diff --git a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java index eaa0570c5..cf243c86e 100644 --- a/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/oneoffprojects/walkers/ValidateRODForReadsIntegrationTest.java @@ -13,15 +13,15 @@ public class ValidateRODForReadsIntegrationTest extends WalkerTest { private final String vcfFile = validationDataLocation + "rodForReadsVCFCheck.vcf"; private final String dbSNPFile = GATKDataLocation + "dbsnp_129_hg18.rod"; - public static String baseTestString1KG() { - return "-T ValidateRODForReads -o %s -R testdata/exampleFASTA.fasta -U ALLOW_SEQ_DICT_INCOMPATIBILITY -I testdata/exampleBAM.bam"; + public static String baseTestString() { + return "-T ValidateRODForReads -o %s -R " + hg18Reference + " -I " + validationDataLocation + "small_bam_for_rods_for_reads.bam"; } @Test public void testSimpleVCFPileup() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:vcf,vcf " + vcfFile, 1, + baseTestString() + " -B:vcf,vcf " + vcfFile, 1, Arrays.asList("f7919e9dc156fb5d3ad0541666864ea5")); executeTest("testSimpleVCFPileup", spec); } @@ -29,7 +29,7 @@ public class ValidateRODForReadsIntegrationTest extends WalkerTest { @Test public void testSimpleDbSNPPileup() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString1KG() + " -B:dbsnp,dbsnp " + dbSNPFile, 1, + baseTestString() + " -B:dbsnp,dbsnp " + dbSNPFile, 1, Arrays.asList("c63b8ef9291a450f0519c73ac9cae189")); executeTest("testSimpleDbSNPPileup", spec); }