diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 71968fdb0..883dde665 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -334,8 +334,6 @@ public class GenomeAnalysisEngine { validateSuppliedReferenceAgainstWalker(my_walker, argCollection); referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); - validateReadsAndReferenceAreCompatible(readsDataSource, referenceDataSource); - // // please don't use these in the future, use the new syntax <- if we're not using these please remove them // @@ -353,6 +351,9 @@ public class GenomeAnalysisEngine { List tracks = manager.getReferenceMetaDataSources(argCollection.RODBindings); validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks); + // validate all the sequence dictionaries against the reference + validateSourcesAgainstReference(readsDataSource, referenceDataSource, tracks); + rodDataSources = getReferenceOrderedDataSources(my_walker, tracks); } @@ -604,22 +605,16 @@ public class GenomeAnalysisEngine { } /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference. + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). * * @param reads Reads data source. * @param reference Reference data source. + * @param tracks a collection of the reference ordered data tracks */ - private void validateReadsAndReferenceAreCompatible(SAMDataSource reads, ReferenceSequenceFile reference) { - if (reads == null || reference == null) + private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection tracks) { + if ((reads == null && (tracks == null || tracks.isEmpty())) || reference == null ) return; - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - Set readsSequenceNames = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) - readsSequenceNames.add(dictionaryEntry.getSequenceName()); - // Compile a set of sequence names that exist in the reference file. SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); @@ -627,32 +622,70 @@ public class GenomeAnalysisEngine { for (SAMSequenceRecord dictionaryEntry : referenceDictionary.getSequences()) referenceSequenceNames.add(dictionaryEntry.getSequenceName()); - if (readsSequenceNames.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; + + if (reads != null) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + Set readsSequenceNames = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : readsDictionary.getSequences()) + readsSequenceNames.add(dictionaryEntry.getSequenceName()); + + + if (readsSequenceNames.size() == 0) { + logger.info("Reads file is unmapped. Skipping validation against reference."); + return; + } + + // compare the reads to the reference + compareTwoDictionaries("reads", readsDictionary, readsSequenceNames, referenceDictionary, referenceSequenceNames); } + // compare the tracks to the reference, if they have a sequence dictionary + for (RMDTrack track : tracks) { + SAMSequenceDictionary trackDict = track.getSequenceDictionary(); + if (trackDict == null) { + logger.info("Track " + track.getName() + "doesn't have a sequence dictionary built in, skipping dictionary validation"); + continue; + } + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + compareTwoDictionaries(track.getName(), trackDict, trackSequences, referenceDictionary, referenceSequenceNames); + } + + } + + /** + * compare two dictionaries, warning if one isn't a subset of the other, or erroring out if they have no overlap + * @param compareToName the name of the track or bam (used in the output to the user) + * @param comparedToDictionary the dictionary to compare to + * @param compareToSequenceNames the unique sequence names in the compared to dictionary + * @param referenceDictionary the reference dictionary + * @param referenceSequenceNames the reference unique sequence names + */ + private void compareTwoDictionaries(String compareToName, SAMSequenceDictionary comparedToDictionary, Set compareToSequenceNames, SAMSequenceDictionary referenceDictionary, Set referenceSequenceNames) { // If there's no overlap between reads and reference, data will be bogus. Throw an exception. - Set intersectingSequenceNames = new HashSet(readsSequenceNames); + Set intersectingSequenceNames = new HashSet(compareToSequenceNames); intersectingSequenceNames.retainAll(referenceSequenceNames); if (intersectingSequenceNames.size() == 0) { StringBuilder error = new StringBuilder(); - error.append("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); + error.append("No overlap exists between sequence dictionary of the " + compareToName + " and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); error.append(System.getProperty("line.separator")); - error.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary))); + error.append(String.format(compareToName + " contigs: %s%n", prettyPrintSequenceRecords(comparedToDictionary))); error.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary))); logger.error(error.toString()); - Utils.scareUser("No overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference."); + Utils.scareUser("No overlap exists between sequence dictionary of " + compareToName + " and the sequence dictionary of the reference."); } // If the two datasets are not equal and neither is a strict subset of the other, warn the user. - if (!readsSequenceNames.equals(referenceSequenceNames) && - !readsSequenceNames.containsAll(referenceSequenceNames) && - !referenceSequenceNames.containsAll(readsSequenceNames)) { + if (!compareToSequenceNames.equals(referenceSequenceNames) && + !compareToSequenceNames.containsAll(referenceSequenceNames) && + !referenceSequenceNames.containsAll(compareToSequenceNames)) { StringBuilder warning = new StringBuilder(); - warning.append("Limited overlap exists between sequence dictionary of the reads and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); + warning.append("Limited overlap exists between sequence dictionary of the " + compareToName + " and the sequence dictionary of the reference. Perhaps you're using the wrong reference?\n"); warning.append(System.getProperty("line.separator")); - warning.append(String.format("Reads contigs: %s%n", prettyPrintSequenceRecords(readsDictionary))); + warning.append(String.format(compareToName + " contigs: %s%n", prettyPrintSequenceRecords(comparedToDictionary))); warning.append(String.format("Reference contigs: %s%n", prettyPrintSequenceRecords(referenceDictionary))); logger.warn(warning.toString()); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index cd2880f91..fe56bca0b 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -3,7 +3,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broad.tribble.FeatureReader; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.FeatureReaderTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; @@ -15,8 +15,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; import java.io.IOException; -import java.util.Arrays; -import java.util.LinkedList; import java.util.List; /** * User: hanna @@ -52,7 +50,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { public ReferenceOrderedDataSource( Walker walker, RMDTrack rod) { this.rod = rod; if (rod.supportsQuery()) - iteratorPool = new ReferenceOrderedQueryDataPool(new TribbleRMDTrackBuilder(), (FeatureReaderTrack)rod); + iteratorPool = new ReferenceOrderedQueryDataPool(new TribbleRMDTrackBuilder(), (TribbleTrack)rod); else iteratorPool = new ReferenceOrderedDataPool( walker, rod ); } @@ -187,7 +185,7 @@ class ReferenceOrderedQueryDataPool extends ResourcePool implemen @Override public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { // return a feature reader track - return new FeatureReaderTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, createFeatureReader(targetClass, inputFile)); + Pair pair = createFeatureReader(targetClass, inputFile); + if (pair == null) throw new StingException("Unable to make the feature reader for input file " + inputFile); + return new TribbleTrack(targetClass, this.createByType(targetClass).getFeatureType(), name, inputFile, pair.first, pair.second); } /** @@ -103,13 +106,13 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen * @param inputFile the input file to create the track from (of the codec type) * @return the FeatureReader instance */ - public FeatureReader createFeatureReader(Class targetClass, File inputFile) { - FeatureReader reader = null; + public Pair createFeatureReader(Class targetClass, File inputFile) { + Pair pair = null; if (inputFile.getAbsolutePath().endsWith(".gz")) - reader = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile); + pair = createBasicFeatureReaderNoAssumedIndex(targetClass, inputFile); else - reader = getLinearFeatureReader(targetClass, inputFile); - return reader; + pair = getLinearFeatureReader(targetClass, inputFile); + return pair; } /** @@ -121,11 +124,11 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen * @param inputFile the file to load * @return a feature reader implementation */ - private BasicFeatureReader createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) { + private Pair createBasicFeatureReaderNoAssumedIndex(Class targetClass, File inputFile) { // we might not know the index type, try loading with the default reader constructor - logger.debug("Attempting to blindly load " + inputFile); + logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); try { - return new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)); + return new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(),this.createByType(targetClass)),null); } catch (IOException e) { throw new StingException("Unable to create feature reader from file " + inputFile); } @@ -137,11 +140,11 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen * @param inputFile the tribble file to parse * @return the input file as a FeatureReader */ - private FeatureReader getLinearFeatureReader(Class targetClass, File inputFile) { - FeatureReader reader; + private Pair getLinearFeatureReader(Class targetClass, File inputFile) { + Pair reader; try { Index index = loadIndex(inputFile, this.createByType(targetClass), true); - reader = new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)); + reader = new Pair(new BasicFeatureReader(inputFile.getAbsolutePath(), index, this.createByType(targetClass)),index.getSequenceDictionary()); } catch (FileNotFoundException e) { throw new StingException("Unable to create reader with file " + inputFile, e); } catch (IOException e) { @@ -160,9 +163,6 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen */ public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { - // our return index - LinearIndex returnIndex = null; - // create the index file name, locking on the index file name File indexFile = new File(inputFile.getAbsoluteFile() + linearIndexExtension); FSLock lock = new FSLock(indexFile); @@ -178,7 +178,12 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen // if the file exists, and we can read it, load the index from disk (i.e. wasn't deleted in the last step). if (indexFile.exists() && indexFile.canRead() && obtainedLock) { logger.info("Loading Tribble index from disk for file " + inputFile); - return LinearIndex.createIndex(indexFile); + Index index = LinearIndex.createIndex(indexFile); + if (index.isCurrentVersion()) + return index; + + logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file"); + indexFile.delete(); } return writeIndexToDisk(inputFile, codec, onDisk, indexFile, obtainedLock); } @@ -200,6 +205,9 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen */ private static LinearIndex writeIndexToDisk(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, boolean obtainedLock) throws IOException { LinearIndexCreator create = new LinearIndexCreator(inputFile, codec); + + // this can take a while, let them know what we're doing + logger.info("Creating Tribble index in memory for file " + inputFile); LinearIndex index = create.createIndex(null); // we don't want to write initially, so we pass in null // if the index doesn't exist, and we can write to the directory, and we got a lock: write to the disk @@ -207,7 +215,7 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen (!indexFile.exists() || indexFile.canWrite()) && onDisk && obtainedLock) { - logger.info("Creating Tribble Index on disk for file " + inputFile); + logger.info("Writing Tribble index to disk for file " + inputFile); index.write(indexFile); return index; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java index b9ed2cf3b..a7ca7b5ba 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java @@ -46,7 +46,7 @@ public class DbSNPWindowCounter extends LocusWalker { public void initialize() { TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); - reader = builder.createFeatureReader(DbSNPCodec.class,myDbSNPFile); + reader = builder.createFeatureReader(DbSNPCodec.class,myDbSNPFile).first; } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java index c7a09d7e2..ad812b7e4 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java @@ -71,7 +71,7 @@ public class RMDTrackManagerUnitTest extends BaseTest { int count = 0; Iterator fIter; try { - fIter = ((FeatureReaderTrack) t).query("1", 1, 5000); + fIter = ((TribbleTrack) t).query("1", 1, 5000); } catch (IOException e) { throw new StingException("blah I/O exception"); } @@ -126,7 +126,7 @@ public class RMDTrackManagerUnitTest extends BaseTest { long firstTime = System.currentTimeMillis(); long count = 0; try { - fIter = ((FeatureReaderTrack) t).query("1", x, x + intervalSize); + fIter = ((TribbleTrack) t).query("1", x, x + intervalSize); } catch (IOException e) { throw new StingException("blah I/O exception"); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java index 75bfb301a..63c22dd37 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java @@ -70,7 +70,8 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest { Assert.fail("IO exception unexpected" + e.getMessage()); } // make sure we didn't write the file (check that it's timestamp is within bounds) - Assert.assertTrue(Math.abs(1274210993000l - new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()) < 100); + //System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()); + Assert.assertTrue(Math.abs(1275597793000l - new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()) < 100); } diff --git a/settings/repository/org.broad/tribble-87.xml b/settings/repository/org.broad/tribble-87.xml deleted file mode 100644 index bb714a30d..000000000 --- a/settings/repository/org.broad/tribble-87.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/settings/repository/org.broad/tribble-87.jar b/settings/repository/org.broad/tribble-88.jar similarity index 89% rename from settings/repository/org.broad/tribble-87.jar rename to settings/repository/org.broad/tribble-88.jar index 35d047b58..6786642fd 100644 Binary files a/settings/repository/org.broad/tribble-87.jar and b/settings/repository/org.broad/tribble-88.jar differ diff --git a/settings/repository/org.broad/tribble-88.xml b/settings/repository/org.broad/tribble-88.xml new file mode 100644 index 000000000..1c14d8043 --- /dev/null +++ b/settings/repository/org.broad/tribble-88.xml @@ -0,0 +1,3 @@ + + +