diff --git a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java index 0c7e29304..e2ff2b68f 100755 --- a/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/AbstractGenomeAnalysisEngine.java @@ -329,7 +329,7 @@ public abstract class AbstractGenomeAnalysisEngine { RMDTrackBuilder manager = new RMDTrackBuilder(); // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser); + manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); List tracks = manager.getReferenceMetaDataSources(this,argCollection); validateSuppliedReferenceOrderedData(tracks); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index 9b8fff5c4..bfc6c2b18 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -105,7 +105,7 @@ public class RMDIndexer extends CommandLineProgram { IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(referenceFile); // add writing of the sequence dictionary, if supplied - RMDTrackBuilder.setIndexSequenceDictionary(index, seq.getSequenceDictionary(), indexFile, false); + builder.setIndexSequenceDictionary(inputFileSource, index, seq.getSequenceDictionary(), indexFile, false); } // create the output stream, and write the index diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index fb68a8e37..24d36af36 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -34,6 +34,7 @@ import org.broad.tribble.index.IndexFactory; import org.broad.tribble.source.BasicFeatureSource; import org.broad.tribble.util.LittleEndianOutputStream; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; @@ -41,6 +42,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -85,6 +87,11 @@ public class RMDTrackBuilder extends PluginManager { */ private GenomeLocParser genomeLocParser; + /** + * Validation exclusions, for validating the sequence dictionary. + */ + private ValidationExclusion.TYPE validationExclusionType; + /** Create a new plugin manager. */ public RMDTrackBuilder() { super(FeatureCodec.class, "Codecs", "Codec"); @@ -92,21 +99,25 @@ public class RMDTrackBuilder extends PluginManager { /** * Create a new RMDTrackBuilder, with dictionary and genomeLocParser predefined. - * @param dict - * @param genomeLocParser + * @param dict Sequence dictionary to use. + * @param genomeLocParser Location parser to use. + * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. */ - public RMDTrackBuilder(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser) { + public RMDTrackBuilder(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) { super(FeatureCodec.class, "Codecs", "Codec"); - setSequenceDictionary(dict,genomeLocParser); + setSequenceDictionary(dict,genomeLocParser,validationExclusionType); } /** - * - * @param dict the sequence dictionary to use as a reference for Tribble track contig length lookups + * Establish location-aware parsing and services for relevant reference metadata. + * @param dict Sequence dictionary to use. + * @param genomeLocParser Location parser to use. + * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. */ - public void setSequenceDictionary(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser) { + public void setSequenceDictionary(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser,ValidationExclusion.TYPE validationExclusionType) { this.dict = dict; this.genomeLocParser = genomeLocParser; + this.validationExclusionType = validationExclusionType; } /** @return a list of all available track types we currently have access to create */ @@ -226,7 +237,7 @@ public class RMDTrackBuilder extends PluginManager { // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (dictFromIndex.size() == 0 && dict != null) { File indexFile = Tribble.indexFile(inputFile); - setIndexSequenceDictionary(index,dict,indexFile,true); + setIndexSequenceDictionary(inputFile,index,dict,indexFile,true); dictFromIndex = getSequenceDictionaryFromProperties(index); } @@ -366,7 +377,7 @@ public class RMDTrackBuilder extends PluginManager { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - setIndexSequenceDictionary(idx, dict, null, false); + setIndexSequenceDictionary(inputFile, idx, dict, null, false); return idx; } @@ -493,25 +504,23 @@ public class RMDTrackBuilder extends PluginManager { /** * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. * (that each contig in the index is in the sequence dictionary). + * @param inputFile for proper error message formatting. * @param dict the sequence dictionary * @param index the index file * @param indexFile the index file * @param rewriteIndex should we rewrite the index when we're done? * */ - public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { + public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { if (dict == null) return; SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + SequenceDictionaryUtils.validateDictionaries(logger,validationExclusionType,"GATK",dict,inputFile.getAbsolutePath(),currentDict); + // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set for (SAMSequenceRecord seq : currentDict.getSequences()) { if (dict.getSequence(seq.getSequenceName()) == null) - throw new UserException.IncompatibleSequenceDictionaries("The sequence dictionary from the reference the GATK is running with is not compatible with the sequence " + - "dictionary in the Tribble file " + indexFile + ". It doesn't contain the contig: " + seq.getSequenceName(), - "RMD Sequence Dictionary", - currentDict, - "Reference Sequence Dictionary", - dict); + continue; index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); } // re-write the index diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 08b2dcc55..56443415b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -403,7 +403,9 @@ public class DepthOfCoverageWalker extends LocusWalker { logger.info("Loading SNP mask... "); ReferenceOrderedData snp_mask; if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) { - RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser()); + RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe); CloseableIterator iter = builder.createInstanceOfTrack(DbSNPCodec.class,"snp_mask",new java.io.File(SNP_MASK)).getIterator(); snpMaskIterator = new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),iter); diff --git a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java index 0a1b088e7..fc28d6c26 100755 --- a/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java +++ b/java/src/org/broadinstitute/sting/utils/SequenceDictionaryUtils.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -66,7 +65,7 @@ public class SequenceDictionaryUtils { private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); - public enum SequenceDictionaryCompatability { + public enum SequenceDictionaryCompatibility { IDENTICAL, // the dictionaries are identical COMMON_SUBSET, // there exists a common subset of equivalent contigs NO_COMMON_CONTIGS, // no overlap between dictionaries @@ -96,7 +95,7 @@ public class SequenceDictionaryUtils { * @param dict2 the sequence dictionary dict2 */ public static void validateDictionaries(Logger logger, ValidationExclusion.TYPE validationExclusion, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) { - SequenceDictionaryCompatability type = compareDictionaries(dict1, dict2); + SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2); switch ( type ) { case IDENTICAL: return; @@ -155,23 +154,23 @@ public class SequenceDictionaryUtils { * @param dict2 * @return */ - public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + public static SequenceDictionaryCompatibility compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { // If there's no overlap between reads and reference, data will be bogus. Throw an exception. if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) ) - return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER; + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; Set commonContigs = getCommonContigsByName(dict1, dict2); if (commonContigs.size() == 0) - return SequenceDictionaryCompatability.NO_COMMON_CONTIGS; + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) ) - return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS; + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) ) - return SequenceDictionaryCompatability.OUT_OF_ORDER; + return SequenceDictionaryCompatibility.OUT_OF_ORDER; else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) - return SequenceDictionaryCompatability.IDENTICAL; + return SequenceDictionaryCompatibility.IDENTICAL; else { - return SequenceDictionaryCompatability.COMMON_SUBSET; + return SequenceDictionaryCompatibility.COMMON_SUBSET; } } diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index 8227435ae..5ba5cc034 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -56,7 +56,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { seq = new IndexedFastaSequenceFile(new File(hg18Reference)); genomeLocParser = new GenomeLocParser(seq); builder = new RMDTrackBuilder(); - builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser); + builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null); } /** diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java index 837e3735e..3652ef3ef 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java @@ -61,7 +61,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { public void setUp() { File file = new File(testDir + "TabularDataTest.dat"); RMDTrackBuilder builder = new RMDTrackBuilder(); - builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser); + builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null); rod = builder.createInstanceOfTrack(TableCodec.class, "tableTest", file); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java index 0e5744697..c62175182 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java @@ -29,6 +29,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -152,8 +153,8 @@ public class RMDTrackBuilderUnitTest extends BaseTest { File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified(); try { + builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null); Index idx = builder.loadIndex(vcfFile, new VCFCodec()); - RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false); // catch any exception; this call should pass correctly SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); } catch (IOException e) {