- Fix DepthOfCoverage so that, when it abuses the ROD system by instantiating a track in onTraversalDone, it also supplies the correct sequence dictionary and parser.

- Changed RMDTrackBuilder to use SequenceDictionaryUtils.validateDictionaries for ref <-> ROD sequence dictionary validation.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4683 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-11-15 20:34:04 +00:00
parent f8e1ea7b64
commit 5b83942cee
9 changed files with 44 additions and 33 deletions

View File

@ -329,7 +329,7 @@ public abstract class AbstractGenomeAnalysisEngine {
RMDTrackBuilder manager = new RMDTrackBuilder();
// set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference
manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser);
manager.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe);
List<RMDTrack> tracks = manager.getReferenceMetaDataSources(this,argCollection);
validateSuppliedReferenceOrderedData(tracks);

View File

@ -105,7 +105,7 @@ public class RMDIndexer extends CommandLineProgram {
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(referenceFile);
// add writing of the sequence dictionary, if supplied
RMDTrackBuilder.setIndexSequenceDictionary(index, seq.getSequenceDictionary(), indexFile, false);
builder.setIndexSequenceDictionary(inputFileSource, index, seq.getSequenceDictionary(), indexFile, false);
}
// create the output stream, and write the index

View File

@ -34,6 +34,7 @@ import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException;
@ -41,6 +42,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -85,6 +87,11 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
*/
private GenomeLocParser genomeLocParser;
/**
* Validation exclusions, for validating the sequence dictionary.
*/
private ValidationExclusion.TYPE validationExclusionType;
/** Create a new plugin manager. */
public RMDTrackBuilder() {
super(FeatureCodec.class, "Codecs", "Codec");
@ -92,21 +99,25 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
/**
* Create a new RMDTrackBuilder, with dictionary and genomeLocParser predefined.
* @param dict
* @param genomeLocParser
* @param dict Sequence dictionary to use.
* @param genomeLocParser Location parser to use.
* @param validationExclusionType Types of validations to exclude, for sequence dictionary verification.
*/
public RMDTrackBuilder(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser) {
public RMDTrackBuilder(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser, ValidationExclusion.TYPE validationExclusionType) {
super(FeatureCodec.class, "Codecs", "Codec");
setSequenceDictionary(dict,genomeLocParser);
setSequenceDictionary(dict,genomeLocParser,validationExclusionType);
}
/**
*
* @param dict the sequence dictionary to use as a reference for Tribble track contig length lookups
* Establish location-aware parsing and services for relevant reference metadata.
* @param dict Sequence dictionary to use.
* @param genomeLocParser Location parser to use.
* @param validationExclusionType Types of validations to exclude, for sequence dictionary verification.
*/
public void setSequenceDictionary(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser) {
public void setSequenceDictionary(SAMSequenceDictionary dict,GenomeLocParser genomeLocParser,ValidationExclusion.TYPE validationExclusionType) {
this.dict = dict;
this.genomeLocParser = genomeLocParser;
this.validationExclusionType = validationExclusionType;
}
/** @return a list of all available track types we currently have access to create */
@ -226,7 +237,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (dictFromIndex.size() == 0 && dict != null) {
File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(index,dict,indexFile,true);
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
dictFromIndex = getSequenceDictionaryFromProperties(index);
}
@ -366,7 +377,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile);
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
setIndexSequenceDictionary(idx, dict, null, false);
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
return idx;
}
@ -493,25 +504,23 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
/**
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
* (that each contig in the index is in the sequence dictionary).
* @param inputFile for proper error message formatting.
* @param dict the sequence dictionary
* @param index the index file
* @param indexFile the index file
* @param rewriteIndex should we rewrite the index when we're done?
*
*/
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
SequenceDictionaryUtils.validateDictionaries(logger,validationExclusionType,"GATK",dict,inputFile.getAbsolutePath(),currentDict);
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
throw new UserException.IncompatibleSequenceDictionaries("The sequence dictionary from the reference the GATK is running with is not compatible with the sequence " +
"dictionary in the Tribble file " + indexFile + ". It doesn't contain the contig: " + seq.getSequenceName(),
"RMD Sequence Dictionary",
currentDict,
"Reference Sequence Dictionary",
dict);
continue;
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
}
// re-write the index

View File

@ -403,7 +403,9 @@ public class DepthOfCoverageWalker extends LocusWalker<Map<DoCOutputType.Partiti
}
private LocationAwareSeekableRODIterator initializeRefSeq() {
RMDTrackBuilder builder = new RMDTrackBuilder();
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),
getToolkit().getGenomeLocParser(),
getToolkit().getArguments().unsafe);
FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,refSeqGeneList).first;
try {
return new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),

View File

@ -86,7 +86,7 @@ public class PickSequenomProbes extends RodWalker<String, String> {
logger.info("Loading SNP mask... ");
ReferenceOrderedData snp_mask;
if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) {
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser());
RMDTrackBuilder builder = new RMDTrackBuilder(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),getToolkit().getArguments().unsafe);
CloseableIterator<GATKFeature> iter = builder.createInstanceOfTrack(DbSNPCodec.class,"snp_mask",new java.io.File(SNP_MASK)).getIterator();
snpMaskIterator = new SeekableRODIterator(getToolkit().getReferenceDataSource().getReference().getSequenceDictionary(),getToolkit().getGenomeLocParser(),iter);

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -66,7 +65,7 @@ public class SequenceDictionaryUtils {
private static SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373);
private static SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747);
public enum SequenceDictionaryCompatability {
public enum SequenceDictionaryCompatibility {
IDENTICAL, // the dictionaries are identical
COMMON_SUBSET, // there exists a common subset of equivalent contigs
NO_COMMON_CONTIGS, // no overlap between dictionaries
@ -96,7 +95,7 @@ public class SequenceDictionaryUtils {
* @param dict2 the sequence dictionary dict2
*/
public static void validateDictionaries(Logger logger, ValidationExclusion.TYPE validationExclusion, String name1, SAMSequenceDictionary dict1, String name2, SAMSequenceDictionary dict2) {
SequenceDictionaryCompatability type = compareDictionaries(dict1, dict2);
SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2);
switch ( type ) {
case IDENTICAL:
return;
@ -155,23 +154,23 @@ public class SequenceDictionaryUtils {
* @param dict2
* @return
*/
public static SequenceDictionaryCompatability compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
public static SequenceDictionaryCompatibility compareDictionaries(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) {
// If there's no overlap between reads and reference, data will be bogus. Throw an exception.
if ( nonCanonicalHumanContigOrder( dict1 ) || nonCanonicalHumanContigOrder(dict2) )
return SequenceDictionaryCompatability.NON_CANONICAL_HUMAN_ORDER;
return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER;
Set<String> commonContigs = getCommonContigsByName(dict1, dict2);
if (commonContigs.size() == 0)
return SequenceDictionaryCompatability.NO_COMMON_CONTIGS;
return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS;
else if ( ! commonContigsAreEquivalent( commonContigs, dict1, dict2 ) )
return SequenceDictionaryCompatability.UNEQUAL_COMMON_CONTIGS;
return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS;
else if ( ! commonContigsAreInOrder( commonContigs, dict1, dict2 ) )
return SequenceDictionaryCompatability.OUT_OF_ORDER;
return SequenceDictionaryCompatibility.OUT_OF_ORDER;
else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() )
return SequenceDictionaryCompatability.IDENTICAL;
return SequenceDictionaryCompatibility.IDENTICAL;
else {
return SequenceDictionaryCompatability.COMMON_SUBSET;
return SequenceDictionaryCompatibility.COMMON_SUBSET;
}
}

View File

@ -56,7 +56,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest {
seq = new IndexedFastaSequenceFile(new File(hg18Reference));
genomeLocParser = new GenomeLocParser(seq);
builder = new RMDTrackBuilder();
builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser);
builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null);
}
/**

View File

@ -61,7 +61,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest {
public void setUp() {
File file = new File(testDir + "TabularDataTest.dat");
RMDTrackBuilder builder = new RMDTrackBuilder();
builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser);
builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null);
rod = builder.createInstanceOfTrack(TableCodec.class, "tableTest", file);
}

View File

@ -29,6 +29,7 @@ import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.testng.Assert;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -152,8 +153,8 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf"));
Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified();
try {
builder.setSequenceDictionary(seq.getSequenceDictionary(),genomeLocParser,null);
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false);
// catch any exception; this call should pass correctly
SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx);
} catch (IOException e) {