diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5b9ebd99b..972943e26 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -929,6 +929,14 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + /** * Returns data source object encapsulating all essential info and handlers used to traverse * reads; header merger, individual file readers etc can be accessed through the returned data source object. diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index ebb4cbe66..4ca7b935f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { else if ( stub.getOutputStream() != null ) { this.file = null; this.stream = stub.getOutputStream(); - writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes()); + writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes()); } else throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); @@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { } // The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it. - return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); + return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 936243f9d..82cb43634 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.io.stubs; +import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub, VCFWriter { return isCompressed; } + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + /** * Should we tell the VCF writer not to write genotypes? * @return true if the writer should not write genotypes. diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java index 029800aea..9e5a95d10 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/indexer/RMDIndexer.java @@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram { Index index = IndexFactory.createIndex(inputFileSource, codec, approach); // add writing of the sequence dictionary, if supplied - builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false); + builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary()); // create the output stream, and write the index LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..d133439dc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.refdata.tracks; + +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.index.Index; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.utils.SequenceDictionaryUtils; + +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 46eb79aa7..3b4558579 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; @@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - /** - * - * @author aaron + * + * @author aaron * ` * Class RMDTrackBuilder * @@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager { private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false; - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - // private sequence dictionary we use to set our tracks with private SAMSequenceDictionary dict = null; @@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager { try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } catch (ReviewedStingException e) { } - sequenceDictionary = getSequenceDictionaryFromProperties(index); + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (sequenceDictionary.size() == 0 && dict != null) { File indexFile = Tribble.indexFile(inputFile); - setIndexSequenceDictionary(inputFile,index,dict,indexFile,true); - sequenceDictionary = getSequenceDictionaryFromProperties(index); + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); } if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE ) @@ -363,96 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - setIndexSequenceDictionary(inputFile, idx, dict, null, false); + validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); return idx; } - - // --------------------------------------------------------------------------------------------------------- - // static functions to work with the sequence dictionaries of indexes - // --------------------------------------------------------------------------------------------------------- - - /** - * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - /** * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. * (that each contig in the index is in the sequence dictionary). * @param inputFile for proper error message formatting. * @param dict the sequence dictionary * @param index the index file - * @param indexFile the index file - * @param rewriteIndex should we rewrite the index when we're done? - * */ - public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { - if (dict == null) return; - - SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict); + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - for (SAMSequenceRecord seq : currentDict.getSequences()) { - if (dict.getSequence(seq.getSequenceName()) == null) - continue; - index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); - } - // re-write the index - if (rewriteIndex) try { - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); - } + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); } - public void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - index.addProperty(contig,length); - } - } - - public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); - } + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index c88c7c3c4..10261112c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.samtools.SAMSequenceDictionary; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -77,6 +78,15 @@ public abstract class Walker { return toolkit; } + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 1c76a21ea..a932d44ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(metaData, samples); - writer = new StandardVCFWriter(file, false); + writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false); writer.writeHeader(vcfHeader); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index 1fefd20fc..fa5093839 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker { hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); vcfWriter1.writeHeader(new VCFHeader(hInfo, samples)); - vcfWriter2 = new StandardVCFWriter(file2, true); + vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true); vcfWriter2.writeHeader(new VCFHeader(hInfo, samples)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java index 4ae87ddcb..71ec4ce1b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java @@ -24,6 +24,9 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; import org.broad.tribble.index.DynamicIndexCreator; @@ -31,7 +34,9 @@ import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import org.broad.tribble.util.LittleEndianOutputStream; import org.broad.tribble.util.PositionalStream; +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.*; @@ -41,21 +46,24 @@ import java.io.*; */ public abstract class IndexingVCFWriter implements VCFWriter { final private String name; + private final SAMSequenceDictionary refDict; - private File indexFile = null; private OutputStream outputStream; private PositionalStream positionalStream = null; private DynamicIndexCreator indexer = null; private LittleEndianOutputStream idxStream = null; - protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) { + @Requires({"name != null", + "! ( location == null && output == null )", + "! ( enableOnTheFlyIndexing && location == null )"}) + protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) { outputStream = output; this.name = name; + this.refDict = refDict; if ( enableOnTheFlyIndexing ) { - indexFile = Tribble.indexFile(location); try { - idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location))); //System.out.println("Creating index on the fly for " + location); indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); indexer.initialize(location, indexer.defaultBinSize()); @@ -66,15 +74,16 @@ public abstract class IndexingVCFWriter implements VCFWriter { idxStream = null; indexer = null; positionalStream = null; - indexFile = null; } } } + @Ensures("result != null") public OutputStream getOutputStream() { return outputStream; } + @Ensures("result != null") public String getStreamName() { return name; } @@ -89,6 +98,7 @@ public abstract class IndexingVCFWriter implements VCFWriter { if ( indexer != null ) { try { Index index = indexer.finalizeIndex(positionalStream.getPosition()); + IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict); index.write(idxStream); idxStream.close(); } catch (IOException e) { @@ -108,15 +118,27 @@ public abstract class IndexingVCFWriter implements VCFWriter { indexer.addFeature(vc, positionalStream.getPosition()); } - protected static final String writerName(File location, OutputStream stream) { + /** + * Returns a reasonable "name" for this writer, to display to the user if something goes wrong + * + * @param location + * @param stream + * @return + */ + protected static final String writerName(final File location, final OutputStream stream) { return location == null ? stream.toString() : location.getAbsolutePath(); } - protected static OutputStream openOutputStream(File location) { + /** + * Returns a output stream writing to location, or throws a UserException if this fails + * @param location + * @return + */ + protected static OutputStream openOutputStream(final File location) { try { return new FileOutputStream(location); } catch (FileNotFoundException e) { - throw new ReviewedStingException("Unable to create VCF file at location: " + location, e); + throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 7cba5fc3e..0da7a100f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.TribbleException; import org.broad.tribble.index.DynamicIndexCreator; @@ -62,21 +63,12 @@ public class StandardVCFWriter extends IndexingVCFWriter { * * @param location the file location to write to */ - public StandardVCFWriter(File location) { - this(location, openOutputStream(location), true, false); + public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) { + this(location, openOutputStream(location), refDict, true, false); } - public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) { - this(location, openOutputStream(location), enableOnTheFlyIndexing, false); - } - - /** - * create a VCF writer, given a stream to write to - * - * @param output the file location to write to - */ - public StandardVCFWriter(OutputStream output) { - this(output, false); + public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) { + this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false); } /** @@ -85,12 +77,12 @@ public class StandardVCFWriter extends IndexingVCFWriter { * @param output the file location to write to * @param doNotWriteGenotypes do not write genotypes */ - public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) { - this(null, output, false, doNotWriteGenotypes); + public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) { + this(null, output, refDict, false, doNotWriteGenotypes); } - public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - super(writerName(location, output), location, output, enableOnTheFlyIndexing); + public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size this.doNotWriteGenotypes = doNotWriteGenotypes; } diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java index 7ff6e27a2..18fae18c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils.gcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -52,8 +53,8 @@ public class GCFWriter extends IndexingVCFWriter { // // -------------------------------------------------------------------------------- - public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { - super(writerName(location, null), location, null, enableOnTheFlyIndexing); + public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing); this.location = location; this.skipGenotypes = doNotWriteGenotypes; diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 386c17659..a1817e3c7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest { Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) { + if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", indexFromOutputFile.getProperties(), dynamicIndex.getProperties())); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index ae218e898..724c343e4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -45,7 +44,6 @@ import org.testng.annotations.Test; import java.io.*; import java.nio.channels.FileChannel; -import java.util.Map; /** @@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { try { Index idx = builder.loadIndex(vcfFile, new VCFCodec()); // catch any exception; this call should pass correctly - SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); + SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 1809ab778..55bd4783b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -1,27 +1,45 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.*; import org.broad.tribble.iterators.CloseableTribbleIterator; import org.broad.tribble.source.BasicFeatureSource; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; +import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; /** * tests out the various functions in the index factory class */ -public class IndexFactoryUnitTest { +public class IndexFactoryUnitTest extends BaseTest { File inputFile = new File("public/testdata/HiSeq.10000.vcf"); File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); + private SAMSequenceDictionary dict; + + @BeforeTest + public void setup() { + try { + dict = new CachingIndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(b37KGReference,ex); + } + } + // // test out scoring the indexes // @@ -37,7 +55,7 @@ public class IndexFactoryUnitTest { BasicFeatureSource source = new BasicFeatureSource(inputFile.getAbsolutePath(), indexFromInputFile, new VCFCodec()); int counter = 0; - VCFWriter writer = new StandardVCFWriter(outputFile); + VCFWriter writer = new StandardVCFWriter(outputFile, dict); writer.writeHeader((VCFHeader)source.getHeader()); CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index e3a926fb9..a8e6593b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -38,12 +38,13 @@ public class VCFWriterUnitTest extends BaseTest { private Set additionalColumns = new HashSet(); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; @BeforeClass public void beforeTests() { File referenceFile = new File(hg18Reference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); } catch(FileNotFoundException ex) { @@ -55,7 +56,7 @@ public class VCFWriterUnitTest extends BaseTest { @Test public void testBasicWriteAndRead() { VCFHeader header = createFakeHeader(metaData,additionalColumns); - VCFWriter writer = new StandardVCFWriter(fakeVCFFile); + VCFWriter writer = new StandardVCFWriter(fakeVCFFile, seq.getSequenceDictionary()); writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header));