Fixed long-standing bug in Tribble index creation
-- Previously, on-the-fly indices didn't have the sequence dictionary set when they were first created, so the GATK would read the index, add the dictionary, and rewrite the index. This is now fixed, so that the on-the-fly index contains the reference dictionary when first written, avoiding the unnecessary read and rewrite -- Added a GenomeAnalysisEngine and Walker function called getMasterSequenceDictionary() that fetches the reference sequence dictionary. This can be used conveniently everywhere, and is what's written into the Tribble index -- Refactored Tribble index utilities from RMDTrackBuilder into IndexDictionaryUtils -- VCFWriter now requires the master sequence dictionary -- Updated walkers that create VCFWriters to provide the master sequence dictionary
This commit is contained in:
parent
230e16d7c0
commit
b7511c5ff3
|
|
@ -929,6 +929,14 @@ public class GenomeAnalysisEngine {
|
|||
return readsDataSource.getHeader(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this GATK engine instance
|
||||
* @return a never-null dictionary listing all of the contigs known to this engine instance
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getReferenceDataSource().getReference().getSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns data source object encapsulating all essential info and handlers used to traverse
|
||||
* reads; header merger, individual file readers etc can be accessed through the returned data source object.
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
else if ( stub.getOutputStream() != null ) {
|
||||
this.file = null;
|
||||
this.stream = stub.getOutputStream();
|
||||
writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes());
|
||||
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
|
||||
|
|
@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
|||
}
|
||||
|
||||
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
|
||||
return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.io.stubs;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.CommandLineExecutable;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
return isCompressed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary from the engine associated with this stub
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
public SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return engine.getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we tell the VCF writer not to write genotypes?
|
||||
* @return true if the writer should not write genotypes.
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class RMDIndexer extends CommandLineProgram {
|
|||
Index index = IndexFactory.createIndex(inputFileSource, codec, approach);
|
||||
|
||||
// add writing of the sequence dictionary, if supplied
|
||||
builder.setIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary(), indexFile, false);
|
||||
builder.validateAndUpdateIndexSequenceDictionary(inputFileSource, index, ref.getSequenceDictionary());
|
||||
|
||||
// create the output stream, and write the index
|
||||
LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Utilities for working with Sequence Dictionaries embedded in tribble indices
|
||||
*
|
||||
* @author Your Name
|
||||
* @since Date created
|
||||
*/
|
||||
public class IndexDictionaryUtils {
|
||||
private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class);
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
|
||||
for ( SAMSequenceRecord seq : dict.getSequences() ) {
|
||||
final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName();
|
||||
final String length = String.valueOf(seq.getSequenceLength());
|
||||
index.addProperty(contig,length);
|
||||
}
|
||||
}
|
||||
|
||||
public static void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict,
|
||||
final ValidationExclusion.TYPE validationExclusionType ) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -25,7 +25,6 @@
|
|||
package org.broadinstitute.sting.gatk.refdata.tracks;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
|
|
@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SequenceDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* @author aaron
|
||||
* `
|
||||
* Class RMDTrackBuilder
|
||||
*
|
||||
|
|
@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class);
|
||||
public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false;
|
||||
|
||||
// a constant we use for marking sequence dictionary entries in the Tribble index property list
|
||||
public static final String SequenceDictionaryPropertyPredicate = "DICT:";
|
||||
|
||||
// private sequence dictionary we use to set our tracks with
|
||||
private SAMSequenceDictionary dict = null;
|
||||
|
||||
|
|
@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); }
|
||||
catch (ReviewedStingException e) { }
|
||||
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
|
||||
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
|
||||
if (sequenceDictionary.size() == 0 && dict != null) {
|
||||
File indexFile = Tribble.indexFile(inputFile);
|
||||
setIndexSequenceDictionary(inputFile,index,dict,indexFile,true);
|
||||
sequenceDictionary = getSequenceDictionaryFromProperties(index);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, index, dict);
|
||||
try { // re-write the index
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
|
||||
sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index);
|
||||
}
|
||||
|
||||
if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE )
|
||||
|
|
@ -363,96 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
|
|||
// this can take a while, let them know what we're doing
|
||||
logger.info("Creating Tribble index in memory for file " + inputFile);
|
||||
Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
setIndexSequenceDictionary(inputFile, idx, dict, null, false);
|
||||
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict);
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
// static functions to work with the sequence dictionaries of indexes
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index
|
||||
* @param index the index file to use
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
*/
|
||||
public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) {
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
for (Map.Entry<String,String> entry : index.getProperties().entrySet()) {
|
||||
if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate))
|
||||
dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()),
|
||||
Integer.valueOf(entry.getValue())));
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the sequence dictionary with the contig list; a backup approach
|
||||
* @param index the index file to use
|
||||
* @param dict the sequence dictionary to add contigs to
|
||||
* @return the filled-in sequence dictionary
|
||||
*/
|
||||
private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) {
|
||||
LinkedHashSet<String> seqNames = index.getSequenceNames();
|
||||
if (seqNames == null) {
|
||||
return dict;
|
||||
}
|
||||
for (String name : seqNames) {
|
||||
SAMSequenceRecord seq = new SAMSequenceRecord(name, 0);
|
||||
dict.addSequence(seq);
|
||||
}
|
||||
return dict;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible.
|
||||
* (that each contig in the index is in the sequence dictionary).
|
||||
* @param inputFile for proper error message formatting.
|
||||
* @param dict the sequence dictionary
|
||||
* @param index the index file
|
||||
* @param indexFile the index file
|
||||
* @param rewriteIndex should we rewrite the index when we're done?
|
||||
*
|
||||
*/
|
||||
public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) {
|
||||
if (dict == null) return;
|
||||
|
||||
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) {
|
||||
if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null");
|
||||
|
||||
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
|
||||
final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
|
||||
validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict);
|
||||
|
||||
for (SAMSequenceRecord seq : currentDict.getSequences()) {
|
||||
if (dict.getSequence(seq.getSequenceName()) == null)
|
||||
continue;
|
||||
index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength()));
|
||||
}
|
||||
// re-write the index
|
||||
if (rewriteIndex) try {
|
||||
writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile));
|
||||
} catch (IOException e) {
|
||||
logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK");
|
||||
}
|
||||
// actually update the dictionary in the index
|
||||
IndexDictionaryUtils.setIndexSequenceDictionary(index, dict);
|
||||
}
|
||||
|
||||
public void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
|
||||
for ( SAMSequenceRecord seq : dict.getSequences() ) {
|
||||
final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName();
|
||||
final String length = String.valueOf(seq.getSequenceLength());
|
||||
index.addProperty(contig,length);
|
||||
}
|
||||
}
|
||||
|
||||
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
|
||||
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation
|
||||
if (trackDict == null || trackDict.size() == 0)
|
||||
logger.info("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation");
|
||||
else {
|
||||
Set<String> trackSequences = new TreeSet<String>();
|
||||
for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences())
|
||||
trackSequences.add(dictionaryEntry.getSequenceName());
|
||||
SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict);
|
||||
}
|
||||
public void validateTrackSequenceDictionary(final String trackName,
|
||||
final SAMSequenceDictionary trackDict,
|
||||
final SAMSequenceDictionary referenceDict ) {
|
||||
IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
|
@ -77,6 +78,15 @@ public abstract class Walker<MapType, ReduceType> {
|
|||
return toolkit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the master sequence dictionary for this walker
|
||||
* @link GenomeAnalysisEngine.getMasterSequenceDictionary
|
||||
* @return
|
||||
*/
|
||||
protected SAMSequenceDictionary getMasterSequenceDictionary() {
|
||||
return getToolkit().getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* (conceptual static) method that states whether you want to see reads piling up at a locus
|
||||
* that contain a deletion at the locus.
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
|
||||
writer = new StandardVCFWriter(file, false);
|
||||
writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false);
|
||||
writer.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {
|
|||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
|
||||
vcfWriter1.writeHeader(new VCFHeader(hInfo, samples));
|
||||
vcfWriter2 = new StandardVCFWriter(file2, true);
|
||||
vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true);
|
||||
vcfWriter2.writeHeader(new VCFHeader(hInfo, samples));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,9 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
|
|
@ -31,7 +34,9 @@ import org.broad.tribble.index.Index;
|
|||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broad.tribble.util.PositionalStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.*;
|
||||
|
|
@ -41,21 +46,24 @@ import java.io.*;
|
|||
*/
|
||||
public abstract class IndexingVCFWriter implements VCFWriter {
|
||||
final private String name;
|
||||
private final SAMSequenceDictionary refDict;
|
||||
|
||||
private File indexFile = null;
|
||||
private OutputStream outputStream;
|
||||
private PositionalStream positionalStream = null;
|
||||
private DynamicIndexCreator indexer = null;
|
||||
private LittleEndianOutputStream idxStream = null;
|
||||
|
||||
protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) {
|
||||
@Requires({"name != null",
|
||||
"! ( location == null && output == null )",
|
||||
"! ( enableOnTheFlyIndexing && location == null )"})
|
||||
protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
|
||||
outputStream = output;
|
||||
this.name = name;
|
||||
this.refDict = refDict;
|
||||
|
||||
if ( enableOnTheFlyIndexing ) {
|
||||
indexFile = Tribble.indexFile(location);
|
||||
try {
|
||||
idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
|
||||
idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
|
||||
//System.out.println("Creating index on the fly for " + location);
|
||||
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
|
||||
indexer.initialize(location, indexer.defaultBinSize());
|
||||
|
|
@ -66,15 +74,16 @@ public abstract class IndexingVCFWriter implements VCFWriter {
|
|||
idxStream = null;
|
||||
indexer = null;
|
||||
positionalStream = null;
|
||||
indexFile = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public OutputStream getOutputStream() {
|
||||
return outputStream;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public String getStreamName() {
|
||||
return name;
|
||||
}
|
||||
|
|
@ -89,6 +98,7 @@ public abstract class IndexingVCFWriter implements VCFWriter {
|
|||
if ( indexer != null ) {
|
||||
try {
|
||||
Index index = indexer.finalizeIndex(positionalStream.getPosition());
|
||||
IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
|
||||
index.write(idxStream);
|
||||
idxStream.close();
|
||||
} catch (IOException e) {
|
||||
|
|
@ -108,15 +118,27 @@ public abstract class IndexingVCFWriter implements VCFWriter {
|
|||
indexer.addFeature(vc, positionalStream.getPosition());
|
||||
}
|
||||
|
||||
protected static final String writerName(File location, OutputStream stream) {
|
||||
/**
|
||||
* Returns a reasonable "name" for this writer, to display to the user if something goes wrong
|
||||
*
|
||||
* @param location
|
||||
* @param stream
|
||||
* @return
|
||||
*/
|
||||
protected static final String writerName(final File location, final OutputStream stream) {
|
||||
return location == null ? stream.toString() : location.getAbsolutePath();
|
||||
}
|
||||
|
||||
protected static OutputStream openOutputStream(File location) {
|
||||
/**
|
||||
* Returns a output stream writing to location, or throws a UserException if this fails
|
||||
* @param location
|
||||
* @return
|
||||
*/
|
||||
protected static OutputStream openOutputStream(final File location) {
|
||||
try {
|
||||
return new FileOutputStream(location);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new ReviewedStingException("Unable to create VCF file at location: " + location, e);
|
||||
throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
|
|
@ -62,21 +63,12 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
*
|
||||
* @param location the file location to write to
|
||||
*/
|
||||
public StandardVCFWriter(File location) {
|
||||
this(location, openOutputStream(location), true, false);
|
||||
public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) {
|
||||
this(location, openOutputStream(location), refDict, true, false);
|
||||
}
|
||||
|
||||
public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) {
|
||||
this(location, openOutputStream(location), enableOnTheFlyIndexing, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF writer, given a stream to write to
|
||||
*
|
||||
* @param output the file location to write to
|
||||
*/
|
||||
public StandardVCFWriter(OutputStream output) {
|
||||
this(output, false);
|
||||
public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) {
|
||||
this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -85,12 +77,12 @@ public class StandardVCFWriter extends IndexingVCFWriter {
|
|||
* @param output the file location to write to
|
||||
* @param doNotWriteGenotypes do not write genotypes
|
||||
*/
|
||||
public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
|
||||
this(null, output, false, doNotWriteGenotypes);
|
||||
public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) {
|
||||
this(null, output, refDict, false, doNotWriteGenotypes);
|
||||
}
|
||||
|
||||
public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, enableOnTheFlyIndexing);
|
||||
public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
|
||||
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.gcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -52,8 +53,8 @@ public class GCFWriter extends IndexingVCFWriter {
|
|||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, null), location, null, enableOnTheFlyIndexing);
|
||||
public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing);
|
||||
this.location = location;
|
||||
this.skipGenotypes = doNotWriteGenotypes;
|
||||
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest {
|
|||
Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec());
|
||||
Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath());
|
||||
|
||||
if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) {
|
||||
if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) {
|
||||
Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n",
|
||||
indexFromOutputFile.getProperties(),
|
||||
dynamicIndex.getProperties()));
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -45,7 +44,6 @@ import org.testng.annotations.Test;
|
|||
|
||||
import java.io.*;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
|
|||
try {
|
||||
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
|
||||
// catch any exception; this call should pass correctly
|
||||
SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx);
|
||||
SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
Assert.fail("IO exception unexpected" + e.getMessage());
|
||||
|
|
|
|||
|
|
@ -1,27 +1,45 @@
|
|||
package org.broadinstitute.sting.utils.codecs.vcf;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.index.*;
|
||||
import org.broad.tribble.iterators.CloseableTribbleIterator;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* tests out the various functions in the index factory class
|
||||
*/
|
||||
public class IndexFactoryUnitTest {
|
||||
public class IndexFactoryUnitTest extends BaseTest {
|
||||
|
||||
File inputFile = new File("public/testdata/HiSeq.10000.vcf");
|
||||
File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf");
|
||||
File outputFileIndex = Tribble.indexFile(outputFile);
|
||||
|
||||
private SAMSequenceDictionary dict;
|
||||
|
||||
@BeforeTest
|
||||
public void setup() {
|
||||
try {
|
||||
dict = new CachingIndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary();
|
||||
}
|
||||
catch(FileNotFoundException ex) {
|
||||
throw new UserException.CouldNotReadInputFile(b37KGReference,ex);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// test out scoring the indexes
|
||||
//
|
||||
|
|
@ -37,7 +55,7 @@ public class IndexFactoryUnitTest {
|
|||
BasicFeatureSource<VariantContext> source = new BasicFeatureSource<VariantContext>(inputFile.getAbsolutePath(), indexFromInputFile, new VCFCodec());
|
||||
|
||||
int counter = 0;
|
||||
VCFWriter writer = new StandardVCFWriter(outputFile);
|
||||
VCFWriter writer = new StandardVCFWriter(outputFile, dict);
|
||||
writer.writeHeader((VCFHeader)source.getHeader());
|
||||
CloseableTribbleIterator<VariantContext> it = source.iterator();
|
||||
while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) {
|
||||
|
|
|
|||
|
|
@ -38,12 +38,13 @@ public class VCFWriterUnitTest extends BaseTest {
|
|||
private Set<String> additionalColumns = new HashSet<String>();
|
||||
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private IndexedFastaSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public void beforeTests() {
|
||||
File referenceFile = new File(hg18Reference);
|
||||
try {
|
||||
IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile);
|
||||
seq = new CachingIndexedFastaSequenceFile(referenceFile);
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
catch(FileNotFoundException ex) {
|
||||
|
|
@ -55,7 +56,7 @@ public class VCFWriterUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testBasicWriteAndRead() {
|
||||
VCFHeader header = createFakeHeader(metaData,additionalColumns);
|
||||
VCFWriter writer = new StandardVCFWriter(fakeVCFFile);
|
||||
VCFWriter writer = new StandardVCFWriter(fakeVCFFile, seq.getSequenceDictionary());
|
||||
writer.writeHeader(header);
|
||||
writer.add(createVC(header));
|
||||
writer.add(createVC(header));
|
||||
|
|
|
|||
Loading…
Reference in New Issue