diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 219fc7463..3dd7c533e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -1,3 +1,27 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.io.storage; import net.sf.samtools.util.BlockCompressedOutputStream; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 4a5032936..e33ff6f30 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -46,7 +46,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; @Invariant({ "logger != null", "contigInfo != null"}) -public class GenomeLocParser { +public final class GenomeLocParser { private static Logger logger = Logger.getLogger(GenomeLocParser.class); // -------------------------------------------------------------------------------------------------------------- @@ -86,12 +86,12 @@ public class GenomeLocParser { } @Requires("contig != null") - public synchronized boolean hasContig(final String contig) { + public final synchronized boolean hasContig(final String contig) { return contig.equals(lastContig) || dict.getSequence(contig) != null; } @Requires("index >= 0") - public synchronized boolean hasContig(final int index) { + public final synchronized boolean hasContig(final int index) { return lastIndex == index || dict.getSequence(index) != null; } @@ -125,12 +125,12 @@ public class GenomeLocParser { } @Requires({"contig != null", "lastContig != null"}) - private synchronized boolean isCached(final String contig) { + private final synchronized boolean isCached(final String contig) { return lastContig.equals(contig); } @Requires({"lastIndex != -1", "index >= 0"}) - private synchronized boolean isCached(final int index) { + private final synchronized boolean isCached(final int index) { return lastIndex == index; } @@ -144,7 +144,7 @@ public class GenomeLocParser { */ @Requires("contig != null || index >= 0") @Ensures("result != null") - private synchronized SAMSequenceRecord updateCache(final String contig, int index ) { + private final synchronized SAMSequenceRecord updateCache(final String contig, int index ) { SAMSequenceRecord rec = contig == null ? dict.getSequence(index) : dict.getSequence(contig); if ( rec == null ) { throw new ReviewedStingException("BUG: requested unknown contig=" + contig + " index=" + index); @@ -187,11 +187,11 @@ public class GenomeLocParser { * * @return True if the contig is valid. False otherwise. */ - public boolean contigIsInDictionary(String contig) { + public final boolean contigIsInDictionary(String contig) { return contig != null && contigInfo.hasContig(contig); } - public boolean indexIsInDictionary(final int index) { + public final boolean indexIsInDictionary(final int index) { return index >= 0 && contigInfo.hasContig(index); } @@ -205,7 +205,7 @@ public class GenomeLocParser { */ @Ensures("result != null") @ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"}) - public SAMSequenceRecord getContigInfo(final String contig) { + public final SAMSequenceRecord getContigInfo(final String contig) { if ( contig == null || ! contigIsInDictionary(contig) ) throw new UserException.MalformedGenomeLoc(String.format("Contig %s given as location, but this contig isn't present in the Fasta sequence dictionary", contig)); return contigInfo.getSequence(contig); @@ -220,7 +220,7 @@ public class GenomeLocParser { */ @Ensures("result >= 0") @ThrowEnsures({"UserException.MalformedGenomeLoc", "!contigIsInDictionary(contig) || contig == null"}) - public int getContigIndex(final String contig) { + public final int getContigIndex(final String contig) { return getContigInfo(contig).getSequenceIndex(); } @@ -231,6 +231,14 @@ public class GenomeLocParser { return contigInfo.getSequenceIndex(contig); } + /** + * Return the master sequence dictionary used within this GenomeLocParser + * @return + */ + public final SAMSequenceDictionary getContigs() { + return contigInfo.dict; + } + // -------------------------------------------------------------------------------------------------------------- // // Low-level creation functions diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 4835228b2..3c9eff51c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -24,12 +24,15 @@ package org.broadinstitute.sting.utils.codecs.bcf2; +import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -45,7 +48,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; -public class BCF2Codec implements FeatureCodec { +public class BCF2Codec implements FeatureCodec, ReferenceDependentFeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); private VCFHeader header = null; private final ArrayList contigNames = new ArrayList(); @@ -147,14 +150,18 @@ public class BCF2Codec implements FeatureCodec { } } - private final ArrayList parseDictionary(final VCFHeader header) { - final ArrayList dict = BCF2Utils.makeDictionary(header); + // -------------------------------------------------------------------------------- + // + // Reference dependence + // + // -------------------------------------------------------------------------------- - // if we got here we never found a dictionary, or there are no elements in the dictionary - if ( dict.size() == 0 ) - throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); - return dict; + @Override + public void setGenomeLocParser(final GenomeLocParser genomeLocParser) { + // initialize contigNames to standard ones in reference + for ( final SAMSequenceRecord contig : genomeLocParser.getContigs().getSequences() ) + contigNames.add(contig.getSequenceName()); } public boolean isSkippingGenotypes() { @@ -412,4 +419,14 @@ public class BCF2Codec implements FeatureCodec { throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames)); } } + + private final ArrayList parseDictionary(final VCFHeader header) { + final ArrayList dict = BCF2Utils.makeDictionary(header); + + // if we got here we never found a dictionary, or there are no elements in the dictionary + if ( dict.size() == 0 ) + throw new UserException.MalformedBCF2("Dictionary header element was absent or empty"); + + return dict; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 8fdb39d01..8a1b03149 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -257,15 +257,29 @@ public class VCFUtils { lines.add(line); } - final String assembly = getReferenceAssembly(referenceFile.getName()); - for ( SAMSequenceRecord contig : refDict.getSequences() ) - lines.add(getContigHeaderLine(contig, assembly)); + for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) ) + lines.add(contigLine); lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath())); return new VCFHeader(lines, oldHeader.getGenotypeSamples()); } - private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { + /** + * Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null + * @param refDict + * @param referenceFile for assembly name. May be null + * @return + */ + public final static List makeContigHeaderLines(final SAMSequenceDictionary refDict, + final File referenceFile) { + final List lines = new ArrayList(); + final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null; + for ( SAMSequenceRecord contig : refDict.getSequences() ) + lines.add(makeContigHeaderLine(contig, assembly)); + return lines; + } + + private final static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { final Map map = new LinkedHashMap(3); map.put("ID", contig.getSequenceName()); map.put("length", String.valueOf(contig.getSequenceLength())); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index d4d1cbab9..57c07fb90 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -60,11 +60,20 @@ class BCF2Writer extends IndexingVariantContextWriter { // // -------------------------------------------------------------------------------- + private final void createContigDictionary(final Collection contigLines) { + for ( final VCFContigHeaderLine contig : contigLines ) + contigDictionary.put(contig.getID(), contig.getContigIndex()); + } + @Override public void writeHeader(final VCFHeader header) { // create the config offsets map - for ( final VCFContigHeaderLine contig : header.getContigLines()) - contigDictionary.put(contig.getID(), contig.getContigIndex()); + if ( header.getContigLines().isEmpty() ) { + logger.warn("No contig dictionary found in header, falling back to reference sequence dictionary"); + createContigDictionary(VCFUtils.makeContigHeaderLines(getRefDict(), null)); + } else { + createContigDictionary(header.getContigLines()); + } // set up the map from dictionary string values -> offset final ArrayList dict = BCF2Utils.makeDictionary(header); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/IndexingVariantContextWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/IndexingVariantContextWriter.java index 93fcfdeda..4f577fbcf 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/IndexingVariantContextWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/IndexingVariantContextWriter.java @@ -106,6 +106,13 @@ abstract class IndexingVariantContextWriter implements VariantContextWriter { } } + /** + * @return the reference sequence dictionary used for the variant contexts being written + */ + public SAMSequenceDictionary getRefDict() { + return refDict; + } + /** * add a record to the file *