From 3afbc50511d5b294147584e102fe8ec10d55631a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 9 May 2012 17:55:42 -0400 Subject: [PATCH] More BCF2 improvements -- Refactored setting of contigs from VCFWriterStub to VCFUtils. Necessary for proper BCF working -- Added VCFContigHeaderLine that manages the order for sorting, so we now emit contigs in the proper order. -- Cleaned up VCFHeader operations -- BCF now uses the right header files correctly when encoding / decoding contigs -- Clean up unused tools -- Refactored header parsing routines to make them more accessible -- More minor header changes from Intellij --- .../sting/gatk/io/stubs/VCFWriterStub.java | 32 +------- .../utils/codecs/vcf/AbstractVCFCodec.java | 28 +++---- .../utils/codecs/vcf/StandardVCFWriter.java | 60 +++++++++------ .../sting/utils/codecs/vcf/VCF3Codec.java | 3 +- .../sting/utils/codecs/vcf/VCFCodec.java | 4 +- .../utils/codecs/vcf/VCFContigHeaderLine.java | 73 +++++++++++++++++++ .../sting/utils/codecs/vcf/VCFHeader.java | 37 +++++----- .../utils/codecs/vcf/VCFSimpleHeaderLine.java | 23 ++---- .../sting/utils/codecs/vcf/VCFUtils.java | 61 ++++++++++++++++ .../variantcontext/VariantContextUtils.java | 3 +- 10 files changed, 214 insertions(+), 110 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 94051cc7f..4672bca4e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,13 +25,13 @@ package org.broadinstitute.sting.gatk.io.stubs; import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -210,12 +210,7 @@ public class VCFWriterStub implements Stub, VCFWriter { vcfHeader.addMetaDataLine(commandLineArgHeaderLine); } - // also put in the reference contig header lines - String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName()); - for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() ) - vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly)); - - vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath())); + vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine); } outputTracker.getStorage(this).writeHeader(vcfHeader); @@ -252,27 +247,4 @@ public class VCFWriterStub implements Stub, VCFWriter { CommandLineExecutable executable = JVMUtils.getObjectOfType(argumentSources,CommandLineExecutable.class); return new VCFHeaderLine(executable.getAnalysisName(), "\"" + engine.createApproximateCommandLineArgumentString(argumentSources.toArray()) + "\""); } - - private VCFHeaderLine getContigHeaderLine(SAMSequenceRecord contig, String assembly) { - String val; - if ( assembly != null ) - val = String.format("", contig.getSequenceName(), contig.getSequenceLength(), assembly); - else - val = String.format("", contig.getSequenceName(), contig.getSequenceLength()); - return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val); - } - - private String getReferenceAssembly(String refPath) { - // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot - String assembly = null; - if (refPath.contains("b37") || refPath.contains("v37")) - assembly = "b37"; - else if (refPath.contains("b36")) - assembly = "b36"; - else if (refPath.contains("hg18")) - assembly = "hg18"; - else if (refPath.contains("hg19")) - assembly = "hg19"; - return assembly; - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 949f488c8..a5b67d0c9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -29,8 +29,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec protected final static Logger log = Logger.getLogger(VCFCodec.class); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column - protected VCFHeaderVersion version; - // we have to store the list of strings that make up the header until they're needed protected VCFHeader header = null; @@ -122,16 +120,13 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec /** * create a VCF header - * @param headerStrings a list of strings that represent all the ## entries - * @param line the single # line (column names) - * @return the count of header lines + * @param headerStrings a list of strings that represent all the ## and # entries + * @return a VCFHeader object */ - protected Object createHeader(List headerStrings, String line) { - - headerStrings.add(line); - + public static VCFHeader parseHeader(final List headerStrings, final VCFHeaderVersion version) { Set metaData = new TreeSet(); Set sampleNames = new LinkedHashSet(); + int contigCounter = 0; // iterate over all the passed in strings for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { @@ -166,19 +161,16 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } else { if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version); + final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); metaData.add(info); - infoFields.put(info.getID(), info.getType()); } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); metaData.add(filter); - filterFields.add(filter.getID()); } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); metaData.add(format); - formatFields.put(format.getID(), format.getType()); } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFSimpleHeaderLine contig = new VCFSimpleHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), null); + final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); metaData.add(contig); } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); @@ -191,8 +183,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } } - header = new VCFHeader(metaData, sampleNames); - header.buildVCFReaderMaps(new ArrayList(sampleNames)); + return new VCFHeader(metaData, sampleNames); + } + + protected VCFHeader createAndSetVCFHeader(final List headerStrings, final String line, final VCFHeaderVersion version) { + headerStrings.add(line); + header = parseHeader(headerStrings, version); return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java index 994a2575d..98c8bc081 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java @@ -38,6 +38,8 @@ import java.util.*; * this class writes VCF files */ public class StandardVCFWriter extends IndexingVCFWriter { + private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString(); + // the print stream we're writing to final protected BufferedWriter mWriter; @@ -87,50 +89,62 @@ public class StandardVCFWriter extends IndexingVCFWriter { @Override public void writeHeader(VCFHeader header) { - mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header; + mHeader = header; + writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName()); + + // determine if we use filters, so we should FORCE pass the records + // TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs + for ( final VCFHeaderLine line : header.getMetaData() ) { + if ( line instanceof VCFFilterHeaderLine) + filtersWereAppliedToContext = true; + } + } + + public static void writeHeader(VCFHeader header, + final Writer writer, + final boolean doNotWriteGenotypes, + final String versionLine, + final String streamNameForError) { + header = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header; try { // the file format field needs to be written first - mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString() + "\n"); + writer.write(versionLine + "\n"); - for ( VCFHeaderLine line : mHeader.getMetaData() ) { + for ( VCFHeaderLine line : header.getMetaData() ) { if ( VCFHeaderVersion.isFormatString(line.getKey()) ) continue; - // are the records filtered (so we know what to put in the FILTER column of passing records) ? - if ( line instanceof VCFFilterHeaderLine) - filtersWereAppliedToContext = true; - - mWriter.write(VCFHeader.METADATA_INDICATOR); - mWriter.write(line.toString()); - mWriter.write("\n"); + writer.write(VCFHeader.METADATA_INDICATOR); + writer.write(line.toString()); + writer.write("\n"); } // write out the column line - mWriter.write(VCFHeader.HEADER_INDICATOR); + writer.write(VCFHeader.HEADER_INDICATOR); boolean isFirst = true; - for ( VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields() ) { + for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { if ( isFirst ) isFirst = false; // don't write out a field separator else - mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write(field.toString()); + writer.write(VCFConstants.FIELD_SEPARATOR); + writer.write(field.toString()); } - if ( mHeader.hasGenotypingData() ) { - mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write("FORMAT"); - for ( String sample : mHeader.getGenotypeSamples() ) { - mWriter.write(VCFConstants.FIELD_SEPARATOR); - mWriter.write(sample); + if ( header.hasGenotypingData() ) { + writer.write(VCFConstants.FIELD_SEPARATOR); + writer.write("FORMAT"); + for ( String sample : header.getGenotypeSamples() ) { + writer.write(VCFConstants.FIELD_SEPARATOR); + writer.write(sample); } } - mWriter.write("\n"); - mWriter.flush(); // necessary so that writing to an output stream will work + writer.write("\n"); + writer.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e); + throw new ReviewedStingException("IOException writing the VCF header to " + streamNameForError, e); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java index b3329c708..18bb41293 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCF3Codec.java @@ -37,6 +37,7 @@ public class VCF3Codec extends AbstractVCFCodec { List headerStrings = new ArrayList(); String line; + VCFHeaderVersion version = null; try { boolean foundHeaderVersion = false; while ((line = reader.readLine()) != null) { @@ -57,7 +58,7 @@ public class VCF3Codec extends AbstractVCFCodec { if (!foundHeaderVersion) { throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); } - return createHeader(headerStrings, line); + return createAndSetVCFHeader(headerStrings, line, version); } else { throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 01cc367c4..bfb4d4daf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -47,8 +47,8 @@ import java.util.*; */ public class VCFCodec extends AbstractVCFCodec { // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. - public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; + private VCFHeaderVersion version = null; /** * @param reader the line reader to take header lines from @@ -80,7 +80,7 @@ public class VCFCodec extends AbstractVCFCodec { if (!foundHeaderVersion) { throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); } - return createHeader(headerStrings, line); + return createAndSetVCFHeader(headerStrings, line, version); } else { throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java new file mode 100644 index 000000000..d5d76cab7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.vcf; + +import java.util.Map; + +/** + * A special class representing a contig VCF header line. Nows the true contig order and sorts on that + * + * @author mdepristo + */ +public class VCFContigHeaderLine extends VCFSimpleHeaderLine { + final Integer contigIndex; + + + /** + * create a VCF contig header line + * + * @param line the header line + * @param version the vcf header version + * @param key the key for this header line + */ + public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, int contigIndex) { + super(line, version, key, null); + this.contigIndex = contigIndex; + } + + public VCFContigHeaderLine(final String key, final Map mapping, int contigIndex) { + super(key, mapping, null); + this.contigIndex = contigIndex; + } + + public Integer getContigIndex() { + return contigIndex; + } + + /** + * IT IS CRITIAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER + * + * @param other + * @return + */ + @Override + public int compareTo(final Object other) { + if ( other instanceof VCFContigHeaderLine ) + return contigIndex.compareTo(((VCFContigHeaderLine) other).contigIndex); + else { + return super.compareTo(other); + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 20f71b956..63b2bc0f1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -44,10 +44,11 @@ public class VCFHeader { } // the associated meta data - private final Set mMetaData; + private final Set mMetaData = new TreeSet(); private final Map mInfoMetaData = new HashMap(); private final Map mFormatMetaData = new HashMap(); private final Map mOtherMetaData = new HashMap(); + private final List contigMetaData = new ArrayList(); // the list of auxillary tags private final Set mGenotypeSampleNames = new LinkedHashSet(); @@ -79,7 +80,7 @@ public class VCFHeader { * @param metaData the meta data associated with this header */ public VCFHeader(Set metaData) { - mMetaData = new TreeSet(metaData); + mMetaData.addAll(metaData); loadVCFVersion(); loadMetaDataMaps(); } @@ -91,16 +92,10 @@ public class VCFHeader { * @param genotypeSampleNames the sample names */ public VCFHeader(Set metaData, Set genotypeSampleNames) { - mMetaData = new TreeSet(); - if ( metaData != null ) - mMetaData.addAll(metaData); - + this(metaData); mGenotypeSampleNames.addAll(genotypeSampleNames); - - loadVCFVersion(); - loadMetaDataMaps(); - samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames); + buildVCFReaderMaps(genotypeSampleNames); } /** @@ -109,10 +104,9 @@ public class VCFHeader { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names + * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearence */ - - public void buildVCFReaderMaps(List genotypeSampleNamesInAppearenceOrder) { + private void buildVCFReaderMaps(Collection genotypeSampleNamesInAppearenceOrder) { sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); @@ -131,7 +125,14 @@ public class VCFHeader { * @param headerLine Line to add to the existing metadata component. */ public void addMetaDataLine(VCFHeaderLine headerLine) { - mMetaData.add(headerLine); + mMetaData.add(headerLine); + } + + /** + * @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present + */ + public List getContigLines() { + return Collections.unmodifiableList(contigMetaData); } /** @@ -157,12 +158,12 @@ public class VCFHeader { if ( line instanceof VCFInfoHeaderLine ) { VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; mInfoMetaData.put(infoLine.getID(), infoLine); - } - else if ( line instanceof VCFFormatHeaderLine ) { + } else if ( line instanceof VCFFormatHeaderLine ) { VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; mFormatMetaData.put(formatLine.getID(), formatLine); - } - else { + } else if ( line instanceof VCFContigHeaderLine ) { + contigMetaData.add((VCFContigHeaderLine)line); + } else { mOtherMetaData.put(line.getKey(), line); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java index e0a057eec..c9699e7b5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFSimpleHeaderLine.java @@ -14,18 +14,6 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin private String name; private Map genericFields = new LinkedHashMap(); - /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param genericFields other fields for this header line - */ - public VCFSimpleHeaderLine(String key, String name, Map genericFields) { - super(key, ""); - initialize(name, genericFields); - } - /** * create a VCF filter header line * @@ -48,9 +36,12 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin * @param key the key for this header line * @param expectedTagOrdering the tag ordering expected for this header line */ - public VCFSimpleHeaderLine(String line, VCFHeaderVersion version, String key, List expectedTagOrdering) { + public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { + this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering), expectedTagOrdering); + } + + public VCFSimpleHeaderLine(final String key, final Map mapping, final List expectedTagOrdering) { super(key, ""); - Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering); name = mapping.get("ID"); initialize(name, mapping); } @@ -87,8 +78,4 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin public String getID() { return name; } - - public Map getGenericFields() { - return genericFields; - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 238a06243..8fdb39d01 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; @@ -32,6 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; import java.util.*; /** @@ -225,4 +228,62 @@ public class VCFUtils { return rsID; } + + /** + * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine + * + * @param header the header to update + * @param engine the GATK engine containing command line arguments and the master sequence dictionary + */ + public final static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeAnalysisEngine engine) { + return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); + } + + /** + * Add / replace the contig header lines in the VCFHeader with the in the reference file and master reference dictionary + * + * @param oldHeader the header to update + * @param referenceFile the file path to the reference sequence used to generate this vcf + * @param refDict the SAM formatted reference sequence dictionary + */ + public final static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) { + final Set lines = new LinkedHashSet(oldHeader.getMetaData().size()); + + for ( final VCFHeaderLine line : oldHeader.getMetaData() ) { + if ( line instanceof VCFContigHeaderLine ) + continue; // skip old contig lines + if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) ) + continue; // skip the old reference key + lines.add(line); + } + + final String assembly = getReferenceAssembly(referenceFile.getName()); + for ( SAMSequenceRecord contig : refDict.getSequences() ) + lines.add(getContigHeaderLine(contig, assembly)); + + lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath())); + return new VCFHeader(lines, oldHeader.getGenotypeSamples()); + } + + private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { + final Map map = new LinkedHashMap(3); + map.put("ID", contig.getSequenceName()); + map.put("length", String.valueOf(contig.getSequenceLength())); + if ( assembly != null ) map.put("assembly", assembly); + return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex()); + } + + private final static String getReferenceAssembly(final String refPath) { + // This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot + String assembly = null; + if (refPath.contains("b37") || refPath.contains("v37")) + assembly = "b37"; + else if (refPath.contains("b36")) + assembly = "b36"; + else if (refPath.contains("hg18")) + assembly = "hg18"; + else if (refPath.contains("hg19")) + assembly = "hg19"; + return assembly; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index d5a1a75fd..82a4c055f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -32,9 +32,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException;