More BCF2 improvements

-- Refactored setting of contigs from VCFWriterStub to VCFUtils.  Necessary for proper BCF working
-- Added VCFContigHeaderLine that manages the order for sorting, so we now emit contigs in the proper order.
-- Cleaned up VCFHeader operations
-- BCF now uses the right header files correctly when encoding / decoding contigs
-- Clean up unused tools
-- Refactored header parsing routines to make them more accessible
-- More minor header changes from Intellij
This commit is contained in:
Mark DePristo 2012-05-09 17:55:42 -04:00
parent 7541b523e1
commit 3afbc50511
10 changed files with 214 additions and 110 deletions

View File

@ -25,13 +25,13 @@
package org.broadinstitute.sting.gatk.io.stubs;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -210,12 +210,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}
// also put in the reference contig header lines
String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName());
for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() )
vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly));
vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
}
outputTracker.getStorage(this).writeHeader(vcfHeader);
@ -252,27 +247,4 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
CommandLineExecutable executable = JVMUtils.getObjectOfType(argumentSources,CommandLineExecutable.class);
return new VCFHeaderLine(executable.getAnalysisName(), "\"" + engine.createApproximateCommandLineArgumentString(argumentSources.toArray()) + "\"");
}
private VCFHeaderLine getContigHeaderLine(SAMSequenceRecord contig, String assembly) {
String val;
if ( assembly != null )
val = String.format("<ID=%s,length=%d,assembly=%s>", contig.getSequenceName(), contig.getSequenceLength(), assembly);
else
val = String.format("<ID=%s,length=%d>", contig.getSequenceName(), contig.getSequenceLength());
return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val);
}
private String getReferenceAssembly(String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
if (refPath.contains("b37") || refPath.contains("v37"))
assembly = "b37";
else if (refPath.contains("b36"))
assembly = "b36";
else if (refPath.contains("hg18"))
assembly = "hg18";
else if (refPath.contains("hg19"))
assembly = "hg19";
return assembly;
}
}

View File

@ -29,8 +29,6 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected final static Logger log = Logger.getLogger(VCFCodec.class);
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
protected VCFHeaderVersion version;
// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
@ -122,16 +120,13 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
/**
* create a VCF header
* @param headerStrings a list of strings that represent all the ## entries
* @param line the single # line (column names)
* @return the count of header lines
* @param headerStrings a list of strings that represent all the ## and # entries
* @return a VCFHeader object
*/
protected Object createHeader(List<String> headerStrings, String line) {
headerStrings.add(line);
public static VCFHeader parseHeader(final List<String> headerStrings, final VCFHeaderVersion version) {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>();
int contigCounter = 0;
// iterate over all the passed in strings
for ( String str : headerStrings ) {
if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
@ -166,19 +161,16 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
} else {
if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) {
final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version);
metaData.add(info);
infoFields.put(info.getID(), info.getType());
} else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) {
final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version);
metaData.add(filter);
filterFields.add(filter.getID());
} else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) {
final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version);
metaData.add(format);
formatFields.put(format.getID(), format.getType());
} else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) {
final VCFSimpleHeaderLine contig = new VCFSimpleHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), null);
final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++);
metaData.add(contig);
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"));
@ -191,8 +183,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
}
}
header = new VCFHeader(metaData, sampleNames);
header.buildVCFReaderMaps(new ArrayList<String>(sampleNames));
return new VCFHeader(metaData, sampleNames);
}
protected VCFHeader createAndSetVCFHeader(final List<String> headerStrings, final String line, final VCFHeaderVersion version) {
headerStrings.add(line);
header = parseHeader(headerStrings, version);
return header;
}

View File

@ -38,6 +38,8 @@ import java.util.*;
* this class writes VCF files
*/
public class StandardVCFWriter extends IndexingVCFWriter {
private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString();
// the print stream we're writing to
final protected BufferedWriter mWriter;
@ -87,50 +89,62 @@ public class StandardVCFWriter extends IndexingVCFWriter {
@Override
public void writeHeader(VCFHeader header) {
mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
mHeader = header;
writeHeader(mHeader, mWriter, doNotWriteGenotypes, VERSION_LINE, getStreamName());
// determine if we use filters, so we should FORCE pass the records
// TODO -- this might not be necessary any longer as we have unfiltered, filtered, and PASS VCs
for ( final VCFHeaderLine line : header.getMetaData() ) {
if ( line instanceof VCFFilterHeaderLine)
filtersWereAppliedToContext = true;
}
}
public static void writeHeader(VCFHeader header,
final Writer writer,
final boolean doNotWriteGenotypes,
final String versionLine,
final String streamNameForError) {
header = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
try {
// the file format field needs to be written first
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString() + "\n");
writer.write(versionLine + "\n");
for ( VCFHeaderLine line : mHeader.getMetaData() ) {
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( VCFHeaderVersion.isFormatString(line.getKey()) )
continue;
// are the records filtered (so we know what to put in the FILTER column of passing records) ?
if ( line instanceof VCFFilterHeaderLine)
filtersWereAppliedToContext = true;
mWriter.write(VCFHeader.METADATA_INDICATOR);
mWriter.write(line.toString());
mWriter.write("\n");
writer.write(VCFHeader.METADATA_INDICATOR);
writer.write(line.toString());
writer.write("\n");
}
// write out the column line
mWriter.write(VCFHeader.HEADER_INDICATOR);
writer.write(VCFHeader.HEADER_INDICATOR);
boolean isFirst = true;
for ( VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields() ) {
for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
if ( isFirst )
isFirst = false; // don't write out a field separator
else
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(field.toString());
writer.write(VCFConstants.FIELD_SEPARATOR);
writer.write(field.toString());
}
if ( mHeader.hasGenotypingData() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write("FORMAT");
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(sample);
if ( header.hasGenotypingData() ) {
writer.write(VCFConstants.FIELD_SEPARATOR);
writer.write("FORMAT");
for ( String sample : header.getGenotypeSamples() ) {
writer.write(VCFConstants.FIELD_SEPARATOR);
writer.write(sample);
}
}
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
writer.write("\n");
writer.flush(); // necessary so that writing to an output stream will work
}
catch (IOException e) {
throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
throw new ReviewedStingException("IOException writing the VCF header to " + streamNameForError, e);
}
}

View File

@ -37,6 +37,7 @@ public class VCF3Codec extends AbstractVCFCodec {
List<String> headerStrings = new ArrayList<String>();
String line;
VCFHeaderVersion version = null;
try {
boolean foundHeaderVersion = false;
while ((line = reader.readLine()) != null) {
@ -57,7 +58,7 @@ public class VCF3Codec extends AbstractVCFCodec {
if (!foundHeaderVersion) {
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
}
return createHeader(headerStrings, line);
return createAndSetVCFHeader(headerStrings, line, version);
}
else {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");

View File

@ -47,8 +47,8 @@ import java.util.*;
*/
public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
private VCFHeaderVersion version = null;
/**
* @param reader the line reader to take header lines from
@ -80,7 +80,7 @@ public class VCFCodec extends AbstractVCFCodec {
if (!foundHeaderVersion) {
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
}
return createHeader(headerStrings, line);
return createAndSetVCFHeader(headerStrings, line, version);
}
else {
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");

View File

@ -0,0 +1,73 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import java.util.Map;
/**
* A special class representing a contig VCF header line. Nows the true contig order and sorts on that
*
* @author mdepristo
*/
public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
final Integer contigIndex;
/**
* create a VCF contig header line
*
* @param line the header line
* @param version the vcf header version
* @param key the key for this header line
*/
public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, int contigIndex) {
super(line, version, key, null);
this.contigIndex = contigIndex;
}
public VCFContigHeaderLine(final String key, final Map<String, String> mapping, int contigIndex) {
super(key, mapping, null);
this.contigIndex = contigIndex;
}
public Integer getContigIndex() {
return contigIndex;
}
/**
* IT IS CRITIAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER
*
* @param other
* @return
*/
@Override
public int compareTo(final Object other) {
if ( other instanceof VCFContigHeaderLine )
return contigIndex.compareTo(((VCFContigHeaderLine) other).contigIndex);
else {
return super.compareTo(other);
}
}
}

View File

@ -44,10 +44,11 @@ public class VCFHeader {
}
// the associated meta data
private final Set<VCFHeaderLine> mMetaData;
private final Set<VCFHeaderLine> mMetaData = new TreeSet<VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
// the list of auxillary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
@ -79,7 +80,7 @@ public class VCFHeader {
* @param metaData the meta data associated with this header
*/
public VCFHeader(Set<VCFHeaderLine> metaData) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
mMetaData.addAll(metaData);
loadVCFVersion();
loadMetaDataMaps();
}
@ -91,16 +92,10 @@ public class VCFHeader {
* @param genotypeSampleNames the sample names
*/
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<VCFHeaderLine>();
if ( metaData != null )
mMetaData.addAll(metaData);
this(metaData);
mGenotypeSampleNames.addAll(genotypeSampleNames);
loadVCFVersion();
loadMetaDataMaps();
samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
buildVCFReaderMaps(genotypeSampleNames);
}
/**
@ -109,10 +104,9 @@ public class VCFHeader {
* using this header (i.e., read by the VCFCodec) will have genotypes
* occurring in the same order
*
* @param genotypeSampleNamesInAppearenceOrder genotype sample names
* @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearence
*/
public void buildVCFReaderMaps(List<String> genotypeSampleNamesInAppearenceOrder) {
private void buildVCFReaderMaps(Collection<String> genotypeSampleNamesInAppearenceOrder) {
sampleNamesInOrder = new ArrayList<String>(genotypeSampleNamesInAppearenceOrder.size());
sampleNameToOffset = new HashMap<String, Integer>(genotypeSampleNamesInAppearenceOrder.size());
@ -131,7 +125,14 @@ public class VCFHeader {
* @param headerLine Line to add to the existing metadata component.
*/
public void addMetaDataLine(VCFHeaderLine headerLine) {
mMetaData.add(headerLine);
mMetaData.add(headerLine);
}
/**
* @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present
*/
public List<VCFContigHeaderLine> getContigLines() {
return Collections.unmodifiableList(contigMetaData);
}
/**
@ -157,12 +158,12 @@ public class VCFHeader {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
mInfoMetaData.put(infoLine.getID(), infoLine);
}
else if ( line instanceof VCFFormatHeaderLine ) {
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getID(), formatLine);
}
else {
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
mOtherMetaData.put(line.getKey(), line);
}
}

View File

@ -14,18 +14,6 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin
private String name;
private Map<String, String> genericFields = new LinkedHashMap<String, String>();
/**
* create a VCF filter header line
*
* @param key the key for this header line
* @param name the name for this header line
* @param genericFields other fields for this header line
*/
public VCFSimpleHeaderLine(String key, String name, Map<String, String> genericFields) {
super(key, "");
initialize(name, genericFields);
}
/**
* create a VCF filter header line
*
@ -48,9 +36,12 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin
* @param key the key for this header line
* @param expectedTagOrdering the tag ordering expected for this header line
*/
public VCFSimpleHeaderLine(String line, VCFHeaderVersion version, String key, List<String> expectedTagOrdering) {
public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List<String> expectedTagOrdering) {
this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering), expectedTagOrdering);
}
public VCFSimpleHeaderLine(final String key, final Map<String, String> mapping, final List<String> expectedTagOrdering) {
super(key, "");
Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering);
name = mapping.get("ID");
initialize(name, mapping);
}
@ -87,8 +78,4 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin
public String getID() {
return name;
}
public Map<String, String> getGenericFields() {
return genericFields;
}
}

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.RodBinding;
@ -32,6 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.util.*;
/**
@ -225,4 +228,62 @@ public class VCFUtils {
return rsID;
}
/**
* Add / replace the contig header lines in the VCFHeader with the information in the GATK engine
*
* @param header the header to update
* @param engine the GATK engine containing command line arguments and the master sequence dictionary
*/
public final static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeAnalysisEngine engine) {
return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary());
}
/**
* Add / replace the contig header lines in the VCFHeader with the in the reference file and master reference dictionary
*
* @param oldHeader the header to update
* @param referenceFile the file path to the reference sequence used to generate this vcf
* @param refDict the SAM formatted reference sequence dictionary
*/
public final static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) {
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>(oldHeader.getMetaData().size());
for ( final VCFHeaderLine line : oldHeader.getMetaData() ) {
if ( line instanceof VCFContigHeaderLine )
continue; // skip old contig lines
if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) )
continue; // skip the old reference key
lines.add(line);
}
final String assembly = getReferenceAssembly(referenceFile.getName());
for ( SAMSequenceRecord contig : refDict.getSequences() )
lines.add(getContigHeaderLine(contig, assembly));
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + referenceFile.getAbsolutePath()));
return new VCFHeader(lines, oldHeader.getGenotypeSamples());
}
private final static VCFHeaderLine getContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
final Map<String, String> map = new LinkedHashMap<String, String>(3);
map.put("ID", contig.getSequenceName());
map.put("length", String.valueOf(contig.getSequenceLength()));
if ( assembly != null ) map.put("assembly", assembly);
return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex());
}
private final static String getReferenceAssembly(final String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
if (refPath.contains("b37") || refPath.contains("v37"))
assembly = "b37";
else if (refPath.contains("b36"))
assembly = "b36";
else if (refPath.contains("hg18"))
assembly = "hg18";
else if (refPath.contains("hg19"))
assembly = "hg19";
return assembly;
}
}

View File

@ -32,9 +32,8 @@ import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;