/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.variant.vcf;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.variant.utils.GeneralUtils;
import java.util.*;
/**
* A class representing the VCF header.
*
* Known design problems: this class allows duplicate entries in the metadata
* and stores the same header lines in several places at once, which makes it
* hard to maintain.
*
* todo -- clean this class up
*
* @author aaron
*/
public class VCFHeader {

    /** The mandatory header fields, in the order they must appear. */
    public enum HEADER_FIELDS {
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
    }

    // every meta data line, in insertion order; the lookup structures below are all derived from this set
    private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
    private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
    private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
    private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
    private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
    private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();

    // the genotype sample names, in the order they were supplied
    private final List<String> mGenotypeSampleNames = new ArrayList<String>();

    // the character string that indicates meta data
    public static final String METADATA_INDICATOR = "##";

    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    public static final String SOURCE_KEY = "source";
    public static final String REFERENCE_KEY = "reference";
    public static final String CONTIG_KEY = "contig";
    public static final String INTERVALS_KEY = "intervals";
    public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
    public static final String INTERVAL_MERGING_KEY = "interval_merging";
    public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
    public static final String INTERVAL_PADDING_KEY = "interval_padding";

    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

    // cache for efficient conversion of VCF -> VariantContext;
    // both stay null unless the header was built through a constructor that takes sample names
    private ArrayList<String> sampleNamesInOrder = null;
    private HashMap<String, Integer> sampleNameToOffset = null;

    private boolean writeEngineHeaders = true;
    private boolean writeCommandLine = true;

    /**
     * Create an empty VCF header with no header lines and no samples
     */
    public VCFHeader() {
        this(Collections.<VCFHeaderLine>emptySet(), Collections.<String>emptySet());
    }

    /**
     * Create a VCF header from a set of meta data lines, without any samples.
     * Version lines contained in {@code metaData} are dropped (see {@link #loadVCFVersion()}).
     *
     * @param metaData the meta data associated with this header
     */
    public VCFHeader(Set<VCFHeaderLine> metaData) {
        mMetaData.addAll(metaData);
        loadVCFVersion();
        loadMetaDataMaps();
    }

    /**
     * Creates a shallow copy of the meta data in VCF header toCopy.
     * Only the meta data lines are copied; the sample names of {@code toCopy} are not carried over.
     *
     * @param toCopy the header whose meta data lines should be copied
     */
    public VCFHeader(final VCFHeader toCopy) {
        this(toCopy.mMetaData);
    }

    /**
     * Create a VCF header, given a set of meta data lines and a set of sample names.
     * The samples are taken in the iteration order of {@code genotypeSampleNames}.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
        this(metaData, new ArrayList<String>(genotypeSampleNames));
    }

    /**
     * Create a VCF header, given a set of meta data lines and an ordered list of sample names.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names, in column order
     * @throws TribbleException.InvalidHeader if the same sample name occurs more than once
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, List<String> genotypeSampleNames) {
        this(metaData);

        // a set collapses duplicates, so a size mismatch means at least one repeated name
        if ( genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size() ) {
            throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names");
        }

        mGenotypeSampleNames.addAll(genotypeSampleNames);
        samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
        buildVCFReaderMaps(genotypeSampleNames);
    }

    /**
     * Tell this VCF header to use pre-calculated sample name ordering and the
     * sample name -> offset map. This assumes that all VariantContext created
     * using this header (i.e., read by the VCFCodec) will have genotypes
     * occurring in the same order.
     *
     * The offsets recorded in the map are positions in the supplied collection;
     * the cached name list is sorted afterwards, so its order may differ from those offsets.
     *
     * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterate in order of appearance
     */
    private void buildVCFReaderMaps(Collection<String> genotypeSampleNamesInAppearenceOrder) {
        sampleNamesInOrder = new ArrayList<String>(genotypeSampleNamesInAppearenceOrder.size());
        sampleNameToOffset = new HashMap<String, Integer>(genotypeSampleNamesInAppearenceOrder.size());

        int i = 0;
        for ( final String name : genotypeSampleNamesInAppearenceOrder ) {
            sampleNamesInOrder.add(name);
            sampleNameToOffset.put(name, i++);
        }
        Collections.sort(sampleNamesInOrder);
    }

    /**
     * Adds a header line to the header metadata and refreshes the lookup structures.
     *
     * @param headerLine Line to add to the existing metadata component.
     */
    public void addMetaDataLine(VCFHeaderLine headerLine) {
        mMetaData.add(headerLine);
        loadMetaDataMaps();
    }

    /**
     * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
     */
    public List<VCFContigHeaderLine> getContigLines() {
        return Collections.unmodifiableList(contigMetaData);
    }

    /**
     * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
     */
    public List<VCFFilterHeaderLine> getFilterLines() {
        final List<VCFFilterHeaderLine> filters = new ArrayList<VCFFilterHeaderLine>();
        for ( final VCFHeaderLine line : mMetaData ) {
            if ( line instanceof VCFFilterHeaderLine ) {
                filters.add((VCFFilterHeaderLine)line);
            }
        }
        return filters;
    }

    /**
     * @return all of the header lines that are VCFIDHeaderLine instances, in their original order,
     *         or an empty list if none were present
     */
    public List<VCFIDHeaderLine> getIDHeaderLines() {
        final List<VCFIDHeaderLine> idLines = new ArrayList<VCFIDHeaderLine>();
        for ( final VCFHeaderLine line : mMetaData ) {
            if ( line instanceof VCFIDHeaderLine ) {
                idLines.add((VCFIDHeaderLine)line);
            }
        }
        return idLines;
    }

    /**
     * Removes every line whose key is recognised by VCFHeaderVersion.isFormatString from the meta data.
     * The getMetaDataIn*Order() accessors prepend their own VCF 4.1 version line, so any
     * version line supplied by the caller is discarded here. No exception is thrown when no
     * version line is present.
     */
    public void loadVCFVersion() {
        for ( final Iterator<VCFHeaderLine> it = mMetaData.iterator(); it.hasNext(); ) {
            if ( VCFHeaderVersion.isFormatString(it.next().getKey()) ) {
                // remove old header lines for now
                it.remove();
            }
        }
    }

    /**
     * Rebuild the format/info/filter/contig/other lookup structures from the meta data set
     * (these are used for quick lookup by key name).
     *
     * This method runs again after every addMetaDataLine() call, so the structures are emptied
     * first. Without that, the contig list would gain a second copy of every contig line on each
     * reload, and every already-bound INFO/FORMAT line would be reported as a duplicate.
     */
    private void loadMetaDataMaps() {
        mInfoMetaData.clear();
        mFormatMetaData.clear();
        mFilterMetaData.clear();
        mOtherMetaData.clear();
        contigMetaData.clear();

        for ( final VCFHeaderLine line : mMetaData ) {
            if ( line instanceof VCFInfoHeaderLine ) {
                final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
                addMetaDataMapBinding(mInfoMetaData, infoLine.getID(), infoLine);
            } else if ( line instanceof VCFFormatHeaderLine ) {
                final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
                addMetaDataMapBinding(mFormatMetaData, formatLine.getID(), formatLine);
            } else if ( line instanceof VCFFilterHeaderLine ) {
                // unlike INFO/FORMAT, a later FILTER line with the same ID replaces the earlier one
                final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line;
                mFilterMetaData.put(filterLine.getID(), filterLine);
            } else if ( line instanceof VCFContigHeaderLine ) {
                contigMetaData.add((VCFContigHeaderLine)line);
            } else {
                mOtherMetaData.put(line.getKey(), line);
            }
        }

        if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
                        + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally"
                        + " automatically adding a corresponding PL field to your VCF header");
            }
            // re-enters this method once; the second pass finds the PL line, so the recursion stops there
            addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
        }
    }

    /**
     * Add line to map under key, issuing warnings about duplicates. When the key is already
     * bound the existing entry is kept and the new line is ignored.
     *
     * @param map  the lookup map to add the line to
     * @param key  the ID under which the line is registered
     * @param line the header line to register
     * @param <T>  the kind of header line held by the map
     */
    private <T> void addMetaDataMapBinding(final Map<String, T> map, final String key, final T line) {
        if ( map.containsKey(key) ) {
            if ( GeneralUtils.DEBUG_MODE_ENABLED ) {
                System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" );
            }
        }
        else {
            map.put(key, line);
        }
    }

    /**
     * get the header fields in order they're presented in the input file (which is now required to be
     * the order presented in the spec).
     *
     * @return a set of the header fields, in order
     */
    public Set<HEADER_FIELDS> getHeaderFields() {
        return new LinkedHashSet<HEADER_FIELDS>(Arrays.asList(HEADER_FIELDS.values()));
    }

    /**
     * get the meta data associated with this header, in the order the lines were added,
     * preceded by a VCF 4.1 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInInputOrder() {
        return makeGetMetaDataSet(mMetaData);
    }

    /**
     * get the meta data associated with this header, in sorted order (the natural ordering of
     * the header lines), preceded by a VCF 4.1 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
        return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
    }

    /**
     * Builds the set handed out by the getMetaDataIn*Order() accessors: a VCF 4.1 version line
     * first, followed by the given lines in their iteration order.
     *
     * @param headerLinesInSomeOrder the header lines, already in the desired order
     * @return an unmodifiable copy with the version line prepended
     */
    private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
        final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
        lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
        lines.addAll(headerLinesInSomeOrder);
        return Collections.unmodifiableSet(lines);
    }

    /**
     * Get the first VCFHeaderLine whose key equals key. Returns null if no such line exists
     *
     * @param key the header line key to look for
     * @return the first matching line in insertion order, or null
     */
    public VCFHeaderLine getMetaDataLine(final String key) {
        for ( final VCFHeaderLine line : mMetaData ) {
            if ( line.getKey().equals(key) ) {
                return line;
            }
        }
        return null;
    }

    /**
     * get the genotyping sample names
     *
     * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false.
     *         This is the header's own list, not a copy.
     */
    public List<String> getGenotypeSamples() {
        return mGenotypeSampleNames;
    }

    /** @return the number of genotype samples */
    public int getNGenotypeSamples() {
        return mGenotypeSampleNames.size();
    }

    /**
     * do we have genotyping data?
     *
     * @return true if we have genotyping columns, false otherwise
     */
    public boolean hasGenotypingData() {
        return getNGenotypeSamples() > 0;
    }

    /**
     * were the input samples sorted originally?
     *
     * @return true if the input samples were sorted originally, false otherwise
     */
    public boolean samplesWereAlreadySorted() {
        return samplesWereAlreadySorted;
    }

    /**
     * @return the column count: the mandatory fields, plus, when there is genotyping data,
     *         one column per sample and one extra column (presumably FORMAT)
     */
    public int getColumnCount() {
        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
     * Returns the INFO HeaderLines in their original ordering
     */
    public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
        return mInfoMetaData.values();
    }

    /**
     * Returns the FORMAT HeaderLines in their original ordering
     */
    public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
        return mFormatMetaData.values();
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFInfoHeaderLine getInfoHeaderLine(String id) {
        return mInfoMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFormatHeaderLine getFormatHeaderLine(String id) {
        return mFormatMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
        return mFilterMetaData.get(id);
    }

    /** @return true if an INFO line with the given id is present */
    public boolean hasInfoLine(final String id) {
        return getInfoHeaderLine(id) != null;
    }

    /** @return true if a FORMAT line with the given id is present */
    public boolean hasFormatLine(final String id) {
        return getFormatHeaderLine(id) != null;
    }

    /** @return true if a FILTER line with the given id is present */
    public boolean hasFilterLine(final String id) {
        return getFilterHeaderLine(id) != null;
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFHeaderLine getOtherHeaderLine(String key) {
        return mOtherMetaData.get(key);
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @return true if additional engine headers will be written to the VCF
     */
    public boolean isWriteEngineHeaders() {
        return writeEngineHeaders;
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @param writeEngineHeaders true if additional engine headers will be written to the VCF
     */
    public void setWriteEngineHeaders(boolean writeEngineHeaders) {
        this.writeEngineHeaders = writeEngineHeaders;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @return true if the command line will be written to the VCF
     */
    public boolean isWriteCommandLine() {
        return writeCommandLine;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @param writeCommandLine true if the command line will be written to the VCF
     */
    public void setWriteCommandLine(boolean writeCommandLine) {
        this.writeCommandLine = writeCommandLine;
    }

    /**
     * @return the cached, sorted sample names, or null if this header was built without a sample list
     */
    public ArrayList<String> getSampleNamesInOrder() {
        return sampleNamesInOrder;
    }

    /**
     * @return the cached sample name -> offset map, or null if this header was built without a sample list
     */
    public HashMap<String, Integer> getSampleNameToOffset() {
        return sampleNameToOffset;
    }

    @Override
    public String toString() {
        final StringBuilder b = new StringBuilder();
        b.append("[VCFHeader:");
        for ( final VCFHeaderLine line : mMetaData ) {
            b.append("\n\t").append(line);
        }
        return b.append("\n]").toString();
    }
}