moving VCF 3.3 back into the GATK so Guillermo can make changes for VCF 4 output

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3639 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-06-25 18:20:06 +00:00
parent b3edb7dc08
commit d3848745ab
12 changed files with 2281 additions and 0 deletions

View File

@ -0,0 +1,125 @@
package org.broad.tribble.vcf;
import org.broad.tribble.Feature;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.exception.CodecLineParsingException;
import org.broad.tribble.util.LineReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
*
* @author aaron
*
* Class VCFCodec
*
* The codec for VCF, which relies on VCFReaderUtils to do most of the processing
*/
public class VCFCodec implements FeatureCodec {
// we have to store the list of strings that make up the header until they're needed
private List<String> headerStrings = new ArrayList<String>();
private VCFHeader header = null;
private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;
// some classes need to transform the line before
private LineTransform transformer = null;
/**
* Fast path to get the location of the Feature for indexing
* @param line the input line to decode
* @return
*/
public Feature decodeLoc(String line) {
return reallyDecode(line, true);
}
/**
* Decode a line as a Feature.
*
* @param line
*
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
* a comment)
*/
public Feature decode(String line) {
return reallyDecode(line, false);
}
private Feature reallyDecode(String line, boolean justLocationPlease ) {
// transform the line, if we have a transform to do
if (transformer != null) line = transformer.lineTransform(line);
if (line.startsWith("#"))
return null;
// make a VCFRecord of the line and return it
VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
if ( ! justLocationPlease ) rec.setHeader(header);
return rec;
}
/**
* Return the # of header lines for this file. We use this to parse out the header
*
* @return 0
*/
public int readHeader(LineReader reader) {
String line = "";
try {
while ((line = reader.readLine()) != null) {
if (line.startsWith("##")) {
headerStrings.add(line);
}
else if (line.startsWith("#")) {
headerStrings.add(line);
header = VCFReaderUtils.createHeader(headerStrings,version);
return headerStrings.size();
}
else {
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
}
}
} catch (IOException e) {
throw new RuntimeException("IO Exception ", e);
}
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
}
/**
* @return VCFRecord.class
*/
public Class getFeatureType() {
return VCFRecord.class;
}
public VCFHeader getHeader(Class clazz) throws ClassCastException {
if (!clazz.equals(VCFHeader.class))
throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
return header;
}
public static interface LineTransform {
public String lineTransform(String line);
}
public LineTransform getTransformer() {
return transformer;
}
public void setTransformer(LineTransform transformer) {
this.transformer = transformer;
}
public VCFHeaderVersion getVersion() {
return version;
}
public void setVersion(VCFHeaderVersion version) {
this.version = version;
}
}

View File

@ -0,0 +1,72 @@
package org.broad.tribble.vcf;
import org.broad.tribble.util.ParsingUtils;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* @author ebanks
* A class representing a key=value entry for FILTER fields in the VCF header
*/
public class VCFFilterHeaderLine extends VCFHeaderLine {
private String mName;
private String mDescription;
/**
* create a VCF filter header line
*
* @param name the name for this header line
* @param description the description for this header line
*/
public VCFFilterHeaderLine(String name, String description) {
super("FILTER", "");
mName = name;
mDescription = description;
}
/**
* create a VCF info header line
*
* @param line the header line
* @param version the vcf header version
*/
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
super("FILTER", "", version);
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
mName = mapping.get("ID");
mDescription = mapping.get("Description");
}
protected String makeStringRep() {
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
return String.format("FILTER=%s,\"%s\"", mName, mDescription);
else if (mVersion == VCFHeaderVersion.VCF4_0) {
Map<String,Object> map = new LinkedHashMap<String,Object>();
map.put("ID",mName);
map.put("Description",mDescription);
return "FILTER=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
}
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
}
public boolean equals(Object o) {
if ( !(o instanceof VCFFilterHeaderLine) )
return false;
VCFFilterHeaderLine other = (VCFFilterHeaderLine)o;
return mName.equals(other.mName) &&
mDescription.equals(other.mDescription);
}
public String getmName() {
return mName;
}
public String getmDescription() {
return mDescription;
}
}

View File

@ -0,0 +1,118 @@
package org.broad.tribble.vcf;
import org.broad.tribble.util.ParsingUtils;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* @author ebanks
* <p/>
* Class VCFFormatHeaderLine
* <p/>
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
*/
public class VCFFormatHeaderLine extends VCFHeaderLine {
// the format field types
public enum FORMAT_TYPE {
Integer, Float, String;
public Object convert(String value) {
switch (this) {
case Integer:
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
case Float:
return java.lang.Float.valueOf(value);
case String:
return value;
default:
throw new IllegalStateException("field." + this + " doesn't have a set conversion approach");
}
}
}
private String mName;
private int mCount;
private String mDescription;
private FORMAT_TYPE mType;
/**
* create a VCF format header line
*
* @param name the name for this header line
* @param count the count for this header line
* @param type the type for this header line
* @param description the description for this header line
*/
public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) {
super("FORMAT", "");
mName = name;
mCount = count;
mType = type;
mDescription = description;
}
/**
* create a VCF format header line
*
* @param line the header line
* @param version the VCF header version
*
*/
protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
super("FORMAT", "", version);
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
mName = mapping.get("ID");
mCount = Integer.valueOf(mapping.get("Number"));
mType = FORMAT_TYPE.valueOf(mapping.get("Type"));
mDescription = mapping.get("Description");
}
protected String makeStringRep() {
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
else if (mVersion == VCFHeaderVersion.VCF4_0) {
Map<String,Object> map = new LinkedHashMap<String,Object>();
map.put("ID",mName);
map.put("Number",mCount);
map.put("Type",mType);
map.put("Description",mDescription);
return "FORMAT=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
}
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
}
public String getName() { return mName; }
public int getCount() { return mCount; }
public String getDescription() { return mDescription; }
public FORMAT_TYPE getType() { return mType; }
public boolean equals(Object o) {
if ( !(o instanceof VCFFormatHeaderLine) )
return false;
VCFFormatHeaderLine other = (VCFFormatHeaderLine)o;
return mName.equals(other.mName) &&
mCount == other.mCount &&
mDescription.equals(other.mDescription) &&
mType == other.mType;
}
public String getmName() {
return mName;
}
public int getmCount() {
return mCount;
}
public String getmDescription() {
return mDescription;
}
public FORMAT_TYPE getmType() {
return mType;
}
}

View File

@ -0,0 +1,128 @@
package org.broad.tribble.vcf;
/**
* @author aaron
* <p/>
* Class VCFGenotypeEncoding
* <p/>
* basic encoding class for genotype fields in VCF
*/
public class VCFGenotypeEncoding {
public enum TYPE {
SINGLE_BASE,
INSERTION,
DELETION,
UNCALLED,
MIXED // this type is only valid in aggregate, not for a single VCFGenotypeEncoding
}
// our length (0 for SINGLE_BASE), our bases, and our type
private final int mLength;
private final String mBases;
private final TYPE mType;
// public constructor, that parses out the base string
public VCFGenotypeEncoding(String baseString) {
if ((baseString.length() == 1)) {
// are we an empty (no-call) genotype?
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
mBases = VCFGenotypeRecord.EMPTY_ALLELE;
mLength = 0;
mType = TYPE.UNCALLED;
} else if (!validBases(baseString)) {
throw new IllegalArgumentException("Alleles of length 1 must be one of A,C,G,T, " + baseString + " was passed in");
} else { // we're a valid base
mBases = baseString.toUpperCase();
mLength = 0;
mType = TYPE.SINGLE_BASE;
}
} else { // deletion or insertion
if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) {
throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
}
if (baseString.toUpperCase().charAt(0) == 'D') {
mLength = Integer.valueOf(baseString.substring(1, baseString.length()));
mBases = "";
mType = TYPE.DELETION;
} else { // we're an I
mBases = baseString.substring(1, baseString.length()).toUpperCase();
if (!validBases(mBases))
throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
mLength = mBases.length();
mType = TYPE.INSERTION;
}
}
}
public int getLength() {
return mLength;
}
public String getBases() {
return mBases;
}
public TYPE getType() {
return mType;
}
public boolean equals(Object obj) {
if ( obj == null )
return false;
if ( obj instanceof VCFGenotypeEncoding ) {
VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj;
return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength);
}
if ( mType == TYPE.UNCALLED && obj.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
return true;
return false;
}
public int hashCode() {
// our underlying data is immutable, so this is safe (we won't strand a value in a hashtable somewhere
// when the data changes underneath, altering this value).
String str = this.mBases + String.valueOf(this.mLength) + this.mType.toString();
return str.hashCode();
}
/**
* dump the string representation of this genotype encoding
*
* @return string representation
*/
public String toString() {
StringBuilder builder = new StringBuilder();
switch (mType) {
case SINGLE_BASE:
case UNCALLED:
builder.append(mBases);
break;
case INSERTION:
builder.append("I");
builder.append(mBases);
break;
case DELETION:
builder.append("D");
builder.append(mLength);
break;
}
return builder.toString();
}
/**
* ensure that string contains valid bases
*
* @param bases the bases to check
*
* @return true if they're all either A,C,G,T; false otherwise
*/
private static boolean validBases(String bases) {
for (char c : bases.toUpperCase().toCharArray()) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T')
return false;
}
return true;
}
}

View File

@ -0,0 +1,353 @@
package org.broad.tribble.vcf;
import java.util.*;
/**
*
* @author aaron
*
* Class VCFGenotypeRecord
*
* A descriptions should go here. Blame aaron if it's missing.
*/
public class VCFGenotypeRecord {
// key names
public static final String GENOTYPE_KEY = "GT";
public static final String GENOTYPE_QUALITY_KEY = "GQ";
public static final String DEPTH_KEY = "DP";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
public static final String GENOTYPE_FILTER_KEY = "FT";
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL";
public static final String OLD_DEPTH_KEY = "RD";
// the values for empty fields
public static final String EMPTY_GENOTYPE = "./.";
public static final String EMPTY_ALLELE = ".";
public static final int MISSING_GENOTYPE_QUALITY = -1;
public static final int MISSING_DEPTH = -1;
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
public static final String PASSES_FILTERS = "0";
public static final String UNFILTERED = ".";
public static final double MAX_QUAL_VALUE = 99.0;
// what kind of phasing this genotype has
public enum PHASE {
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
}
// our record
private VCFRecord mRecord;
// our phasing
private PHASE mPhaseType;
// our bases(s)
private final List<VCFGenotypeEncoding> mGenotypeAlleles = new ArrayList<VCFGenotypeEncoding>();
// our mapping of the format mFields to values
private final Map<String, String> mFields = new HashMap<String, String>();
// our sample name
private String mSampleName;
/**
* Create a VCF genotype record
*
* @param sampleName sample name
* @param genotypes list of genotypes
* @param phasing phasing
*/
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing) {
mSampleName = sampleName;
if (genotypes != null)
this.mGenotypeAlleles.addAll(genotypes);
mPhaseType = phasing;
}
public void setVCFRecord(VCFRecord record) {
mRecord = record;
}
public void setSampleName(String name) {
mSampleName = name;
}
/**
* Adds a field to the genotype record.
* Throws an exception if the key is GT, as that's computed internally.
*
* @param key the field name (use static variables above for common fields)
* @param value the field value
*/
public void setField(String key, String value) {
// make sure the GT field isn't being set
if ( key.equals(GENOTYPE_KEY) )
throw new IllegalArgumentException("Setting the GT field is not allowed as that's done internally");
mFields.put(key, value);
}
/**
* determine the phase of the genotype
*
* @param phase the string that contains the phase character
*
* @return the phase
*/
static PHASE determinePhase(String phase) {
// find the phasing information
if (phase.equals("/"))
return PHASE.UNPHASED;
else if (phase.equals("|"))
return PHASE.PHASED;
else if (phase.equals("\\"))
return PHASE.PHASED_SWITCH_PROB;
else
throw new IllegalArgumentException("Unknown genotype phasing parameter");
}
public PHASE getPhaseType() {
return mPhaseType;
}
public String getSampleName() {
return mSampleName;
}
public List<VCFGenotypeEncoding> getAlleles() {
return mGenotypeAlleles;
}
public Map<String, String> getFields() {
return mFields;
}
/**
* @return the phred-scaled quality score
*/
public double getQual() {
return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) : MISSING_GENOTYPE_QUALITY);
}
public boolean isMissingQual() {
return (int)getQual() == MISSING_GENOTYPE_QUALITY;
}
public double getNegLog10PError() {
double qual = getQual();
return (qual == MISSING_GENOTYPE_QUALITY ? MISSING_GENOTYPE_QUALITY : qual / 10.0);
}
public int getReadCount() {
return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH);
}
public String getLocation() {
return mRecord != null ? mRecord.getChr() + ":" + mRecord.getPosition() : null;
}
public String getReference() {
return mRecord != null ? mRecord.getReference() : "N";
}
public String getBases() {
String genotype = "";
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles )
genotype += encoding.getBases();
return genotype;
}
public boolean isVariant(char ref) {
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED )
continue;
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE ||
encoding.getBases().charAt(0) != ref )
return true;
}
return false;
}
public boolean isPointGenotype() {
return (mRecord != null ? !mRecord.isIndel() : true);
}
public boolean isHom() {
if ( mGenotypeAlleles.size() == 0 )
return true;
String bases = mGenotypeAlleles.get(0).getBases();
for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) {
if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) )
return false;
}
return true;
}
public boolean isHet() {
return !isHom();
}
public boolean isNoCall() {
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
return false;
}
return true;
}
public boolean isFiltered() {
return ( mFields.get(GENOTYPE_FILTER_KEY) != null &&
!mFields.get(GENOTYPE_FILTER_KEY).equals(UNFILTERED) &&
!mFields.get(GENOTYPE_FILTER_KEY).equals(PASSES_FILTERS));
}
public int getPloidy() {
return 2;
}
public VCFRecord getRecord() {
return mRecord;
}
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
String str = "";
boolean first = true;
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
str += VCFGenotypeRecord.EMPTY_ALLELE;
else
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
if (first) {
switch (mPhaseType) {
case UNPHASED:
str += "/";
break;
case PHASED:
str += "|";
break;
case PHASED_SWITCH_PROB:
str += "\\";
break;
case UNKNOWN:
throw new UnsupportedOperationException("Unknown phase type");
}
first = false;
}
}
return str;
}
@Override
public String toString() {
return String.format("[VCFGenotype %s %s %s %s]", getLocation(), mSampleName, this.mGenotypeAlleles, mFields);
}
public boolean isEmptyGenotype() {
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
return false;
}
return true;
}
public boolean equals(Object other) {
if (other instanceof VCFGenotypeRecord) {
if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false;
if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false;
if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false;
if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false;
return true;
}
return false;
}
/**
* output a string representation of the VCFGenotypeRecord, given the alternate alleles
*
* @param altAlleles the alternate alleles, needed for toGenotypeString()
* @param genotypeFormatStrings genotype format strings
*
* @return a string
*/
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings) {
StringBuilder builder = new StringBuilder();
builder.append(toGenotypeString(altAlleles));
for ( String field : genotypeFormatStrings ) {
if ( field.equals(GENOTYPE_KEY) )
continue;
String value = mFields.get(field);
if ( value == null && field.equals(OLD_DEPTH_KEY) )
value = mFields.get(DEPTH_KEY);
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
if ( value == null || value.equals("") )
builder.append(getMissingFieldValue(field));
else
builder.append(value);
}
return builder.toString();
}
/**
* output a string representation of an empty genotype
*
* @param genotypeFormatStrings genotype format strings
*
* @return a string
*/
public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) {
StringBuilder builder = new StringBuilder();
builder.append(EMPTY_GENOTYPE);
for ( String field : genotypeFormatStrings ) {
if ( field.equals(GENOTYPE_KEY) )
continue;
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
builder.append(getMissingFieldValue(field));
}
return builder.toString();
}
public static String getMissingFieldValue(String field) {
String result = "";
if ( field.equals(GENOTYPE_QUALITY_KEY) )
result = String.valueOf(MISSING_GENOTYPE_QUALITY);
else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) )
result = String.valueOf(MISSING_DEPTH);
else if ( field.equals(GENOTYPE_FILTER_KEY) )
result = UNFILTERED;
else if ( field.equals(GENOTYPE_LIKELIHOODS_KEY) )
result = "0,0,0";
// TODO -- support haplotype quality
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
return result;
}
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(VCFHeaderVersion version) {
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype"));
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
result.add(new VCFFormatHeaderLine(GENOTYPE_LIKELIHOODS_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
return result;
}
public void replaceFields(HashMap<String,String> newFields) {
mFields.clear();
for ( String s : newFields.keySet() ) {
mFields.put(s,newFields.get(s));
}
}
}

View File

@ -0,0 +1,137 @@
package org.broad.tribble.vcf;
import java.util.*;
/**
* @author aaron
* <p/>
* Class VCFHeader
* <p/>
* A class representing the VCF header
*/
public class VCFHeader {
// the manditory header fields
public enum HEADER_FIELDS {
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
}
// the associated meta data
private final Set<VCFHeaderLine> mMetaData;
// the list of auxillary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
// the character string that indicates meta data
public static final String METADATA_INDICATOR = "##";
// the header string indicator
public static final String HEADER_INDICATOR = "#";
// our header versionVCF
private VCFHeaderVersion versionVCF;
/** do we have genotying data? */
private boolean hasGenotypingData = false;
/**
* create a VCF header, given a list of meta data and auxillary tags
*
* @param metaData the meta data associated with this header
*/
public VCFHeader(Set<VCFHeaderLine> metaData) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
checkVCFVersion();
}
/**
* create a VCF header, given a list of meta data and auxillary tags
*
* @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names
*/
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT"))
mGenotypeSampleNames.add(col);
}
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
checkVCFVersion();
}
/**
* check our metadata for a VCF versionVCF tag, and throw an exception if the versionVCF is out of date
* or the versionVCF is not present
*/
// TODO: fix this function
public void checkVCFVersion() {
VCFHeaderVersion version;
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
for ( VCFHeaderLine line : mMetaData )
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey());
if (version == null)
{
toRemove.add(line);
}
/**throw new RuntimeException("VCF version " + line.getValue() +
" is not supported; only versionVCF " + VCFHeaderVersion.VCF3_2 + " and greater can be used");*/
else return;
}
// remove old header lines for now,
mMetaData.removeAll(toRemove);
mMetaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString()));
}
/**
* get the header fields in order they're presented in the input file (which is now required to be
* the order presented in the spec).
*
* @return a set of the header fields, in order
*/
public Set<HEADER_FIELDS> getHeaderFields() {
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
for (HEADER_FIELDS field : HEADER_FIELDS.values())
fields.add(field);
return fields;
}
/**
* get the meta data, associated with this header
*
* @return a set of the meta data
*/
public Set<VCFHeaderLine> getMetaData() {
return mMetaData;
}
/**
* get the genotyping sample names
*
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
*/
public Set<String> getGenotypeSamples() {
return mGenotypeSampleNames;
}
/**
* do we have genotyping data?
*
* @return true if we have genotyping columns, false otherwise
*/
public boolean hasGenotypingData() {
return hasGenotypingData;
}
/** @return the column count, */
public int getColumnCount() {
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
}
}

View File

@ -0,0 +1,100 @@
package org.broad.tribble.vcf;
/**
* @author ebanks
* <p/>
* Class VCFHeaderLine
* <p/>
* A class representing a key=value entry in the VCF header
*/
public class VCFHeaderLine implements Comparable {
private String stringRep = null;
private String mKey = null;
private String mValue = null;
protected VCFHeaderVersion mVersion = null;
/**
* create a VCF header line
*
* @param key the key for this header line
* @param value the value for this header line
*/
public VCFHeaderLine(String key, String value, VCFHeaderVersion version) {
mKey = key;
mValue = value;
mVersion = version;
}
/**
* create a VCF header line
*
* @param key the key for this header line
* @param value the value for this header line
*/
public VCFHeaderLine(String key, String value) {
mKey = key;
mValue = value;
mVersion = VCFHeaderVersion.VCF3_3;
}
/**
* Get the key
*
* @return the key
*/
public String getKey() {
return mKey;
}
/**
* Set the key
*
* @param key the key for this header line
*/
public void setKey(String key) {
mKey = key;
stringRep = null;
}
/**
* Get the value
*
* @return the value
*/
public String getValue() {
return mValue;
}
/**
* Set the value
*
* @param value the value for this header line
*/
public void setValue(String value) {
mValue = value;
stringRep = null;
}
public String toString() {
if ( stringRep == null )
stringRep = makeStringRep();
return stringRep;
}
protected String makeStringRep() {
return mKey + "=" + mValue;
}
public boolean equals(Object o) {
if ( !(o instanceof VCFHeaderLine) )
return false;
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
}
public int compareTo(Object other) {
return toString().compareTo(other.toString());
}
}

View File

@ -0,0 +1,161 @@
package org.broad.tribble.vcf;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: aaron
* Date: Jun 17, 2010
* Time: 12:28:46 PM
* To change this template use File | Settings | File Templates.
*/
public class VCFHeaderLineTranslator {
private static Map<VCFHeaderVersion,VCFLineParser> mapping;
static {
mapping = new HashMap<VCFHeaderVersion,VCFLineParser>();
mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser());
mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser());
}
public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
}
public static String toValue(VCFHeaderVersion version, Map<String,Object> keyValues) {
return mapping.get(version).toValue(keyValues);
}
}
interface VCFLineParser {
public String toValue(Map<String,? extends Object> keyValues);
public Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder);
}
/**
* a class that handles the to and from disk for VCF 4 lines
*/
class VCF4Parser implements VCFLineParser {
Set<String> bracketed = new HashSet<String>();
/**
* create a string of a mapping pair for the target VCF version
* @param keyValues a mapping of the key->value pairs to output
* @return a string, correctly formatted
*/
public String toValue(Map<String, ? extends Object> keyValues) {
StringBuilder builder = new StringBuilder();
builder.append("<");
boolean start = true;
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
if (start) start = false;
else builder.append(",");
builder.append(entry.getKey());
builder.append("=");
builder.append(entry.getValue().toString().contains(",") ||
entry.getValue().toString().contains(" ") ||
entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue());
}
builder.append(">");
return builder.toString();
}
/**
* parse a VCF4 line
* @param valueLine the line
* @return a mapping of the tags parsed out
*/
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
// our return map
Map<String, String> ret = new LinkedHashMap<String, String>();
// a builder to store up characters as we go
StringBuilder builder = new StringBuilder();
// store the key when we're parsing out the values
String key = "";
// where are we in the stream of characters?
int index = 0;
// are we inside a quotation? we don't special case ',' then
boolean inQuote = false;
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
for (char c: valueLine.toCharArray()) {
switch (c) {
case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it
case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list
case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
case (',') : if (!inQuote) { ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; } // drop the current key value to the return map
default: builder.append(c); // otherwise simply append to the current string
}
index++;
}
// validate the tags against the expected list
index = 0;
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
for (String str : ret.keySet()) {
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
index++;
}
return ret;
}
}
class VCF3Parser implements VCFLineParser {
public String toValue(Map<String, ? extends Object> keyValues) {
StringBuilder builder = new StringBuilder();
boolean start = true;
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
if (start) start = false;
else builder.append(",");
builder.append(entry.getValue().toString().contains(",") || entry.getValue().toString().contains(" ")? "\""+ entry.getValue() + "\"" : entry.getValue());
}
return builder.toString();
}
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
// our return map
Map<String, String> ret = new LinkedHashMap<String, String>();
// a builder to store up characters as we go
StringBuilder builder = new StringBuilder();
// store the key when we're parsing out the values
String key = "";
// where are we in the stream of characters?
int index = 0;
// where in the expected tag order are we?
int tagIndex = 0;
// are we inside a quotation? we don't special case ',' then
boolean inQuote = false;
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
for (char c: valueLine.toCharArray()) {
switch (c) {
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map
default: builder.append(c); // otherwise simply append to the current string
}
index++;
}
ret.put(expectedTagOrder.get(tagIndex++),builder.toString());
// validate the tags against the expected list
index = 0;
if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
for (String str : ret.keySet()){
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
index++;
}
return ret;
}
}

View File

@ -0,0 +1,78 @@
package org.broad.tribble.vcf;
/**
* information that identifies each header version
*/
public enum VCFHeaderVersion {
VCF3_2("VCRv3.2","format"),
VCF3_3("VCFv3.3","fileformat"),
VCF4_0("VCFv4.0","fileformat");
private final String versionString;
private final String formatString;
/**
* create the enum, privately, using:
* @param vString the version string
* @param fString the format string
*/
VCFHeaderVersion(String vString, String fString) {
this.versionString = vString;
this.formatString = fString;
}
/**
* get the header version
* @param version the version string
* @param format the format string
* @return a VCFHeaderVersion object
*/
public static VCFHeaderVersion toHeaderVersion(String version, String format) {
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
if (hv.versionString.equals(version) && hv.formatString.equals(format))
return hv;
return null;
}
/**
* get the header version
* @param version the version string
* @return a VCFHeaderVersion object
*/
public static VCFHeaderVersion toHeaderVersion(String version) {
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
if (hv.versionString.equals(version))
return hv;
return null;
}
/**
* are we a valid version string of some type
* @param version the version string
* @return true if we're valid of some type, false otherwise
*/
public static boolean isVersionString(String version){
return toHeaderVersion(version) != null;
}
/**
* are we a valid format string for some type
* @param format the format string
* @return true if we're valid of some type, false otherwise
*/
public static boolean isFormatString(String format){
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
if (hv.formatString.equals(format))
return true;
return false;
}
public String getVersionString() {
return versionString;
}
public String getFormatString() {
return formatString;
}
}

View File

@ -0,0 +1,121 @@
package org.broad.tribble.vcf;
import org.broad.tribble.util.ParsingUtils;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
/**
* @author ebanks
* <p/>
* Class VCFInfoHeaderLine
* <p/>
* A class representing a key=value entry for INFO fields in the VCF header
*/
public class VCFInfoHeaderLine extends VCFHeaderLine {
// the info field types
public enum INFO_TYPE {
Integer, Float, String, Character, Flag;
public Object convert(String value) {
switch (this) {
case Integer:
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
case Float:
return java.lang.Float.valueOf(value);
case String:
case Character:
return value;
case Flag:
return value.equals("0") ? false : true;
default:
throw new IllegalStateException("INFO_TYPE." + this + " doesn't have a set conversion approach");
}
}
}
private String mName;
private int mCount;
private String mDescription;
private INFO_TYPE mType;
// info line numerical values are allowed to be unbounded (or unknown), which is
// marked with a dot (.)
public static int UNBOUNDED = Integer.MIN_VALUE;
public static String UNBOUNDED_ENCODING = ".";
/**
* create a VCF info header line
*
* @param name the name for this header line
* @param count the count for this header line
* @param type the type for this header line
* @param description the description for this header line
*/
public VCFInfoHeaderLine(String name, int count, INFO_TYPE type, String description) {
super("INFO", "");
mName = name;
mCount = count;
mType = type;
mDescription = description;
}
/**
* create a VCF info header line
*
* @param line the header line
* @param version the VCF version
*/
protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
super("INFO", "", version);
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
mName = mapping.get("ID");
mCount = mapping.get("Number").equals(UNBOUNDED_ENCODING) ? UNBOUNDED : Integer.valueOf(mapping.get("Number"));
mType = INFO_TYPE.valueOf(mapping.get("Type"));
mDescription = mapping.get("Description");
}
protected String makeStringRep() {
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
return String.format("INFO=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
else if (mVersion == VCFHeaderVersion.VCF4_0) {
Map<String,Object> map = new LinkedHashMap<String,Object>();
map.put("ID",mName);
map.put("Number",mCount == UNBOUNDED ? UNBOUNDED_ENCODING : mCount);
map.put("Type",mType);
map.put("Description",mDescription);
return "INFO=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
}
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
}
public boolean equals(Object o) {
if ( !(o instanceof VCFInfoHeaderLine) )
return false;
VCFInfoHeaderLine other = (VCFInfoHeaderLine)o;
return mName.equals(other.mName) &&
mCount == other.mCount &&
mDescription.equals(other.mDescription) &&
mType == other.mType;
}
public String getmName() {
return mName;
}
public int getmCount() {
return mCount;
}
public String getmDescription() {
return mDescription;
}
public INFO_TYPE getmType() {
return mType;
}
}

View File

@ -0,0 +1,206 @@
package org.broad.tribble.vcf;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */
public class VCFReaderUtils {
// our pattern matching for the genotype mFields
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
/**
* create a VCF header, given an array of strings that all start with at least the # character. This function is
* package protected so that the VCFReaderUtils can access this function
*
* @param headerStrings a list of header strings
*
* @return a VCF Header created from the list of stinrgs
*/
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> auxTags = new LinkedHashSet<String>();
// iterate over all the passed in strings
for ( String str : headerStrings ) {
if ( !str.startsWith("##") ) {
String[] strings = str.substring(1).split("\\t");
// the columns should be in order according to Richard Durbin
int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
try {
if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]);
} catch (IllegalArgumentException e) {
throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name.");
}
arrayIndex++;
}
while (arrayIndex < strings.length) {
if (!strings[arrayIndex].equals("FORMAT"))
auxTags.add(strings[arrayIndex]);
arrayIndex++;
}
} else {
if ( str.startsWith("##INFO=") )
metaData.add(new VCFInfoHeaderLine(str.substring(7),version));
else if ( str.startsWith("##FILTER=") )
metaData.add(new VCFFilterHeaderLine(str.substring(9),version));
else if ( str.startsWith("##FORMAT=") )
metaData.add(new VCFFormatHeaderLine(str.substring(9),version));
else {
int equals = str.indexOf("=");
if ( equals != -1 )
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version));
}
}
}
return new VCFHeader(metaData, auxTags);
}
/**
* create the next VCFRecord, given the input line
*
* @param line the line from the file
* @param mHeader the VCF header
*
* @return the VCFRecord
*/
public static VCFRecord createRecord(String line, VCFHeader mHeader) {
return createRecord(line, mHeader, false);
}
public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) {
// things we need to make a VCF record
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
String tokens[] = line.split("\\t");
// check to ensure that the column count of tokens is right
if (tokens.length != mHeader.getColumnCount()) {
throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line);
}
int index = 0;
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields())
values.put(field, tokens[index++]);
// if we have genotyping data, we try and extract the genotype fields
if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) {
String mFormatString = tokens[index];
String keyStrings[] = mFormatString.split(":");
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
index++;
String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
for (String str : mHeader.getGenotypeSamples()) {
genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
index++;
}
VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords);
// associate the genotypes with this new record
for ( VCFGenotypeRecord gr : genotypeRecords )
gr.setVCFRecord(vrec);
return vrec;
}
return new VCFRecord(values);
}
/**
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
*
* @param sampleName the sample name
* @param formatString the format string for this record, which contains the keys for the genotype parameters
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
* @param referenceBase the reference base
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase);
}
/**
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
*
* @param sampleName the sample name
* @param keyStrings the split format string for this record, which contains the keys for the genotype parameters
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
* @param referenceBase the reference base
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
// parameters to create the VCF genotype record
HashMap<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
for (String key : keyStrings) {
String parse;
int nextDivider;
if (!genotypeString.contains(":")) {
nextDivider = genotypeString.length();
parse = genotypeString;
} else {
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
parse = genotypeString.substring(0, nextDivider);
}
if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) {
Matcher m = gtPattern.matcher(parse);
if (!m.matches())
throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
phase = VCFGenotypeRecord.determinePhase(m.group(2));
addAllele(m.group(1), altAlleles, referenceBase, bases);
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
} else {
if ( parse.length() == 0 )
parse = VCFGenotypeRecord.getMissingFieldValue(key);
tagToValue.put(key, parse);
}
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
}
if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) )
tagToValue.clear();
// catch some common errors, either there are too many field keys or there are two many field values
else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected "
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
else if ( genotypeString.length() > 0 )
throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString
+ ". This most likely means that the format string is shorter then the value fields.");
VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase);
for ( Map.Entry<String, String> entry : tagToValue.entrySet() )
rec.setField(entry.getKey(), entry.getValue());
return rec;
}
/**
* add an alternate allele to the list of alleles we have for a VCF genotype record
*
* @param alleleNumber the allele number, as a string
* @param altAlleles the list of alternate alleles
* @param referenceBase the reference base
* @param bases the list of bases for this genotype call
*/
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
} else {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds
if (alleleValue < 0 || alleleValue > altAlleles.length)
throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
if (alleleValue == 0)
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
else
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
}
}
}

View File

@ -0,0 +1,682 @@
package org.broad.tribble.vcf;
import org.broad.tribble.Feature;
import org.broad.tribble.util.ParsingUtils;
import java.util.*;
/** the basic VCF record type */
public class VCFRecord implements Feature {
// standard info field keys
public static final String ANCESTRAL_ALLELE_KEY = "AA";
public static final String ALLELE_COUNT_KEY = "AC";
public static final String ALLELE_FREQUENCY_KEY = "AF";
public static final String ALLELE_NUMBER_KEY = "AN";
public static final String RMS_BASE_QUALITY_KEY = "BQ";
public static final String DBSNP_KEY = "DB";
public static final String DEPTH_KEY = "DP";
public static final String HAPMAP2_KEY = "H2";
public static final String HAPMAP3_KEY = "H3";
public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
public static final String SAMPLE_NUMBER_KEY = "NS";
public static final String STRAND_BIAS_KEY = "SB";
// commonly used strings that are in the standard
public static final String FORMAT_FIELD_SEPERATOR = ":";
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
public static final String FIELD_SEPERATOR = "\t";
public static final String FILTER_CODE_SEPERATOR = ";";
public static final String INFO_FIELD_SEPERATOR = ";";
// default values
public static final String UNFILTERED = ".";
public static final String PASSES_FILTERS = "0";
public static final String EMPTY_INFO_FIELD = ".";
public static final String EMPTY_ID_FIELD = ".";
public static final String EMPTY_ALLELE_FIELD = ".";
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
public static final int MISSING_GENOTYPE_QUALITY = -1;
// the reference base
private String mReferenceBases;
// our location
private String mContig;
private int mPosition;
// our id
private String mID;
// the alternate bases
private final List<VCFGenotypeEncoding> mAlts = new ArrayList<VCFGenotypeEncoding>();
// our qual value
private double mQual;
// our filter string
private String mFilterString;
// our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests)
private final Map<String, String> mInfoFields = new TreeMap<String, String>();
// our genotype formatting string
private String mGenotypeFormatString;
// the vcf header we're associated with
private VCFHeader vcfHeader = null;
// our genotype sample fields
private final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
/**
* given a reference base, a location, and the format string, create a VCF record.
*
* @param referenceBases the reference bases to use
* @param contig our contig
* @param start the start location
* @param genotypeFormatString the format string
*/
public VCFRecord(String referenceBases, String contig, int start, String genotypeFormatString) {
setReferenceBase(referenceBases);
setLocation(contig, start);
mGenotypeFormatString = genotypeFormatString;
}
/**
* given the values for each of the columns, create a VCF record.
*
* @param columnValues a mapping of header strings to values
* @param genotypeFormatString the format string for the genotype records
* @param genotypeRecords the genotype records
*/
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues, String genotypeFormatString, List<VCFGenotypeRecord> genotypeRecords) {
extractFields(columnValues);
mGenotypeRecords.addAll(genotypeRecords);
mGenotypeFormatString = genotypeFormatString;
}
/**
* given the values for each of the columns, create a VCF record.
*
* @param columnValues a mapping of header strings to values
*/
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
extractFields(columnValues);
mGenotypeFormatString = "";
}
/**
* create a VCF record
*
* @param referenceBases the reference bases to use
* @param contig the contig this variant is on
* @param position our position
* @param ID our ID string
* @param altBases the list of alternate bases
* @param qual the qual field
* @param filters the filters used on this variant
* @param infoFields the information fields
* @param genotypeFormatString the format string
* @param genotypeObjects the genotype objects
*/
public VCFRecord(String referenceBases,
String contig,
long position,
String ID,
List<VCFGenotypeEncoding> altBases,
double qual,
String filters,
Map<String, String> infoFields,
String genotypeFormatString,
List<VCFGenotypeRecord> genotypeObjects) {
setReferenceBase(referenceBases);
setLocation(contig, position);
this.mID = ID;
for (VCFGenotypeEncoding alt : altBases)
this.addAlternateBase(alt);
this.setQual(qual);
this.setFilterString(filters);
this.mInfoFields.putAll(infoFields);
this.mGenotypeFormatString = genotypeFormatString;
this.mGenotypeRecords.addAll(genotypeObjects);
}
/**
* extract the field values from the passed in array
*
* @param columnValues a map of the header fields to values
*/
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
String chrom = null;
long position = -1;
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
switch (val) {
case CHROM:
chrom = columnValues.get(val);
break;
case POS:
position = Integer.valueOf(columnValues.get(val));
break;
case ID:
setID(columnValues.get(val));
break;
case REF:
if (columnValues.get(val).length() != 1)
throw new IllegalArgumentException("Reference base should be a single character");
setReferenceBase(columnValues.get(val));
break;
case ALT:
String values[] = columnValues.get(val).split(",");
for (String alt : values)
addAlternateBase(new VCFGenotypeEncoding(alt));
break;
case QUAL:
setQual(Double.valueOf(columnValues.get(val)));
break;
case FILTER:
setFilterString(columnValues.get(val));
break;
case INFO:
String vals[] = columnValues.get(val).split(";");
for (String alt : vals) {
if ( alt.equals(EMPTY_INFO_FIELD) )
continue;
String keyVal[] = alt.split("=");
if ( keyVal.length == 1 )
addInfoField(keyVal[0], "");
else if (keyVal.length == 2)
addInfoField(keyVal[0], keyVal[1]);
else
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
}
break;
}
}
setLocation(chrom, position);
}
/**
* do we have genotyping data
*
* @return true if we have genotyping data, false otherwise
*/
public boolean hasGenotypeData() {
return (mGenotypeRecords.size() > 0);
}
/**
* @return the ID value for this record
*/
public String getID() {
return mID == null ? EMPTY_ID_FIELD : mID;
}
/**
* get the reference base
*
* @return either A, T, C, G, or N
*/
public String getReference() {
return mReferenceBases;
}
/**
* get the alternate allele strings
*
* @return an array of strings representing the alt alleles, or null if there are none
*/
public List<String> getAlternateAlleleList() {
ArrayList<String> alts = new ArrayList<String>();
for ( VCFGenotypeEncoding alt : mAlts )
alts.add(alt.getBases());
return alts;
}
public List<VCFGenotypeEncoding> getAlternateAlleles() {
return mAlts;
}
public boolean hasAlternateAllele() {
for ( VCFGenotypeEncoding alt : mAlts ) {
if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
return true;
}
return false;
}
public boolean isBiallelic() {
return getAlternateAlleles().size() == 1;
}
public boolean isReference() {
return !hasAlternateAllele();
}
public List<String> getAlleleList() {
ArrayList<String> list = new ArrayList<String>();
list.add(getReference());
list.addAll(getAlternateAlleleList());
return list;
}
public double getNonRefAlleleFrequency() {
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
} else {
// this is the poor man's AF
if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) {
String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(",");
if ( splt.length > 0 ) {
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY)));
}
}
}
return 0.0;
}
public VCFGenotypeEncoding.TYPE getType() {
VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType();
for (int i = 1; i < mAlts.size(); i++) {
if ( mAlts.get(i).getType() != type )
return VCFGenotypeEncoding.TYPE.MIXED; // if we have more than one type, return mixed
}
return type;
}
public boolean isDeletion() {
return getType() == VCFGenotypeEncoding.TYPE.DELETION;
}
public boolean isInsertion() {
return getType() == VCFGenotypeEncoding.TYPE.INSERTION;
}
public boolean isIndel() {
return isDeletion() || isInsertion();
}
public boolean isSNP() {
return getType() == VCFGenotypeEncoding.TYPE.SINGLE_BASE;
}
public boolean isNovel() {
return ( ! isInDBSNP() ) && ( ! isInHapmap() );
}
public boolean isInDBSNP() {
return ( ( mID != null && ! mID.equals(".") ) || ( mInfoFields.get(DBSNP_KEY) != null && mInfoFields.get(DBSNP_KEY).equals("1") ) );
}
public boolean isInHapmap() {
if ( mInfoFields.get(HAPMAP2_KEY) != null && mInfoFields.get(HAPMAP2_KEY).equals("1") ) {
return true;
} else {
return ( mInfoFields.get(HAPMAP3_KEY) != null && mInfoFields.get(HAPMAP3_KEY).equals("1") );
}
}
public char getAlternativeBaseForSNP() {
if ( !isSNP() && !isBiallelic() )
throw new IllegalStateException("This record does not represent a SNP");
return mAlts.get(0).getBases().charAt(0);
}
public char getReferenceForSNP() {
if ( !isSNP() )
throw new IllegalStateException("This record does not represent a SNP");
return getReference().charAt(0);
}
/**
* @return the phred-scaled quality score
*/
public double getQual() {
return mQual;
}
public int getPosition() {
return mPosition;
}
public boolean isMissingQual() {
return (int)mQual == MISSING_GENOTYPE_QUALITY;
}
/**
* @return the -log10PError
*/
public double getNegLog10PError() {
return mQual / 10.0;
}
/**
* get the filter criteria
*
* @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied
*/
public String[] getFilteringCodes() {
if (mFilterString == null) return new String[]{UNFILTERED};
return mFilterString.split(FILTER_CODE_SEPERATOR);
}
public boolean isFiltered() {
String[] codes = getFilteringCodes();
return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS);
}
// public boolean hasFilteringCodes() {
// return mFilterString != null;
// }
public String getFilterString() {
return mFilterString;
}
/**
* get the information key-value pairs as a Map<>
*
* @return a map, of the info key-value pairs
*/
public final Map<String, String> getInfoValues() {
return mInfoFields;
}
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
return mGenotypeRecords;
}
/**
* @return a List of the sample names
*/
public String[] getSampleNames() {
String names[] = new String[mGenotypeRecords.size()];
for (int i = 0; i < mGenotypeRecords.size(); i++) {
names[i] = mGenotypeRecords.get(i).getSampleName();
}
return names;
}
public VCFGenotypeRecord getGenotype(final String sampleName) {
for ( VCFGenotypeRecord rec : getVCFGenotypeRecords() ) {
if ( rec.getSampleName().equals(sampleName) ) {
return rec;
}
}
return null;
}
public String getGenotypeFormatString() {
return mGenotypeFormatString;
}// the formatting string for our genotype records
public void setGenotypeFormatString(String newFormatString) {
mGenotypeFormatString = newFormatString;
}
public void setReferenceBase(String reference) {
mReferenceBases = reference.toUpperCase();
}
public void setLocation(String chrom, long position) {
if ( chrom == null )
throw new IllegalArgumentException("Chromosomes cannot be missing");
if ( position < 0 )
throw new IllegalArgumentException("Position values must be greater than 0");
this.mContig = chrom;
this.mPosition = (int)position;
}
public void setID(String ID) {
mID = ID;
}
public void setQual(double qual) {
if ( qual < 0 && (int)qual != MISSING_GENOTYPE_QUALITY )
throw new IllegalArgumentException("Qual values cannot be negative unless they are " + MISSING_GENOTYPE_QUALITY + " ('unknown')");
mQual = qual;
}
public void setFilterString(String filterString) {
mFilterString = filterString;
}
public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) {
mGenotypeRecords.add(mGenotypeRecord);
}
public void setGenotypeRecords(List<VCFGenotypeRecord> records) {
mGenotypeRecords.clear();
for ( VCFGenotypeRecord g : records )
addGenotypeRecord(g);
}
/**
* add an alternate base to our alternate base list. All bases are uppercased
* before being added to the list.
*
* @param base the base to add
*/
public void addAlternateBase(VCFGenotypeEncoding base) {
if (!mAlts.contains(base)) mAlts.add(base);
}
public void setAlternateBases(List<VCFGenotypeEncoding> bases) {
mAlts.clear();
for ( VCFGenotypeEncoding e : bases )
addAlternateBase(e);
}
/**
* add an info field to the record
*
* @param key the key, from the spec or a user created key
* @param value it's value as a string
*/
public void addInfoField(String key, String value) {
//System.out.printf("Adding info field %s=%s%n", key, value);
mInfoFields.put(key, value);
}
public void printInfoFields() {
for ( Map.Entry<String, String> e : mInfoFields.entrySet() ) {
System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this);
}
}
/**
* add an info field to the record
*
* @param m A map from info keys to info values
*/
public void addInfoFields(Map<String,String> m) {
for ( Map.Entry<String, String> e : m.entrySet() )
addInfoField(e.getKey(), e.getValue());
}
/**
* the generation of a string representation, which is used by the VCF writer
*
* @param header the VCF header for this VCF Record
* @return a string
*/
public String toStringEncoding(VCFHeader header) {
StringBuilder builder = new StringBuilder();
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
builder.append(mContig);
builder.append(FIELD_SEPERATOR);
builder.append(mPosition);
builder.append(FIELD_SEPERATOR);
builder.append(getID());
builder.append(FIELD_SEPERATOR);
builder.append(getReference());
builder.append(FIELD_SEPERATOR);
List<VCFGenotypeEncoding> alts = getAlternateAlleles();
if ( alts.size() > 0 ) {
builder.append(alts.get(0));
for ( int i = 1; i < alts.size(); i++ ) {
builder.append(",");
builder.append(alts.get(i));
}
} else {
builder.append(EMPTY_ALLELE_FIELD);
}
builder.append(FIELD_SEPERATOR);
if ( (int)mQual == MISSING_GENOTYPE_QUALITY )
builder.append(MISSING_GENOTYPE_QUALITY);
else
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
builder.append(FIELD_SEPERATOR);
builder.append(ParsingUtils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
builder.append(FIELD_SEPERATOR);
builder.append(createInfoString());
if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) {
// try {
addGenotypeData(builder, header);
// } catch (Exception e) {
// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
// throw new RuntimeException(e);
// }
}
return builder.toString();
}
/**
* create the info string
*
* @return a string representing the infomation fields
*/
protected String createInfoString() {
StringBuffer info = new StringBuffer();
boolean isFirst = true;
for (Map.Entry<String, String> entry : mInfoFields.entrySet()) {
if ( isFirst )
isFirst = false;
else
info.append(INFO_FIELD_SEPERATOR);
info.append(entry.getKey());
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
info.append("=");
info.append(entry.getValue());
}
}
return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString();
}
/**
* add the genotype data
*
* @param builder the string builder
* @param header the header object
*/
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords());
StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < getVCFGenotypeRecords().size() ) {
for ( String sample : gMap.keySet() ) {
if ( !header.getGenotypeSamples().contains(sample) )
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
else
header.getGenotypeSamples().remove(sample);
}
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
}
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
for ( String genotype : header.getGenotypeSamples() ) {
tempStr.append(FIELD_SEPERATOR);
if ( gMap.containsKey(genotype) ) {
VCFGenotypeRecord rec = gMap.get(genotype);
tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings));
gMap.remove(genotype);
} else {
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings));
}
}
if ( gMap.size() != 0 ) {
for ( String sample : gMap.keySet() )
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
}
builder.append(tempStr);
}
/**
* compare two VCF records
*
* @param other the other VCF record
* @return true if they're equal
*/
public boolean equals(VCFRecord other) {
if (!this.mAlts.equals(other.mAlts)) return false;
if (!this.mReferenceBases.equals(other.mReferenceBases)) return false;
if (!this.mContig.equals(other.mContig)) return false;
if (mPosition != other.mPosition) return false;
if (!this.mID.equals(other.mID)) return false;
if (this.mQual != other.mQual) return false;
if ( this.mFilterString == null ) {
if ( other.mFilterString != null ) return false;
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false;
return true;
}
/**
* create a genotype mapping from a list and their sample names
*
* @param list a list of genotype samples
* @return a mapping of the sample name to VCF genotype record
*/
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
for (int i = 0; i < list.size(); i++) {
VCFGenotypeRecord rec = list.get(i);
map.put(rec.getSampleName(), rec);
}
return map;
}
/** Return the features reference sequence name, e.g chromosome or contig */
public String getChr() {
return this.mContig;
}
/** Return the start position in 1-based coordinates (first base is 1) */
public int getStart() {
return this.mPosition;
}
/**
* Return the end position following 1-based fully closed conventions. The length of a feature is
* end - start + 1;
*/
public int getEnd() {
return this.mPosition;
}
/**
* set the VCF header we're associated with
* @param header the header
*/
void setHeader(VCFHeader header) {
vcfHeader = header;
}
/**
* get the associated header
* @return the VCF Header
*/
public VCFHeader getHeader() {
return vcfHeader;
}
}