moving VCF 3.3 back into the GATK so Guillermo can make changes for VCF 4 output
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3639 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b3edb7dc08
commit
d3848745ab
|
|
@ -0,0 +1,125 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.exception.CodecLineParsingException;
|
||||
import org.broad.tribble.util.LineReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VCFCodec
|
||||
*
|
||||
* The codec for VCF, which relies on VCFReaderUtils to do most of the processing
|
||||
*/
|
||||
public class VCFCodec implements FeatureCodec {
|
||||
|
||||
// we have to store the list of strings that make up the header until they're needed
|
||||
private List<String> headerStrings = new ArrayList<String>();
|
||||
private VCFHeader header = null;
|
||||
private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;
|
||||
|
||||
|
||||
// some classes need to transform the line before
|
||||
private LineTransform transformer = null;
|
||||
|
||||
/**
|
||||
* Fast path to get the location of the Feature for indexing
|
||||
* @param line the input line to decode
|
||||
* @return
|
||||
*/
|
||||
public Feature decodeLoc(String line) {
|
||||
return reallyDecode(line, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a line as a Feature.
|
||||
*
|
||||
* @param line
|
||||
*
|
||||
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
|
||||
* a comment)
|
||||
*/
|
||||
public Feature decode(String line) {
|
||||
return reallyDecode(line, false);
|
||||
}
|
||||
|
||||
private Feature reallyDecode(String line, boolean justLocationPlease ) {
|
||||
// transform the line, if we have a transform to do
|
||||
if (transformer != null) line = transformer.lineTransform(line);
|
||||
if (line.startsWith("#"))
|
||||
return null;
|
||||
|
||||
// make a VCFRecord of the line and return it
|
||||
VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
|
||||
if ( ! justLocationPlease ) rec.setHeader(header);
|
||||
return rec;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the # of header lines for this file. We use this to parse out the header
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public int readHeader(LineReader reader) {
|
||||
String line = "";
|
||||
try {
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.startsWith("##")) {
|
||||
headerStrings.add(line);
|
||||
}
|
||||
else if (line.startsWith("#")) {
|
||||
headerStrings.add(line);
|
||||
header = VCFReaderUtils.createHeader(headerStrings,version);
|
||||
return headerStrings.size();
|
||||
}
|
||||
else {
|
||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IO Exception ", e);
|
||||
}
|
||||
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
/**
|
||||
* @return VCFRecord.class
|
||||
*/
|
||||
public Class getFeatureType() {
|
||||
return VCFRecord.class;
|
||||
}
|
||||
|
||||
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
||||
if (!clazz.equals(VCFHeader.class))
|
||||
throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
|
||||
return header;
|
||||
}
|
||||
|
||||
public static interface LineTransform {
|
||||
public String lineTransform(String line);
|
||||
}
|
||||
|
||||
public LineTransform getTransformer() {
|
||||
return transformer;
|
||||
}
|
||||
|
||||
public void setTransformer(LineTransform transformer) {
|
||||
this.transformer = transformer;
|
||||
}
|
||||
|
||||
public VCFHeaderVersion getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void setVersion(VCFHeaderVersion version) {
|
||||
this.version = version;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* A class representing a key=value entry for FILTER fields in the VCF header
|
||||
*/
|
||||
public class VCFFilterHeaderLine extends VCFHeaderLine {
|
||||
|
||||
private String mName;
|
||||
private String mDescription;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF filter header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFFilterHeaderLine(String name, String description) {
|
||||
super("FILTER", "");
|
||||
mName = name;
|
||||
mDescription = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF info header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the vcf header version
|
||||
*/
|
||||
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super("FILTER", "", version);
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
||||
mName = mapping.get("ID");
|
||||
mDescription = mapping.get("Description");
|
||||
}
|
||||
|
||||
protected String makeStringRep() {
|
||||
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||
return String.format("FILTER=%s,\"%s\"", mName, mDescription);
|
||||
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID",mName);
|
||||
map.put("Description",mDescription);
|
||||
return "FILTER=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||
}
|
||||
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFFilterHeaderLine) )
|
||||
return false;
|
||||
VCFFilterHeaderLine other = (VCFFilterHeaderLine)o;
|
||||
return mName.equals(other.mName) &&
|
||||
mDescription.equals(other.mDescription);
|
||||
}
|
||||
|
||||
public String getmName() {
|
||||
return mName;
|
||||
}
|
||||
|
||||
public String getmDescription() {
|
||||
return mDescription;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,118 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFFormatHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
|
||||
*/
|
||||
public class VCFFormatHeaderLine extends VCFHeaderLine {
|
||||
|
||||
// the format field types
|
||||
public enum FORMAT_TYPE {
|
||||
Integer, Float, String;
|
||||
public Object convert(String value) {
|
||||
switch (this) {
|
||||
case Integer:
|
||||
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
|
||||
case Float:
|
||||
return java.lang.Float.valueOf(value);
|
||||
case String:
|
||||
return value;
|
||||
default:
|
||||
throw new IllegalStateException("field." + this + " doesn't have a set conversion approach");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String mName;
|
||||
private int mCount;
|
||||
private String mDescription;
|
||||
private FORMAT_TYPE mType;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) {
|
||||
super("FORMAT", "");
|
||||
mName = name;
|
||||
mCount = count;
|
||||
mType = type;
|
||||
mDescription = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the VCF header version
|
||||
*
|
||||
*/
|
||||
protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super("FORMAT", "", version);
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||
mName = mapping.get("ID");
|
||||
mCount = Integer.valueOf(mapping.get("Number"));
|
||||
mType = FORMAT_TYPE.valueOf(mapping.get("Type"));
|
||||
mDescription = mapping.get("Description");
|
||||
}
|
||||
|
||||
protected String makeStringRep() {
|
||||
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||
return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID",mName);
|
||||
map.put("Number",mCount);
|
||||
map.put("Type",mType);
|
||||
map.put("Description",mDescription);
|
||||
return "FORMAT=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||
}
|
||||
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||
}
|
||||
|
||||
public String getName() { return mName; }
|
||||
public int getCount() { return mCount; }
|
||||
public String getDescription() { return mDescription; }
|
||||
public FORMAT_TYPE getType() { return mType; }
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFFormatHeaderLine) )
|
||||
return false;
|
||||
VCFFormatHeaderLine other = (VCFFormatHeaderLine)o;
|
||||
return mName.equals(other.mName) &&
|
||||
mCount == other.mCount &&
|
||||
mDescription.equals(other.mDescription) &&
|
||||
mType == other.mType;
|
||||
}
|
||||
|
||||
public String getmName() {
|
||||
return mName;
|
||||
}
|
||||
|
||||
public int getmCount() {
|
||||
return mCount;
|
||||
}
|
||||
|
||||
public String getmDescription() {
|
||||
return mDescription;
|
||||
}
|
||||
|
||||
public FORMAT_TYPE getmType() {
|
||||
return mType;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFGenotypeEncoding
|
||||
* <p/>
|
||||
* basic encoding class for genotype fields in VCF
|
||||
*/
|
||||
public class VCFGenotypeEncoding {
|
||||
public enum TYPE {
|
||||
SINGLE_BASE,
|
||||
INSERTION,
|
||||
DELETION,
|
||||
UNCALLED,
|
||||
MIXED // this type is only valid in aggregate, not for a single VCFGenotypeEncoding
|
||||
}
|
||||
|
||||
// our length (0 for SINGLE_BASE), our bases, and our type
|
||||
private final int mLength;
|
||||
private final String mBases;
|
||||
private final TYPE mType;
|
||||
|
||||
// public constructor, that parses out the base string
|
||||
public VCFGenotypeEncoding(String baseString) {
|
||||
if ((baseString.length() == 1)) {
|
||||
// are we an empty (no-call) genotype?
|
||||
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
|
||||
mBases = VCFGenotypeRecord.EMPTY_ALLELE;
|
||||
mLength = 0;
|
||||
mType = TYPE.UNCALLED;
|
||||
} else if (!validBases(baseString)) {
|
||||
throw new IllegalArgumentException("Alleles of length 1 must be one of A,C,G,T, " + baseString + " was passed in");
|
||||
} else { // we're a valid base
|
||||
mBases = baseString.toUpperCase();
|
||||
mLength = 0;
|
||||
mType = TYPE.SINGLE_BASE;
|
||||
}
|
||||
} else { // deletion or insertion
|
||||
if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) {
|
||||
throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
|
||||
}
|
||||
if (baseString.toUpperCase().charAt(0) == 'D') {
|
||||
mLength = Integer.valueOf(baseString.substring(1, baseString.length()));
|
||||
mBases = "";
|
||||
mType = TYPE.DELETION;
|
||||
} else { // we're an I
|
||||
mBases = baseString.substring(1, baseString.length()).toUpperCase();
|
||||
if (!validBases(mBases))
|
||||
throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
|
||||
mLength = mBases.length();
|
||||
mType = TYPE.INSERTION;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return mLength;
|
||||
}
|
||||
|
||||
public String getBases() {
|
||||
return mBases;
|
||||
}
|
||||
|
||||
public TYPE getType() {
|
||||
return mType;
|
||||
}
|
||||
|
||||
public boolean equals(Object obj) {
|
||||
if ( obj == null )
|
||||
return false;
|
||||
if ( obj instanceof VCFGenotypeEncoding ) {
|
||||
VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj;
|
||||
return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength);
|
||||
}
|
||||
if ( mType == TYPE.UNCALLED && obj.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
// our underlying data is immutable, so this is safe (we won't strand a value in a hashtable somewhere
|
||||
// when the data changes underneath, altering this value).
|
||||
String str = this.mBases + String.valueOf(this.mLength) + this.mType.toString();
|
||||
return str.hashCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* dump the string representation of this genotype encoding
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
switch (mType) {
|
||||
case SINGLE_BASE:
|
||||
case UNCALLED:
|
||||
builder.append(mBases);
|
||||
break;
|
||||
case INSERTION:
|
||||
builder.append("I");
|
||||
builder.append(mBases);
|
||||
break;
|
||||
case DELETION:
|
||||
builder.append("D");
|
||||
builder.append(mLength);
|
||||
break;
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* ensure that string contains valid bases
|
||||
*
|
||||
* @param bases the bases to check
|
||||
*
|
||||
* @return true if they're all either A,C,G,T; false otherwise
|
||||
*/
|
||||
private static boolean validBases(String bases) {
|
||||
for (char c : bases.toUpperCase().toCharArray()) {
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T')
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,353 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VCFGenotypeRecord
|
||||
*
|
||||
* Holds the genotype call of one sample for a VCF record: the called alleles, their phasing, and the per-genotype field values.
|
||||
*/
|
||||
public class VCFGenotypeRecord {
|
||||
|
||||
// key names
|
||||
public static final String GENOTYPE_KEY = "GT";
|
||||
public static final String GENOTYPE_QUALITY_KEY = "GQ";
|
||||
public static final String DEPTH_KEY = "DP";
|
||||
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
|
||||
public static final String GENOTYPE_FILTER_KEY = "FT";
|
||||
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL";
|
||||
public static final String OLD_DEPTH_KEY = "RD";
|
||||
|
||||
// the values for empty fields
|
||||
public static final String EMPTY_GENOTYPE = "./.";
|
||||
public static final String EMPTY_ALLELE = ".";
|
||||
public static final int MISSING_GENOTYPE_QUALITY = -1;
|
||||
public static final int MISSING_DEPTH = -1;
|
||||
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
|
||||
public static final String PASSES_FILTERS = "0";
|
||||
public static final String UNFILTERED = ".";
|
||||
|
||||
public static final double MAX_QUAL_VALUE = 99.0;
|
||||
|
||||
// what kind of phasing this genotype has
|
||||
public enum PHASE {
|
||||
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||
}
|
||||
|
||||
// our record
|
||||
private VCFRecord mRecord;
|
||||
|
||||
// our phasing
|
||||
private PHASE mPhaseType;
|
||||
|
||||
// our bases(s)
|
||||
private final List<VCFGenotypeEncoding> mGenotypeAlleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
|
||||
// our mapping of the format mFields to values
|
||||
private final Map<String, String> mFields = new HashMap<String, String>();
|
||||
|
||||
// our sample name
|
||||
private String mSampleName;
|
||||
|
||||
/**
|
||||
* Create a VCF genotype record
|
||||
*
|
||||
* @param sampleName sample name
|
||||
* @param genotypes list of genotypes
|
||||
* @param phasing phasing
|
||||
*/
|
||||
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing) {
|
||||
mSampleName = sampleName;
|
||||
if (genotypes != null)
|
||||
this.mGenotypeAlleles.addAll(genotypes);
|
||||
mPhaseType = phasing;
|
||||
}
|
||||
|
||||
public void setVCFRecord(VCFRecord record) {
|
||||
mRecord = record;
|
||||
}
|
||||
|
||||
public void setSampleName(String name) {
|
||||
mSampleName = name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a field to the genotype record.
|
||||
* Throws an exception if the key is GT, as that's computed internally.
|
||||
*
|
||||
* @param key the field name (use static variables above for common fields)
|
||||
* @param value the field value
|
||||
*/
|
||||
public void setField(String key, String value) {
|
||||
// make sure the GT field isn't being set
|
||||
if ( key.equals(GENOTYPE_KEY) )
|
||||
throw new IllegalArgumentException("Setting the GT field is not allowed as that's done internally");
|
||||
mFields.put(key, value);
|
||||
}
|
||||
|
||||
/**
|
||||
* determine the phase of the genotype
|
||||
*
|
||||
* @param phase the string that contains the phase character
|
||||
*
|
||||
* @return the phase
|
||||
*/
|
||||
static PHASE determinePhase(String phase) {
|
||||
// find the phasing information
|
||||
if (phase.equals("/"))
|
||||
return PHASE.UNPHASED;
|
||||
else if (phase.equals("|"))
|
||||
return PHASE.PHASED;
|
||||
else if (phase.equals("\\"))
|
||||
return PHASE.PHASED_SWITCH_PROB;
|
||||
else
|
||||
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
||||
}
|
||||
|
||||
|
||||
public PHASE getPhaseType() {
|
||||
return mPhaseType;
|
||||
}
|
||||
|
||||
public String getSampleName() {
|
||||
return mSampleName;
|
||||
}
|
||||
|
||||
public List<VCFGenotypeEncoding> getAlleles() {
|
||||
return mGenotypeAlleles;
|
||||
}
|
||||
|
||||
public Map<String, String> getFields() {
|
||||
return mFields;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the phred-scaled quality score
|
||||
*/
|
||||
public double getQual() {
|
||||
return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) : MISSING_GENOTYPE_QUALITY);
|
||||
}
|
||||
|
||||
public boolean isMissingQual() {
|
||||
return (int)getQual() == MISSING_GENOTYPE_QUALITY;
|
||||
}
|
||||
|
||||
public double getNegLog10PError() {
|
||||
double qual = getQual();
|
||||
return (qual == MISSING_GENOTYPE_QUALITY ? MISSING_GENOTYPE_QUALITY : qual / 10.0);
|
||||
}
|
||||
|
||||
public int getReadCount() {
|
||||
return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH);
|
||||
}
|
||||
|
||||
public String getLocation() {
|
||||
return mRecord != null ? mRecord.getChr() + ":" + mRecord.getPosition() : null;
|
||||
}
|
||||
|
||||
public String getReference() {
|
||||
return mRecord != null ? mRecord.getReference() : "N";
|
||||
}
|
||||
|
||||
public String getBases() {
|
||||
String genotype = "";
|
||||
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles )
|
||||
genotype += encoding.getBases();
|
||||
return genotype;
|
||||
}
|
||||
|
||||
public boolean isVariant(char ref) {
|
||||
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||
if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||
continue;
|
||||
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE ||
|
||||
encoding.getBases().charAt(0) != ref )
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isPointGenotype() {
|
||||
return (mRecord != null ? !mRecord.isIndel() : true);
|
||||
}
|
||||
|
||||
public boolean isHom() {
|
||||
if ( mGenotypeAlleles.size() == 0 )
|
||||
return true;
|
||||
|
||||
String bases = mGenotypeAlleles.get(0).getBases();
|
||||
for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) {
|
||||
if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isHet() {
|
||||
return !isHom();
|
||||
}
|
||||
|
||||
public boolean isNoCall() {
|
||||
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isFiltered() {
|
||||
return ( mFields.get(GENOTYPE_FILTER_KEY) != null &&
|
||||
!mFields.get(GENOTYPE_FILTER_KEY).equals(UNFILTERED) &&
|
||||
!mFields.get(GENOTYPE_FILTER_KEY).equals(PASSES_FILTERS));
|
||||
}
|
||||
|
||||
public int getPloidy() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
public VCFRecord getRecord() {
|
||||
return mRecord;
|
||||
}
|
||||
|
||||
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
|
||||
String str = "";
|
||||
boolean first = true;
|
||||
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
||||
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||
str += VCFGenotypeRecord.EMPTY_ALLELE;
|
||||
else
|
||||
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
||||
if (first) {
|
||||
switch (mPhaseType) {
|
||||
case UNPHASED:
|
||||
str += "/";
|
||||
break;
|
||||
case PHASED:
|
||||
str += "|";
|
||||
break;
|
||||
case PHASED_SWITCH_PROB:
|
||||
str += "\\";
|
||||
break;
|
||||
case UNKNOWN:
|
||||
throw new UnsupportedOperationException("Unknown phase type");
|
||||
}
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
return str;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("[VCFGenotype %s %s %s %s]", getLocation(), mSampleName, this.mGenotypeAlleles, mFields);
|
||||
}
|
||||
|
||||
public boolean isEmptyGenotype() {
|
||||
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other instanceof VCFGenotypeRecord) {
|
||||
if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false;
|
||||
if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false;
|
||||
if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false;
|
||||
if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* output a string representation of the VCFGenotypeRecord, given the alternate alleles
|
||||
*
|
||||
* @param altAlleles the alternate alleles, needed for toGenotypeString()
|
||||
* @param genotypeFormatStrings genotype format strings
|
||||
*
|
||||
* @return a string
|
||||
*/
|
||||
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(toGenotypeString(altAlleles));
|
||||
|
||||
for ( String field : genotypeFormatStrings ) {
|
||||
if ( field.equals(GENOTYPE_KEY) )
|
||||
continue;
|
||||
|
||||
String value = mFields.get(field);
|
||||
if ( value == null && field.equals(OLD_DEPTH_KEY) )
|
||||
value = mFields.get(DEPTH_KEY);
|
||||
|
||||
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
|
||||
if ( value == null || value.equals("") )
|
||||
builder.append(getMissingFieldValue(field));
|
||||
else
|
||||
builder.append(value);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* output a string representation of an empty genotype
|
||||
*
|
||||
* @param genotypeFormatStrings genotype format strings
|
||||
*
|
||||
* @return a string
|
||||
*/
|
||||
public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append(EMPTY_GENOTYPE);
|
||||
|
||||
for ( String field : genotypeFormatStrings ) {
|
||||
if ( field.equals(GENOTYPE_KEY) )
|
||||
continue;
|
||||
|
||||
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
|
||||
builder.append(getMissingFieldValue(field));
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getMissingFieldValue(String field) {
|
||||
String result = "";
|
||||
if ( field.equals(GENOTYPE_QUALITY_KEY) )
|
||||
result = String.valueOf(MISSING_GENOTYPE_QUALITY);
|
||||
else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) )
|
||||
result = String.valueOf(MISSING_DEPTH);
|
||||
else if ( field.equals(GENOTYPE_FILTER_KEY) )
|
||||
result = UNFILTERED;
|
||||
else if ( field.equals(GENOTYPE_LIKELIHOODS_KEY) )
|
||||
result = "0,0,0";
|
||||
// TODO -- support haplotype quality
|
||||
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
|
||||
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
|
||||
return result;
|
||||
}
|
||||
|
||||
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(VCFHeaderVersion version) {
|
||||
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality"));
|
||||
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||
result.add(new VCFFormatHeaderLine(GENOTYPE_LIKELIHOODS_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
|
||||
return result;
|
||||
}
|
||||
|
||||
public void replaceFields(HashMap<String,String> newFields) {
|
||||
mFields.clear();
|
||||
for ( String s : newFields.keySet() ) {
|
||||
mFields.put(s,newFields.get(s));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFHeader
|
||||
* <p/>
|
||||
* A class representing the VCF header
|
||||
*/
|
||||
public class VCFHeader {
|
||||
|
||||
// the manditory header fields
|
||||
public enum HEADER_FIELDS {
|
||||
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||
}
|
||||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData;
|
||||
|
||||
// the list of auxillary tags
|
||||
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||
|
||||
// the character string that indicates meta data
|
||||
public static final String METADATA_INDICATOR = "##";
|
||||
|
||||
// the header string indicator
|
||||
public static final String HEADER_INDICATOR = "#";
|
||||
|
||||
// our header versionVCF
|
||||
private VCFHeaderVersion versionVCF;
|
||||
|
||||
/** do we have genotying data? */
|
||||
private boolean hasGenotypingData = false;
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData the meta data associated with this header
|
||||
*/
|
||||
public VCFHeader(Set<VCFHeaderLine> metaData) {
|
||||
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||
checkVCFVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData the meta data associated with this header
|
||||
* @param genotypeSampleNames the genotype format field, and the sample names
|
||||
*/
|
||||
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
|
||||
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||
for (String col : genotypeSampleNames) {
|
||||
if (!col.equals("FORMAT"))
|
||||
mGenotypeSampleNames.add(col);
|
||||
}
|
||||
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
|
||||
checkVCFVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* check our metadata for a VCF versionVCF tag, and throw an exception if the versionVCF is out of date
|
||||
* or the versionVCF is not present
|
||||
*/
|
||||
// TODO: fix this function
|
||||
public void checkVCFVersion() {
|
||||
VCFHeaderVersion version;
|
||||
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData )
|
||||
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
|
||||
version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey());
|
||||
if (version == null)
|
||||
{
|
||||
toRemove.add(line);
|
||||
}
|
||||
/**throw new RuntimeException("VCF version " + line.getValue() +
|
||||
" is not supported; only versionVCF " + VCFHeaderVersion.VCF3_2 + " and greater can be used");*/
|
||||
else return;
|
||||
}
|
||||
// remove old header lines for now,
|
||||
mMetaData.removeAll(toRemove);
|
||||
mMetaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString()));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header fields in order they're presented in the input file (which is now required to be
|
||||
* the order presented in the spec).
|
||||
*
|
||||
* @return a set of the header fields, in order
|
||||
*/
|
||||
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
|
||||
for (HEADER_FIELDS field : HEADER_FIELDS.values())
|
||||
fields.add(field);
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the meta data, associated with this header
|
||||
*
|
||||
* @return a set of the meta data
|
||||
*/
|
||||
public Set<VCFHeaderLine> getMetaData() {
|
||||
return mMetaData;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the genotyping sample names
|
||||
*
|
||||
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
|
||||
*/
|
||||
public Set<String> getGenotypeSamples() {
|
||||
return mGenotypeSampleNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*
|
||||
* @return true if we have genotyping columns, false otherwise
|
||||
*/
|
||||
public boolean hasGenotypingData() {
|
||||
return hasGenotypingData;
|
||||
}
|
||||
|
||||
/** @return the column count, */
|
||||
public int getColumnCount() {
|
||||
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry in the VCF header
|
||||
*/
|
||||
public class VCFHeaderLine implements Comparable {
|
||||
|
||||
private String stringRep = null;
|
||||
private String mKey = null;
|
||||
private String mValue = null;
|
||||
protected VCFHeaderVersion mVersion = null;
|
||||
|
||||
/**
|
||||
* create a VCF header line
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param value the value for this header line
|
||||
*/
|
||||
public VCFHeaderLine(String key, String value, VCFHeaderVersion version) {
|
||||
mKey = key;
|
||||
mValue = value;
|
||||
mVersion = version;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header line
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param value the value for this header line
|
||||
*/
|
||||
public VCFHeaderLine(String key, String value) {
|
||||
mKey = key;
|
||||
mValue = value;
|
||||
mVersion = VCFHeaderVersion.VCF3_3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the key
|
||||
*
|
||||
* @return the key
|
||||
*/
|
||||
public String getKey() {
|
||||
return mKey;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the key
|
||||
*
|
||||
* @param key the key for this header line
|
||||
*/
|
||||
public void setKey(String key) {
|
||||
mKey = key;
|
||||
stringRep = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value
|
||||
*
|
||||
* @return the value
|
||||
*/
|
||||
public String getValue() {
|
||||
return mValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the value
|
||||
*
|
||||
* @param value the value for this header line
|
||||
*/
|
||||
public void setValue(String value) {
|
||||
mValue = value;
|
||||
stringRep = null;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
if ( stringRep == null )
|
||||
stringRep = makeStringRep();
|
||||
return stringRep;
|
||||
}
|
||||
|
||||
protected String makeStringRep() {
|
||||
return mKey + "=" + mValue;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFHeaderLine) )
|
||||
return false;
|
||||
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
return toString().compareTo(other.toString());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,161 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaron
|
||||
* Date: Jun 17, 2010
|
||||
* Time: 12:28:46 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class VCFHeaderLineTranslator {
|
||||
private static Map<VCFHeaderVersion,VCFLineParser> mapping;
|
||||
|
||||
static {
|
||||
mapping = new HashMap<VCFHeaderVersion,VCFLineParser>();
|
||||
mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser());
|
||||
}
|
||||
|
||||
public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
|
||||
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
|
||||
}
|
||||
|
||||
public static String toValue(VCFHeaderVersion version, Map<String,Object> keyValues) {
|
||||
return mapping.get(version).toValue(keyValues);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Reads and writes the value part of a structured VCF header line for one VCF version.
 */
interface VCFLineParser {

    /**
     * Formats tag/value pairs as header text.
     *
     * @param keyValues the tag to value pairs to write
     * @return the formatted text
     */
    String toValue(Map<String,? extends Object> keyValues);

    /**
     * Parses header text into tag/value pairs.
     *
     * @param valueLine        the text to parse
     * @param expectedTagOrder the tags the text is expected to contain, in order
     * @return a mapping of tag to value
     */
    Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder);
}
|
||||
|
||||
|
||||
/**
|
||||
* a class that handles the to and from disk for VCF 4 lines
|
||||
*/
|
||||
class VCF4Parser implements VCFLineParser {
|
||||
Set<String> bracketed = new HashSet<String>();
|
||||
|
||||
/**
|
||||
* create a string of a mapping pair for the target VCF version
|
||||
* @param keyValues a mapping of the key->value pairs to output
|
||||
* @return a string, correctly formatted
|
||||
*/
|
||||
public String toValue(Map<String, ? extends Object> keyValues) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("<");
|
||||
boolean start = true;
|
||||
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||
if (start) start = false;
|
||||
else builder.append(",");
|
||||
builder.append(entry.getKey());
|
||||
builder.append("=");
|
||||
builder.append(entry.getValue().toString().contains(",") ||
|
||||
entry.getValue().toString().contains(" ") ||
|
||||
entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||
}
|
||||
builder.append(">");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a VCF4 line
|
||||
* @param valueLine the line
|
||||
* @return a mapping of the tags parsed out
|
||||
*/
|
||||
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||
// our return map
|
||||
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||
|
||||
// a builder to store up characters as we go
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// store the key when we're parsing out the values
|
||||
String key = "";
|
||||
|
||||
// where are we in the stream of characters?
|
||||
int index = 0;
|
||||
|
||||
// are we inside a quotation? we don't special case ',' then
|
||||
boolean inQuote = false;
|
||||
|
||||
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
|
||||
for (char c: valueLine.toCharArray()) {
|
||||
switch (c) {
|
||||
case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it
|
||||
case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list
|
||||
case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder
|
||||
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
|
||||
case (',') : if (!inQuote) { ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; } // drop the current key value to the return map
|
||||
default: builder.append(c); // otherwise simply append to the current string
|
||||
}
|
||||
index++;
|
||||
}
|
||||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||
for (String str : ret.keySet()) {
|
||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
class VCF3Parser implements VCFLineParser {
|
||||
|
||||
public String toValue(Map<String, ? extends Object> keyValues) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean start = true;
|
||||
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||
if (start) start = false;
|
||||
else builder.append(",");
|
||||
builder.append(entry.getValue().toString().contains(",") || entry.getValue().toString().contains(" ")? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||
// our return map
|
||||
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||
|
||||
// a builder to store up characters as we go
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// store the key when we're parsing out the values
|
||||
String key = "";
|
||||
|
||||
// where are we in the stream of characters?
|
||||
int index = 0;
|
||||
// where in the expected tag order are we?
|
||||
int tagIndex = 0;
|
||||
|
||||
// are we inside a quotation? we don't special case ',' then
|
||||
boolean inQuote = false;
|
||||
|
||||
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
|
||||
for (char c: valueLine.toCharArray()) {
|
||||
switch (c) {
|
||||
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
|
||||
case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map
|
||||
default: builder.append(c); // otherwise simply append to the current string
|
||||
}
|
||||
index++;
|
||||
}
|
||||
ret.put(expectedTagOrder.get(tagIndex++),builder.toString());
|
||||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
|
||||
for (String str : ret.keySet()){
|
||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
/**
 * The VCF header versions known to this package, each identified by the version string and the
 * format string (the key of the header line that announces the version).
 */
public enum VCFHeaderVersion {
    // NOTE(review): "VCRv3.2" may be a typo for "VCFv3.2" -- kept as is, confirm against real 3.2 files
    VCF3_2("VCRv3.2","format"),
    VCF3_3("VCFv3.3","fileformat"),
    VCF4_0("VCFv4.0","fileformat");

    private final String versionString;
    private final String formatString;

    /**
     * create the enum, privately, using:
     * @param vString the version string
     * @param fString the format string
     */
    VCFHeaderVersion(String vString, String fString) {
        this.versionString = vString;
        this.formatString = fString;
    }

    /**
     * find the header version matching both a version string and a format string
     * @param version the version string
     * @param format the format string
     * @return the matching VCFHeaderVersion, or null if there is none
     */
    public static VCFHeaderVersion toHeaderVersion(String version, String format) {
        VCFHeaderVersion[] candidates = VCFHeaderVersion.values();
        for (int i = 0; i < candidates.length; i++) {
            VCFHeaderVersion candidate = candidates[i];
            if (!candidate.versionString.equals(version))
                continue;
            if (candidate.formatString.equals(format))
                return candidate;
        }
        return null;
    }

    /**
     * find the header version matching a version string
     * @param version the version string
     * @return the first VCFHeaderVersion with that version string, or null if there is none
     */
    public static VCFHeaderVersion toHeaderVersion(String version) {
        VCFHeaderVersion[] candidates = VCFHeaderVersion.values();
        for (int i = 0; i < candidates.length; i++) {
            if (candidates[i].versionString.equals(version))
                return candidates[i];
        }
        return null;
    }

    /**
     * are we a valid version string of some type
     * @param version the version string
     * @return true if some header version uses this version string, false otherwise
     */
    public static boolean isVersionString(String version){
        VCFHeaderVersion match = toHeaderVersion(version);
        return match != null;
    }

    /**
     * are we a valid format string for some type
     * @param format the format string
     * @return true if some header version uses this format string, false otherwise
     */
    public static boolean isFormatString(String format){
        boolean known = false;
        VCFHeaderVersion[] candidates = VCFHeaderVersion.values();
        for (int i = 0; i < candidates.length && !known; i++) {
            known = candidates[i].formatString.equals(format);
        }
        return known;
    }

    /** @return the version string of this header version */
    public String getVersionString() {
        return this.versionString;
    }

    /** @return the format string of this header version */
    public String getFormatString() {
        return this.formatString;
    }
}
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFInfoHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry for INFO fields in the VCF header
|
||||
*/
|
||||
public class VCFInfoHeaderLine extends VCFHeaderLine {
|
||||
|
||||
// the info field types
|
||||
public enum INFO_TYPE {
|
||||
Integer, Float, String, Character, Flag;
|
||||
|
||||
public Object convert(String value) {
|
||||
switch (this) {
|
||||
case Integer:
|
||||
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
|
||||
case Float:
|
||||
return java.lang.Float.valueOf(value);
|
||||
case String:
|
||||
case Character:
|
||||
return value;
|
||||
case Flag:
|
||||
return value.equals("0") ? false : true;
|
||||
default:
|
||||
throw new IllegalStateException("INFO_TYPE." + this + " doesn't have a set conversion approach");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String mName;
|
||||
private int mCount;
|
||||
private String mDescription;
|
||||
private INFO_TYPE mType;
|
||||
|
||||
|
||||
// info line numerical values are allowed to be unbounded (or unknown), which is
|
||||
// marked with a dot (.)
|
||||
public static int UNBOUNDED = Integer.MIN_VALUE;
|
||||
public static String UNBOUNDED_ENCODING = ".";
|
||||
|
||||
/**
|
||||
* create a VCF info header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
*/
|
||||
public VCFInfoHeaderLine(String name, int count, INFO_TYPE type, String description) {
|
||||
super("INFO", "");
|
||||
mName = name;
|
||||
mCount = count;
|
||||
mType = type;
|
||||
mDescription = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF info header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the VCF version
|
||||
*/
|
||||
protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super("INFO", "", version);
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||
mName = mapping.get("ID");
|
||||
mCount = mapping.get("Number").equals(UNBOUNDED_ENCODING) ? UNBOUNDED : Integer.valueOf(mapping.get("Number"));
|
||||
mType = INFO_TYPE.valueOf(mapping.get("Type"));
|
||||
mDescription = mapping.get("Description");
|
||||
}
|
||||
|
||||
protected String makeStringRep() {
|
||||
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||
return String.format("INFO=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID",mName);
|
||||
map.put("Number",mCount == UNBOUNDED ? UNBOUNDED_ENCODING : mCount);
|
||||
map.put("Type",mType);
|
||||
map.put("Description",mDescription);
|
||||
return "INFO=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||
}
|
||||
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFInfoHeaderLine) )
|
||||
return false;
|
||||
VCFInfoHeaderLine other = (VCFInfoHeaderLine)o;
|
||||
return mName.equals(other.mName) &&
|
||||
mCount == other.mCount &&
|
||||
mDescription.equals(other.mDescription) &&
|
||||
mType == other.mType;
|
||||
}
|
||||
|
||||
public String getmName() {
|
||||
return mName;
|
||||
}
|
||||
|
||||
public int getmCount() {
|
||||
return mCount;
|
||||
}
|
||||
|
||||
public String getmDescription() {
|
||||
return mDescription;
|
||||
}
|
||||
|
||||
public INFO_TYPE getmType() {
|
||||
return mType;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,206 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */
|
||||
public class VCFReaderUtils {
|
||||
|
||||
// our pattern matching for the genotype mFields
|
||||
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
|
||||
|
||||
/**
|
||||
* create a VCF header, given an array of strings that all start with at least the # character. This function is
|
||||
* package protected so that the VCFReaderUtils can access this function
|
||||
*
|
||||
* @param headerStrings a list of header strings
|
||||
*
|
||||
* @return a VCF Header created from the list of stinrgs
|
||||
*/
|
||||
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
|
||||
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
|
||||
Set<String> auxTags = new LinkedHashSet<String>();
|
||||
// iterate over all the passed in strings
|
||||
for ( String str : headerStrings ) {
|
||||
if ( !str.startsWith("##") ) {
|
||||
String[] strings = str.substring(1).split("\\t");
|
||||
// the columns should be in order according to Richard Durbin
|
||||
int arrayIndex = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||
try {
|
||||
if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
|
||||
throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name.");
|
||||
}
|
||||
arrayIndex++;
|
||||
}
|
||||
while (arrayIndex < strings.length) {
|
||||
if (!strings[arrayIndex].equals("FORMAT"))
|
||||
auxTags.add(strings[arrayIndex]);
|
||||
arrayIndex++;
|
||||
}
|
||||
} else {
|
||||
if ( str.startsWith("##INFO=") )
|
||||
metaData.add(new VCFInfoHeaderLine(str.substring(7),version));
|
||||
else if ( str.startsWith("##FILTER=") )
|
||||
metaData.add(new VCFFilterHeaderLine(str.substring(9),version));
|
||||
else if ( str.startsWith("##FORMAT=") )
|
||||
metaData.add(new VCFFormatHeaderLine(str.substring(9),version));
|
||||
else {
|
||||
int equals = str.indexOf("=");
|
||||
if ( equals != -1 )
|
||||
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new VCFHeader(metaData, auxTags);
|
||||
}
|
||||
|
||||
/**
|
||||
* create the next VCFRecord, given the input line
|
||||
*
|
||||
* @param line the line from the file
|
||||
* @param mHeader the VCF header
|
||||
*
|
||||
* @return the VCFRecord
|
||||
*/
|
||||
public static VCFRecord createRecord(String line, VCFHeader mHeader) {
|
||||
return createRecord(line, mHeader, false);
|
||||
}
|
||||
|
||||
public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) {
|
||||
// things we need to make a VCF record
|
||||
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
String tokens[] = line.split("\\t");
|
||||
|
||||
// check to ensure that the column count of tokens is right
|
||||
if (tokens.length != mHeader.getColumnCount()) {
|
||||
throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line);
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields())
|
||||
values.put(field, tokens[index++]);
|
||||
// if we have genotyping data, we try and extract the genotype fields
|
||||
if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) {
|
||||
String mFormatString = tokens[index];
|
||||
String keyStrings[] = mFormatString.split(":");
|
||||
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
index++;
|
||||
String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
|
||||
for (String str : mHeader.getGenotypeSamples()) {
|
||||
genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
||||
index++;
|
||||
}
|
||||
VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords);
|
||||
// associate the genotypes with this new record
|
||||
for ( VCFGenotypeRecord gr : genotypeRecords )
|
||||
gr.setVCFRecord(vrec);
|
||||
return vrec;
|
||||
|
||||
}
|
||||
return new VCFRecord(values);
|
||||
}
|
||||
|
||||
/**
|
||||
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||
*
|
||||
* @param sampleName the sample name
|
||||
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
||||
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||
* @param referenceBase the reference base
|
||||
*
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
||||
return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase);
|
||||
}
|
||||
|
||||
/**
|
||||
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||
*
|
||||
* @param sampleName the sample name
|
||||
* @param keyStrings the split format string for this record, which contains the keys for the genotype parameters
|
||||
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||
* @param referenceBase the reference base
|
||||
*
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
|
||||
// parameters to create the VCF genotype record
|
||||
HashMap<String, String> tagToValue = new HashMap<String, String>();
|
||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
|
||||
|
||||
for (String key : keyStrings) {
|
||||
String parse;
|
||||
int nextDivider;
|
||||
if (!genotypeString.contains(":")) {
|
||||
nextDivider = genotypeString.length();
|
||||
parse = genotypeString;
|
||||
} else {
|
||||
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
|
||||
parse = genotypeString.substring(0, nextDivider);
|
||||
}
|
||||
if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) {
|
||||
Matcher m = gtPattern.matcher(parse);
|
||||
if (!m.matches())
|
||||
throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
|
||||
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
||||
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
||||
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
||||
} else {
|
||||
if ( parse.length() == 0 )
|
||||
parse = VCFGenotypeRecord.getMissingFieldValue(key);
|
||||
tagToValue.put(key, parse);
|
||||
}
|
||||
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
|
||||
}
|
||||
if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) )
|
||||
tagToValue.clear();
|
||||
// catch some common errors, either there are too many field keys or there are two many field values
|
||||
else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
|
||||
throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected "
|
||||
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
|
||||
else if ( genotypeString.length() > 0 )
|
||||
throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString
|
||||
+ ". This most likely means that the format string is shorter then the value fields.");
|
||||
|
||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase);
|
||||
for ( Map.Entry<String, String> entry : tagToValue.entrySet() )
|
||||
rec.setField(entry.getKey(), entry.getValue());
|
||||
return rec;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* add an alternate allele to the list of alleles we have for a VCF genotype record
|
||||
*
|
||||
* @param alleleNumber the allele number, as a string
|
||||
* @param altAlleles the list of alternate alleles
|
||||
* @param referenceBase the reference base
|
||||
* @param bases the list of bases for this genotype call
|
||||
*/
|
||||
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
|
||||
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
|
||||
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
|
||||
} else {
|
||||
int alleleValue = Integer.valueOf(alleleNumber);
|
||||
// check to make sure the allele value is within bounds
|
||||
if (alleleValue < 0 || alleleValue > altAlleles.length)
|
||||
throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
|
||||
if (alleleValue == 0)
|
||||
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
|
||||
else
|
||||
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,682 @@
|
|||
package org.broad.tribble.vcf;
|
||||
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** the basic VCF record type */
|
||||
public class VCFRecord implements Feature {
|
||||
|
||||
// standard info field keys
|
||||
public static final String ANCESTRAL_ALLELE_KEY = "AA";
|
||||
public static final String ALLELE_COUNT_KEY = "AC";
|
||||
public static final String ALLELE_FREQUENCY_KEY = "AF";
|
||||
public static final String ALLELE_NUMBER_KEY = "AN";
|
||||
public static final String RMS_BASE_QUALITY_KEY = "BQ";
|
||||
public static final String DBSNP_KEY = "DB";
|
||||
public static final String DEPTH_KEY = "DP";
|
||||
public static final String HAPMAP2_KEY = "H2";
|
||||
public static final String HAPMAP3_KEY = "H3";
|
||||
public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
|
||||
public static final String SAMPLE_NUMBER_KEY = "NS";
|
||||
public static final String STRAND_BIAS_KEY = "SB";
|
||||
|
||||
// commonly used strings that are in the standard
|
||||
public static final String FORMAT_FIELD_SEPERATOR = ":";
|
||||
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
|
||||
public static final String FIELD_SEPERATOR = "\t";
|
||||
public static final String FILTER_CODE_SEPERATOR = ";";
|
||||
public static final String INFO_FIELD_SEPERATOR = ";";
|
||||
|
||||
// default values
|
||||
public static final String UNFILTERED = ".";
|
||||
public static final String PASSES_FILTERS = "0";
|
||||
public static final String EMPTY_INFO_FIELD = ".";
|
||||
public static final String EMPTY_ID_FIELD = ".";
|
||||
public static final String EMPTY_ALLELE_FIELD = ".";
|
||||
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
|
||||
public static final int MISSING_GENOTYPE_QUALITY = -1;
|
||||
|
||||
// the reference base
|
||||
private String mReferenceBases;
|
||||
// our location
|
||||
private String mContig;
|
||||
private int mPosition;
|
||||
// our id
|
||||
private String mID;
|
||||
// the alternate bases
|
||||
private final List<VCFGenotypeEncoding> mAlts = new ArrayList<VCFGenotypeEncoding>();
|
||||
// our qual value
|
||||
private double mQual;
|
||||
// our filter string
|
||||
private String mFilterString;
|
||||
// our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests)
|
||||
private final Map<String, String> mInfoFields = new TreeMap<String, String>();
|
||||
|
||||
// our genotype formatting string
|
||||
private String mGenotypeFormatString;
|
||||
|
||||
// the vcf header we're associated with
|
||||
private VCFHeader vcfHeader = null;
|
||||
|
||||
// our genotype sample fields
|
||||
private final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
|
||||
/**
 * Creates a VCF record from a reference base, a location and the genotype format string.
 *
 * @param referenceBases       the reference bases to use
 * @param contig               the contig this record is on
 * @param start                the start location
 * @param genotypeFormatString the format string of the genotype columns
 */
public VCFRecord(String referenceBases, String contig, int start, String genotypeFormatString) {
    this.setReferenceBase(referenceBases);
    this.setLocation(contig, start);
    this.mGenotypeFormatString = genotypeFormatString;
}
|
||||
|
||||
/**
 * Creates a VCF record from the values of the fixed columns plus its genotype records.
 *
 * @param columnValues         a mapping of header fields to their textual values
 * @param genotypeFormatString the format string for the genotype records
 * @param genotypeRecords      the genotype records, which are copied into this record's list
 */
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues, String genotypeFormatString, List<VCFGenotypeRecord> genotypeRecords) {
    this.extractFields(columnValues);
    this.mGenotypeRecords.addAll(genotypeRecords);
    this.mGenotypeFormatString = genotypeFormatString;
}
|
||||
|
||||
/**
 * Creates a VCF record without genotype data from the values of the fixed columns; the
 * genotype format string is set to the empty string.
 *
 * @param columnValues a mapping of header fields to their textual values
 */
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
    this.extractFields(columnValues);
    this.mGenotypeFormatString = "";
}
|
||||
|
||||
/**
 * Creates a fully specified VCF record.
 *
 * @param referenceBases       the reference bases to use
 * @param contig               the contig this variant is on
 * @param position             the position of this variant
 * @param ID                   the ID string
 * @param altBases             the alternate bases, added one by one in list order
 * @param qual                 the qual field
 * @param filters              the filters used on this variant
 * @param infoFields           the information fields, which are copied
 * @param genotypeFormatString the format string of the genotype columns
 * @param genotypeObjects      the genotype records, which are copied into this record's list
 */
public VCFRecord(String referenceBases,
                 String contig,
                 long position,
                 String ID,
                 List<VCFGenotypeEncoding> altBases,
                 double qual,
                 String filters,
                 Map<String, String> infoFields,
                 String genotypeFormatString,
                 List<VCFGenotypeRecord> genotypeObjects) {
    setReferenceBase(referenceBases);
    setLocation(contig, position);
    mID = ID;
    for (VCFGenotypeEncoding alternate : altBases) {
        addAlternateBase(alternate);
    }
    setQual(qual);
    setFilterString(filters);
    mInfoFields.putAll(infoFields);
    mGenotypeFormatString = genotypeFormatString;
    mGenotypeRecords.addAll(genotypeObjects);
}
|
||||
|
||||
/**
|
||||
* extract the field values from the passed in array
|
||||
*
|
||||
* @param columnValues a map of the header fields to values
|
||||
*/
|
||||
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
|
||||
String chrom = null;
|
||||
long position = -1;
|
||||
|
||||
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
|
||||
switch (val) {
|
||||
case CHROM:
|
||||
chrom = columnValues.get(val);
|
||||
break;
|
||||
case POS:
|
||||
position = Integer.valueOf(columnValues.get(val));
|
||||
break;
|
||||
case ID:
|
||||
setID(columnValues.get(val));
|
||||
break;
|
||||
case REF:
|
||||
if (columnValues.get(val).length() != 1)
|
||||
throw new IllegalArgumentException("Reference base should be a single character");
|
||||
setReferenceBase(columnValues.get(val));
|
||||
break;
|
||||
case ALT:
|
||||
String values[] = columnValues.get(val).split(",");
|
||||
for (String alt : values)
|
||||
addAlternateBase(new VCFGenotypeEncoding(alt));
|
||||
break;
|
||||
case QUAL:
|
||||
setQual(Double.valueOf(columnValues.get(val)));
|
||||
break;
|
||||
case FILTER:
|
||||
setFilterString(columnValues.get(val));
|
||||
break;
|
||||
case INFO:
|
||||
String vals[] = columnValues.get(val).split(";");
|
||||
for (String alt : vals) {
|
||||
if ( alt.equals(EMPTY_INFO_FIELD) )
|
||||
continue;
|
||||
String keyVal[] = alt.split("=");
|
||||
if ( keyVal.length == 1 )
|
||||
addInfoField(keyVal[0], "");
|
||||
else if (keyVal.length == 2)
|
||||
addInfoField(keyVal[0], keyVal[1]);
|
||||
else
|
||||
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
setLocation(chrom, position);
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data
|
||||
*
|
||||
* @return true if we have genotyping data, false otherwise
|
||||
*/
|
||||
|
||||
public boolean hasGenotypeData() {
|
||||
return (mGenotypeRecords.size() > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the ID value for this record
|
||||
*/
|
||||
public String getID() {
|
||||
return mID == null ? EMPTY_ID_FIELD : mID;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the reference base
|
||||
*
|
||||
* @return either A, T, C, G, or N
|
||||
*/
|
||||
public String getReference() {
|
||||
return mReferenceBases;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the alternate allele strings
|
||||
*
|
||||
* @return an array of strings representing the alt alleles, or null if there are none
|
||||
*/
|
||||
public List<String> getAlternateAlleleList() {
|
||||
ArrayList<String> alts = new ArrayList<String>();
|
||||
for ( VCFGenotypeEncoding alt : mAlts )
|
||||
alts.add(alt.getBases());
|
||||
return alts;
|
||||
}
|
||||
|
||||
public List<VCFGenotypeEncoding> getAlternateAlleles() {
|
||||
return mAlts;
|
||||
}
|
||||
|
||||
public boolean hasAlternateAllele() {
|
||||
for ( VCFGenotypeEncoding alt : mAlts ) {
|
||||
if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isBiallelic() {
|
||||
return getAlternateAlleles().size() == 1;
|
||||
}
|
||||
|
||||
public boolean isReference() {
|
||||
return !hasAlternateAllele();
|
||||
}
|
||||
|
||||
public List<String> getAlleleList() {
|
||||
ArrayList<String> list = new ArrayList<String>();
|
||||
list.add(getReference());
|
||||
list.addAll(getAlternateAlleleList());
|
||||
return list;
|
||||
}
|
||||
|
||||
public double getNonRefAlleleFrequency() {
|
||||
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
||||
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
||||
} else {
|
||||
// this is the poor man's AF
|
||||
if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) {
|
||||
String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(",");
|
||||
if ( splt.length > 0 ) {
|
||||
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
public VCFGenotypeEncoding.TYPE getType() {
|
||||
VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType();
|
||||
for (int i = 1; i < mAlts.size(); i++) {
|
||||
if ( mAlts.get(i).getType() != type )
|
||||
return VCFGenotypeEncoding.TYPE.MIXED; // if we have more than one type, return mixed
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
public boolean isDeletion() {
|
||||
return getType() == VCFGenotypeEncoding.TYPE.DELETION;
|
||||
}
|
||||
|
||||
public boolean isInsertion() {
|
||||
return getType() == VCFGenotypeEncoding.TYPE.INSERTION;
|
||||
}
|
||||
|
||||
public boolean isIndel() {
|
||||
return isDeletion() || isInsertion();
|
||||
}
|
||||
|
||||
public boolean isSNP() {
|
||||
return getType() == VCFGenotypeEncoding.TYPE.SINGLE_BASE;
|
||||
}
|
||||
|
||||
public boolean isNovel() {
|
||||
return ( ! isInDBSNP() ) && ( ! isInHapmap() );
|
||||
}
|
||||
|
||||
public boolean isInDBSNP() {
|
||||
return ( ( mID != null && ! mID.equals(".") ) || ( mInfoFields.get(DBSNP_KEY) != null && mInfoFields.get(DBSNP_KEY).equals("1") ) );
|
||||
}
|
||||
|
||||
public boolean isInHapmap() {
|
||||
if ( mInfoFields.get(HAPMAP2_KEY) != null && mInfoFields.get(HAPMAP2_KEY).equals("1") ) {
|
||||
return true;
|
||||
} else {
|
||||
return ( mInfoFields.get(HAPMAP3_KEY) != null && mInfoFields.get(HAPMAP3_KEY).equals("1") );
|
||||
}
|
||||
}
|
||||
|
||||
public char getAlternativeBaseForSNP() {
|
||||
if ( !isSNP() && !isBiallelic() )
|
||||
throw new IllegalStateException("This record does not represent a SNP");
|
||||
return mAlts.get(0).getBases().charAt(0);
|
||||
}
|
||||
|
||||
public char getReferenceForSNP() {
|
||||
if ( !isSNP() )
|
||||
throw new IllegalStateException("This record does not represent a SNP");
|
||||
return getReference().charAt(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the phred-scaled quality score
|
||||
*/
|
||||
public double getQual() {
|
||||
return mQual;
|
||||
}
|
||||
|
||||
public int getPosition() {
|
||||
return mPosition;
|
||||
}
|
||||
|
||||
public boolean isMissingQual() {
|
||||
return (int)mQual == MISSING_GENOTYPE_QUALITY;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the -log10PError
|
||||
*/
|
||||
public double getNegLog10PError() {
|
||||
return mQual / 10.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the filter criteria
|
||||
*
|
||||
* @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied
|
||||
*/
|
||||
public String[] getFilteringCodes() {
|
||||
if (mFilterString == null) return new String[]{UNFILTERED};
|
||||
return mFilterString.split(FILTER_CODE_SEPERATOR);
|
||||
}
|
||||
|
||||
public boolean isFiltered() {
|
||||
String[] codes = getFilteringCodes();
|
||||
return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS);
|
||||
}
|
||||
|
||||
// public boolean hasFilteringCodes() {
|
||||
// return mFilterString != null;
|
||||
// }
|
||||
|
||||
public String getFilterString() {
|
||||
return mFilterString;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the information key-value pairs as a Map<>
|
||||
*
|
||||
* @return a map, of the info key-value pairs
|
||||
*/
|
||||
public final Map<String, String> getInfoValues() {
|
||||
return mInfoFields;
|
||||
}
|
||||
|
||||
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
|
||||
return mGenotypeRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a List of the sample names
|
||||
*/
|
||||
public String[] getSampleNames() {
|
||||
String names[] = new String[mGenotypeRecords.size()];
|
||||
for (int i = 0; i < mGenotypeRecords.size(); i++) {
|
||||
names[i] = mGenotypeRecords.get(i).getSampleName();
|
||||
}
|
||||
return names;
|
||||
}
|
||||
|
||||
public VCFGenotypeRecord getGenotype(final String sampleName) {
|
||||
for ( VCFGenotypeRecord rec : getVCFGenotypeRecords() ) {
|
||||
if ( rec.getSampleName().equals(sampleName) ) {
|
||||
return rec;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public String getGenotypeFormatString() {
|
||||
return mGenotypeFormatString;
|
||||
}// the formatting string for our genotype records
|
||||
|
||||
public void setGenotypeFormatString(String newFormatString) {
|
||||
mGenotypeFormatString = newFormatString;
|
||||
}
|
||||
|
||||
public void setReferenceBase(String reference) {
|
||||
mReferenceBases = reference.toUpperCase();
|
||||
}
|
||||
|
||||
public void setLocation(String chrom, long position) {
|
||||
if ( chrom == null )
|
||||
throw new IllegalArgumentException("Chromosomes cannot be missing");
|
||||
if ( position < 0 )
|
||||
throw new IllegalArgumentException("Position values must be greater than 0");
|
||||
this.mContig = chrom;
|
||||
this.mPosition = (int)position;
|
||||
}
|
||||
|
||||
public void setID(String ID) {
|
||||
mID = ID;
|
||||
}
|
||||
|
||||
public void setQual(double qual) {
|
||||
if ( qual < 0 && (int)qual != MISSING_GENOTYPE_QUALITY )
|
||||
throw new IllegalArgumentException("Qual values cannot be negative unless they are " + MISSING_GENOTYPE_QUALITY + " ('unknown')");
|
||||
mQual = qual;
|
||||
}
|
||||
|
||||
public void setFilterString(String filterString) {
|
||||
mFilterString = filterString;
|
||||
}
|
||||
|
||||
public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) {
|
||||
mGenotypeRecords.add(mGenotypeRecord);
|
||||
}
|
||||
|
||||
public void setGenotypeRecords(List<VCFGenotypeRecord> records) {
|
||||
mGenotypeRecords.clear();
|
||||
for ( VCFGenotypeRecord g : records )
|
||||
addGenotypeRecord(g);
|
||||
}
|
||||
|
||||
/**
|
||||
* add an alternate base to our alternate base list. All bases are uppercased
|
||||
* before being added to the list.
|
||||
*
|
||||
* @param base the base to add
|
||||
*/
|
||||
public void addAlternateBase(VCFGenotypeEncoding base) {
|
||||
if (!mAlts.contains(base)) mAlts.add(base);
|
||||
}
|
||||
|
||||
public void setAlternateBases(List<VCFGenotypeEncoding> bases) {
|
||||
mAlts.clear();
|
||||
for ( VCFGenotypeEncoding e : bases )
|
||||
addAlternateBase(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* add an info field to the record
|
||||
*
|
||||
* @param key the key, from the spec or a user created key
|
||||
* @param value it's value as a string
|
||||
*/
|
||||
public void addInfoField(String key, String value) {
|
||||
//System.out.printf("Adding info field %s=%s%n", key, value);
|
||||
mInfoFields.put(key, value);
|
||||
}
|
||||
|
||||
public void printInfoFields() {
|
||||
for ( Map.Entry<String, String> e : mInfoFields.entrySet() ) {
|
||||
System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* add an info field to the record
|
||||
*
|
||||
* @param m A map from info keys to info values
|
||||
*/
|
||||
public void addInfoFields(Map<String,String> m) {
|
||||
for ( Map.Entry<String, String> e : m.entrySet() )
|
||||
addInfoField(e.getKey(), e.getValue());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* the generation of a string representation, which is used by the VCF writer
|
||||
*
|
||||
* @param header the VCF header for this VCF Record
|
||||
* @return a string
|
||||
*/
|
||||
public String toStringEncoding(VCFHeader header) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
|
||||
builder.append(mContig);
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(mPosition);
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(getID());
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(getReference());
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
List<VCFGenotypeEncoding> alts = getAlternateAlleles();
|
||||
if ( alts.size() > 0 ) {
|
||||
builder.append(alts.get(0));
|
||||
for ( int i = 1; i < alts.size(); i++ ) {
|
||||
builder.append(",");
|
||||
builder.append(alts.get(i));
|
||||
}
|
||||
} else {
|
||||
builder.append(EMPTY_ALLELE_FIELD);
|
||||
}
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
if ( (int)mQual == MISSING_GENOTYPE_QUALITY )
|
||||
builder.append(MISSING_GENOTYPE_QUALITY);
|
||||
else
|
||||
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(ParsingUtils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
|
||||
builder.append(FIELD_SEPERATOR);
|
||||
builder.append(createInfoString());
|
||||
|
||||
if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) {
|
||||
// try {
|
||||
addGenotypeData(builder, header);
|
||||
// } catch (Exception e) {
|
||||
// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
|
||||
// throw new RuntimeException(e);
|
||||
// }
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* create the info string
|
||||
*
|
||||
* @return a string representing the infomation fields
|
||||
*/
|
||||
protected String createInfoString() {
|
||||
StringBuffer info = new StringBuffer();
|
||||
boolean isFirst = true;
|
||||
for (Map.Entry<String, String> entry : mInfoFields.entrySet()) {
|
||||
if ( isFirst )
|
||||
isFirst = false;
|
||||
else
|
||||
info.append(INFO_FIELD_SEPERATOR);
|
||||
info.append(entry.getKey());
|
||||
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
|
||||
info.append("=");
|
||||
info.append(entry.getValue());
|
||||
}
|
||||
}
|
||||
return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* add the genotype data
|
||||
*
|
||||
* @param builder the string builder
|
||||
* @param header the header object
|
||||
*/
|
||||
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
|
||||
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords());
|
||||
|
||||
StringBuffer tempStr = new StringBuffer();
|
||||
if ( header.getGenotypeSamples().size() < getVCFGenotypeRecords().size() ) {
|
||||
for ( String sample : gMap.keySet() ) {
|
||||
if ( !header.getGenotypeSamples().contains(sample) )
|
||||
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
|
||||
else
|
||||
header.getGenotypeSamples().remove(sample);
|
||||
}
|
||||
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
|
||||
}
|
||||
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
|
||||
|
||||
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
|
||||
|
||||
for ( String genotype : header.getGenotypeSamples() ) {
|
||||
tempStr.append(FIELD_SEPERATOR);
|
||||
if ( gMap.containsKey(genotype) ) {
|
||||
VCFGenotypeRecord rec = gMap.get(genotype);
|
||||
tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings));
|
||||
gMap.remove(genotype);
|
||||
} else {
|
||||
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings));
|
||||
}
|
||||
}
|
||||
if ( gMap.size() != 0 ) {
|
||||
for ( String sample : gMap.keySet() )
|
||||
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
|
||||
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
|
||||
}
|
||||
|
||||
builder.append(tempStr);
|
||||
}
|
||||
|
||||
/**
|
||||
* compare two VCF records
|
||||
*
|
||||
* @param other the other VCF record
|
||||
* @return true if they're equal
|
||||
*/
|
||||
public boolean equals(VCFRecord other) {
|
||||
if (!this.mAlts.equals(other.mAlts)) return false;
|
||||
if (!this.mReferenceBases.equals(other.mReferenceBases)) return false;
|
||||
if (!this.mContig.equals(other.mContig)) return false;
|
||||
if (mPosition != other.mPosition) return false;
|
||||
if (!this.mID.equals(other.mID)) return false;
|
||||
if (this.mQual != other.mQual) return false;
|
||||
if ( this.mFilterString == null ) {
|
||||
if ( other.mFilterString != null ) return false;
|
||||
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
|
||||
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
|
||||
if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a genotype mapping from a list and their sample names
|
||||
*
|
||||
* @param list a list of genotype samples
|
||||
* @return a mapping of the sample name to VCF genotype record
|
||||
*/
|
||||
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
|
||||
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
VCFGenotypeRecord rec = list.get(i);
|
||||
map.put(rec.getSampleName(), rec);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
/** Return the features reference sequence name, e.g chromosome or contig */
|
||||
public String getChr() {
|
||||
return this.mContig;
|
||||
}
|
||||
|
||||
/** Return the start position in 1-based coordinates (first base is 1) */
|
||||
public int getStart() {
|
||||
return this.mPosition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the end position following 1-based fully closed conventions. The length of a feature is
|
||||
* end - start + 1;
|
||||
*/
|
||||
public int getEnd() {
|
||||
return this.mPosition;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the VCF header we're associated with
|
||||
* @param header the header
|
||||
*/
|
||||
void setHeader(VCFHeader header) {
|
||||
vcfHeader = header;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the associated header
|
||||
* @return the VCF Header
|
||||
*/
|
||||
public VCFHeader getHeader() {
|
||||
return vcfHeader;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue