moving VCF 3.3 back into the GATK so Guillermo can make changes for VCF 4 output
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3639 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b3edb7dc08
commit
d3848745ab
|
|
@ -0,0 +1,125 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
import org.broad.tribble.FeatureCodec;
|
||||||
|
import org.broad.tribble.exception.CodecLineParsingException;
|
||||||
|
import org.broad.tribble.util.LineReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class VCFCodec
|
||||||
|
*
|
||||||
|
* The codec for VCF, which relies on VCFReaderUtils to do most of the processing
|
||||||
|
*/
|
||||||
|
public class VCFCodec implements FeatureCodec {
|
||||||
|
|
||||||
|
// we have to store the list of strings that make up the header until they're needed
|
||||||
|
private List<String> headerStrings = new ArrayList<String>();
|
||||||
|
private VCFHeader header = null;
|
||||||
|
private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3;
|
||||||
|
|
||||||
|
|
||||||
|
// some classes need to transform the line before
|
||||||
|
private LineTransform transformer = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fast path to get the location of the Feature for indexing
|
||||||
|
* @param line the input line to decode
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Feature decodeLoc(String line) {
|
||||||
|
return reallyDecode(line, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decode a line as a Feature.
|
||||||
|
*
|
||||||
|
* @param line
|
||||||
|
*
|
||||||
|
* @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is
|
||||||
|
* a comment)
|
||||||
|
*/
|
||||||
|
public Feature decode(String line) {
|
||||||
|
return reallyDecode(line, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Feature reallyDecode(String line, boolean justLocationPlease ) {
|
||||||
|
// transform the line, if we have a transform to do
|
||||||
|
if (transformer != null) line = transformer.lineTransform(line);
|
||||||
|
if (line.startsWith("#"))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
// make a VCFRecord of the line and return it
|
||||||
|
VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease);
|
||||||
|
if ( ! justLocationPlease ) rec.setHeader(header);
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the # of header lines for this file. We use this to parse out the header
|
||||||
|
*
|
||||||
|
* @return 0
|
||||||
|
*/
|
||||||
|
public int readHeader(LineReader reader) {
|
||||||
|
String line = "";
|
||||||
|
try {
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
if (line.startsWith("##")) {
|
||||||
|
headerStrings.add(line);
|
||||||
|
}
|
||||||
|
else if (line.startsWith("#")) {
|
||||||
|
headerStrings.add(line);
|
||||||
|
header = VCFReaderUtils.createHeader(headerStrings,version);
|
||||||
|
return headerStrings.size();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("IO Exception ", e);
|
||||||
|
}
|
||||||
|
throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return VCFRecord.class
|
||||||
|
*/
|
||||||
|
public Class getFeatureType() {
|
||||||
|
return VCFRecord.class;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFHeader getHeader(Class clazz) throws ClassCastException {
|
||||||
|
if (!clazz.equals(VCFHeader.class))
|
||||||
|
throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class);
|
||||||
|
return header;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static interface LineTransform {
|
||||||
|
public String lineTransform(String line);
|
||||||
|
}
|
||||||
|
|
||||||
|
public LineTransform getTransformer() {
|
||||||
|
return transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTransformer(LineTransform transformer) {
|
||||||
|
this.transformer = transformer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFHeaderVersion getVersion() {
|
||||||
|
return version;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVersion(VCFHeaderVersion version) {
|
||||||
|
this.version = version;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author ebanks
|
||||||
|
* A class representing a key=value entry for FILTER fields in the VCF header
|
||||||
|
*/
|
||||||
|
public class VCFFilterHeaderLine extends VCFHeaderLine {
|
||||||
|
|
||||||
|
private String mName;
|
||||||
|
private String mDescription;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF filter header line
|
||||||
|
*
|
||||||
|
* @param name the name for this header line
|
||||||
|
* @param description the description for this header line
|
||||||
|
*/
|
||||||
|
public VCFFilterHeaderLine(String name, String description) {
|
||||||
|
super("FILTER", "");
|
||||||
|
mName = name;
|
||||||
|
mDescription = description;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF info header line
|
||||||
|
*
|
||||||
|
* @param line the header line
|
||||||
|
* @param version the vcf header version
|
||||||
|
*/
|
||||||
|
protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
|
||||||
|
super("FILTER", "", version);
|
||||||
|
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
||||||
|
mName = mapping.get("ID");
|
||||||
|
mDescription = mapping.get("Description");
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String makeStringRep() {
|
||||||
|
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||||
|
return String.format("FILTER=%s,\"%s\"", mName, mDescription);
|
||||||
|
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||||
|
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||||
|
map.put("ID",mName);
|
||||||
|
map.put("Description",mDescription);
|
||||||
|
return "FILTER=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||||
|
}
|
||||||
|
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if ( !(o instanceof VCFFilterHeaderLine) )
|
||||||
|
return false;
|
||||||
|
VCFFilterHeaderLine other = (VCFFilterHeaderLine)o;
|
||||||
|
return mName.equals(other.mName) &&
|
||||||
|
mDescription.equals(other.mDescription);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmName() {
|
||||||
|
return mName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmDescription() {
|
||||||
|
return mDescription;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,118 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author ebanks
|
||||||
|
* <p/>
|
||||||
|
* Class VCFFormatHeaderLine
|
||||||
|
* <p/>
|
||||||
|
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
|
||||||
|
*/
|
||||||
|
public class VCFFormatHeaderLine extends VCFHeaderLine {
|
||||||
|
|
||||||
|
// the format field types
|
||||||
|
public enum FORMAT_TYPE {
|
||||||
|
Integer, Float, String;
|
||||||
|
public Object convert(String value) {
|
||||||
|
switch (this) {
|
||||||
|
case Integer:
|
||||||
|
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
|
||||||
|
case Float:
|
||||||
|
return java.lang.Float.valueOf(value);
|
||||||
|
case String:
|
||||||
|
return value;
|
||||||
|
default:
|
||||||
|
throw new IllegalStateException("field." + this + " doesn't have a set conversion approach");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String mName;
|
||||||
|
private int mCount;
|
||||||
|
private String mDescription;
|
||||||
|
private FORMAT_TYPE mType;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF format header line
|
||||||
|
*
|
||||||
|
* @param name the name for this header line
|
||||||
|
* @param count the count for this header line
|
||||||
|
* @param type the type for this header line
|
||||||
|
* @param description the description for this header line
|
||||||
|
*/
|
||||||
|
public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) {
|
||||||
|
super("FORMAT", "");
|
||||||
|
mName = name;
|
||||||
|
mCount = count;
|
||||||
|
mType = type;
|
||||||
|
mDescription = description;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF format header line
|
||||||
|
*
|
||||||
|
* @param line the header line
|
||||||
|
* @param version the VCF header version
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
|
||||||
|
super("FORMAT", "", version);
|
||||||
|
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||||
|
mName = mapping.get("ID");
|
||||||
|
mCount = Integer.valueOf(mapping.get("Number"));
|
||||||
|
mType = FORMAT_TYPE.valueOf(mapping.get("Type"));
|
||||||
|
mDescription = mapping.get("Description");
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String makeStringRep() {
|
||||||
|
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||||
|
return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||||
|
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||||
|
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||||
|
map.put("ID",mName);
|
||||||
|
map.put("Number",mCount);
|
||||||
|
map.put("Type",mType);
|
||||||
|
map.put("Description",mDescription);
|
||||||
|
return "FORMAT=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||||
|
}
|
||||||
|
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() { return mName; }
|
||||||
|
public int getCount() { return mCount; }
|
||||||
|
public String getDescription() { return mDescription; }
|
||||||
|
public FORMAT_TYPE getType() { return mType; }
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if ( !(o instanceof VCFFormatHeaderLine) )
|
||||||
|
return false;
|
||||||
|
VCFFormatHeaderLine other = (VCFFormatHeaderLine)o;
|
||||||
|
return mName.equals(other.mName) &&
|
||||||
|
mCount == other.mCount &&
|
||||||
|
mDescription.equals(other.mDescription) &&
|
||||||
|
mType == other.mType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmName() {
|
||||||
|
return mName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getmCount() {
|
||||||
|
return mCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmDescription() {
|
||||||
|
return mDescription;
|
||||||
|
}
|
||||||
|
|
||||||
|
public FORMAT_TYPE getmType() {
|
||||||
|
return mType;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class VCFGenotypeEncoding
|
||||||
|
* <p/>
|
||||||
|
* basic encoding class for genotype fields in VCF
|
||||||
|
*/
|
||||||
|
public class VCFGenotypeEncoding {
|
||||||
|
public enum TYPE {
|
||||||
|
SINGLE_BASE,
|
||||||
|
INSERTION,
|
||||||
|
DELETION,
|
||||||
|
UNCALLED,
|
||||||
|
MIXED // this type is only valid in aggregate, not for a single VCFGenotypeEncoding
|
||||||
|
}
|
||||||
|
|
||||||
|
// our length (0 for SINGLE_BASE), our bases, and our type
|
||||||
|
private final int mLength;
|
||||||
|
private final String mBases;
|
||||||
|
private final TYPE mType;
|
||||||
|
|
||||||
|
// public constructor, that parses out the base string
|
||||||
|
public VCFGenotypeEncoding(String baseString) {
|
||||||
|
if ((baseString.length() == 1)) {
|
||||||
|
// are we an empty (no-call) genotype?
|
||||||
|
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
|
||||||
|
mBases = VCFGenotypeRecord.EMPTY_ALLELE;
|
||||||
|
mLength = 0;
|
||||||
|
mType = TYPE.UNCALLED;
|
||||||
|
} else if (!validBases(baseString)) {
|
||||||
|
throw new IllegalArgumentException("Alleles of length 1 must be one of A,C,G,T, " + baseString + " was passed in");
|
||||||
|
} else { // we're a valid base
|
||||||
|
mBases = baseString.toUpperCase();
|
||||||
|
mLength = 0;
|
||||||
|
mType = TYPE.SINGLE_BASE;
|
||||||
|
}
|
||||||
|
} else { // deletion or insertion
|
||||||
|
if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) {
|
||||||
|
throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
|
||||||
|
}
|
||||||
|
if (baseString.toUpperCase().charAt(0) == 'D') {
|
||||||
|
mLength = Integer.valueOf(baseString.substring(1, baseString.length()));
|
||||||
|
mBases = "";
|
||||||
|
mType = TYPE.DELETION;
|
||||||
|
} else { // we're an I
|
||||||
|
mBases = baseString.substring(1, baseString.length()).toUpperCase();
|
||||||
|
if (!validBases(mBases))
|
||||||
|
throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
|
||||||
|
mLength = mBases.length();
|
||||||
|
mType = TYPE.INSERTION;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLength() {
|
||||||
|
return mLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBases() {
|
||||||
|
return mBases;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TYPE getType() {
|
||||||
|
return mType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if ( obj == null )
|
||||||
|
return false;
|
||||||
|
if ( obj instanceof VCFGenotypeEncoding ) {
|
||||||
|
VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj;
|
||||||
|
return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength);
|
||||||
|
}
|
||||||
|
if ( mType == TYPE.UNCALLED && obj.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) )
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
// our underlying data is immutable, so this is safe (we won't strand a value in a hashtable somewhere
|
||||||
|
// when the data changes underneath, altering this value).
|
||||||
|
String str = this.mBases + String.valueOf(this.mLength) + this.mType.toString();
|
||||||
|
return str.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* dump the string representation of this genotype encoding
|
||||||
|
*
|
||||||
|
* @return string representation
|
||||||
|
*/
|
||||||
|
public String toString() {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
switch (mType) {
|
||||||
|
case SINGLE_BASE:
|
||||||
|
case UNCALLED:
|
||||||
|
builder.append(mBases);
|
||||||
|
break;
|
||||||
|
case INSERTION:
|
||||||
|
builder.append("I");
|
||||||
|
builder.append(mBases);
|
||||||
|
break;
|
||||||
|
case DELETION:
|
||||||
|
builder.append("D");
|
||||||
|
builder.append(mLength);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* ensure that string contains valid bases
|
||||||
|
*
|
||||||
|
* @param bases the bases to check
|
||||||
|
*
|
||||||
|
* @return true if they're all either A,C,G,T; false otherwise
|
||||||
|
*/
|
||||||
|
private static boolean validBases(String bases) {
|
||||||
|
for (char c : bases.toUpperCase().toCharArray()) {
|
||||||
|
if (c != 'A' && c != 'C' && c != 'G' && c != 'T')
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,353 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class VCFGenotypeRecord
|
||||||
|
*
|
||||||
|
* A descriptions should go here. Blame aaron if it's missing.
|
||||||
|
*/
|
||||||
|
public class VCFGenotypeRecord {
|
||||||
|
|
||||||
|
// key names
|
||||||
|
public static final String GENOTYPE_KEY = "GT";
|
||||||
|
public static final String GENOTYPE_QUALITY_KEY = "GQ";
|
||||||
|
public static final String DEPTH_KEY = "DP";
|
||||||
|
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
|
||||||
|
public static final String GENOTYPE_FILTER_KEY = "FT";
|
||||||
|
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL";
|
||||||
|
public static final String OLD_DEPTH_KEY = "RD";
|
||||||
|
|
||||||
|
// the values for empty fields
|
||||||
|
public static final String EMPTY_GENOTYPE = "./.";
|
||||||
|
public static final String EMPTY_ALLELE = ".";
|
||||||
|
public static final int MISSING_GENOTYPE_QUALITY = -1;
|
||||||
|
public static final int MISSING_DEPTH = -1;
|
||||||
|
public static final int MISSING_HAPLOTYPE_QUALITY = -1;
|
||||||
|
public static final String PASSES_FILTERS = "0";
|
||||||
|
public static final String UNFILTERED = ".";
|
||||||
|
|
||||||
|
public static final double MAX_QUAL_VALUE = 99.0;
|
||||||
|
|
||||||
|
// what kind of phasing this genotype has
|
||||||
|
public enum PHASE {
|
||||||
|
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||||
|
}
|
||||||
|
|
||||||
|
// our record
|
||||||
|
private VCFRecord mRecord;
|
||||||
|
|
||||||
|
// our phasing
|
||||||
|
private PHASE mPhaseType;
|
||||||
|
|
||||||
|
// our bases(s)
|
||||||
|
private final List<VCFGenotypeEncoding> mGenotypeAlleles = new ArrayList<VCFGenotypeEncoding>();
|
||||||
|
|
||||||
|
// our mapping of the format mFields to values
|
||||||
|
private final Map<String, String> mFields = new HashMap<String, String>();
|
||||||
|
|
||||||
|
// our sample name
|
||||||
|
private String mSampleName;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a VCF genotype record
|
||||||
|
*
|
||||||
|
* @param sampleName sample name
|
||||||
|
* @param genotypes list of genotypes
|
||||||
|
* @param phasing phasing
|
||||||
|
*/
|
||||||
|
public VCFGenotypeRecord(String sampleName, List<VCFGenotypeEncoding> genotypes, PHASE phasing) {
|
||||||
|
mSampleName = sampleName;
|
||||||
|
if (genotypes != null)
|
||||||
|
this.mGenotypeAlleles.addAll(genotypes);
|
||||||
|
mPhaseType = phasing;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVCFRecord(VCFRecord record) {
|
||||||
|
mRecord = record;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSampleName(String name) {
|
||||||
|
mSampleName = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a field to the genotype record.
|
||||||
|
* Throws an exception if the key is GT, as that's computed internally.
|
||||||
|
*
|
||||||
|
* @param key the field name (use static variables above for common fields)
|
||||||
|
* @param value the field value
|
||||||
|
*/
|
||||||
|
public void setField(String key, String value) {
|
||||||
|
// make sure the GT field isn't being set
|
||||||
|
if ( key.equals(GENOTYPE_KEY) )
|
||||||
|
throw new IllegalArgumentException("Setting the GT field is not allowed as that's done internally");
|
||||||
|
mFields.put(key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* determine the phase of the genotype
|
||||||
|
*
|
||||||
|
* @param phase the string that contains the phase character
|
||||||
|
*
|
||||||
|
* @return the phase
|
||||||
|
*/
|
||||||
|
static PHASE determinePhase(String phase) {
|
||||||
|
// find the phasing information
|
||||||
|
if (phase.equals("/"))
|
||||||
|
return PHASE.UNPHASED;
|
||||||
|
else if (phase.equals("|"))
|
||||||
|
return PHASE.PHASED;
|
||||||
|
else if (phase.equals("\\"))
|
||||||
|
return PHASE.PHASED_SWITCH_PROB;
|
||||||
|
else
|
||||||
|
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public PHASE getPhaseType() {
|
||||||
|
return mPhaseType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSampleName() {
|
||||||
|
return mSampleName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<VCFGenotypeEncoding> getAlleles() {
|
||||||
|
return mGenotypeAlleles;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> getFields() {
|
||||||
|
return mFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the phred-scaled quality score
|
||||||
|
*/
|
||||||
|
public double getQual() {
|
||||||
|
return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) : MISSING_GENOTYPE_QUALITY);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isMissingQual() {
|
||||||
|
return (int)getQual() == MISSING_GENOTYPE_QUALITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getNegLog10PError() {
|
||||||
|
double qual = getQual();
|
||||||
|
return (qual == MISSING_GENOTYPE_QUALITY ? MISSING_GENOTYPE_QUALITY : qual / 10.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getReadCount() {
|
||||||
|
return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLocation() {
|
||||||
|
return mRecord != null ? mRecord.getChr() + ":" + mRecord.getPosition() : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getReference() {
|
||||||
|
return mRecord != null ? mRecord.getReference() : "N";
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getBases() {
|
||||||
|
String genotype = "";
|
||||||
|
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles )
|
||||||
|
genotype += encoding.getBases();
|
||||||
|
return genotype;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isVariant(char ref) {
|
||||||
|
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||||
|
if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||||
|
continue;
|
||||||
|
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE ||
|
||||||
|
encoding.getBases().charAt(0) != ref )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isPointGenotype() {
|
||||||
|
return (mRecord != null ? !mRecord.isIndel() : true);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isHom() {
|
||||||
|
if ( mGenotypeAlleles.size() == 0 )
|
||||||
|
return true;
|
||||||
|
|
||||||
|
String bases = mGenotypeAlleles.get(0).getBases();
|
||||||
|
for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) {
|
||||||
|
if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) )
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isHet() {
|
||||||
|
return !isHom();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isNoCall() {
|
||||||
|
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||||
|
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isFiltered() {
|
||||||
|
return ( mFields.get(GENOTYPE_FILTER_KEY) != null &&
|
||||||
|
!mFields.get(GENOTYPE_FILTER_KEY).equals(UNFILTERED) &&
|
||||||
|
!mFields.get(GENOTYPE_FILTER_KEY).equals(PASSES_FILTERS));
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPloidy() {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFRecord getRecord() {
|
||||||
|
return mRecord;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String toGenotypeString(List<VCFGenotypeEncoding> altAlleles) {
|
||||||
|
String str = "";
|
||||||
|
boolean first = true;
|
||||||
|
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
||||||
|
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||||
|
str += VCFGenotypeRecord.EMPTY_ALLELE;
|
||||||
|
else
|
||||||
|
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
||||||
|
if (first) {
|
||||||
|
switch (mPhaseType) {
|
||||||
|
case UNPHASED:
|
||||||
|
str += "/";
|
||||||
|
break;
|
||||||
|
case PHASED:
|
||||||
|
str += "|";
|
||||||
|
break;
|
||||||
|
case PHASED_SWITCH_PROB:
|
||||||
|
str += "\\";
|
||||||
|
break;
|
||||||
|
case UNKNOWN:
|
||||||
|
throw new UnsupportedOperationException("Unknown phase type");
|
||||||
|
}
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return String.format("[VCFGenotype %s %s %s %s]", getLocation(), mSampleName, this.mGenotypeAlleles, mFields);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEmptyGenotype() {
|
||||||
|
for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) {
|
||||||
|
if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other instanceof VCFGenotypeRecord) {
|
||||||
|
if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false;
|
||||||
|
if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false;
|
||||||
|
if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false;
|
||||||
|
if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* output a string representation of the VCFGenotypeRecord, given the alternate alleles
|
||||||
|
*
|
||||||
|
* @param altAlleles the alternate alleles, needed for toGenotypeString()
|
||||||
|
* @param genotypeFormatStrings genotype format strings
|
||||||
|
*
|
||||||
|
* @return a string
|
||||||
|
*/
|
||||||
|
public String toStringEncoding(List<VCFGenotypeEncoding> altAlleles, String[] genotypeFormatStrings) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
builder.append(toGenotypeString(altAlleles));
|
||||||
|
|
||||||
|
for ( String field : genotypeFormatStrings ) {
|
||||||
|
if ( field.equals(GENOTYPE_KEY) )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
String value = mFields.get(field);
|
||||||
|
if ( value == null && field.equals(OLD_DEPTH_KEY) )
|
||||||
|
value = mFields.get(DEPTH_KEY);
|
||||||
|
|
||||||
|
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
|
||||||
|
if ( value == null || value.equals("") )
|
||||||
|
builder.append(getMissingFieldValue(field));
|
||||||
|
else
|
||||||
|
builder.append(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* output a string representation of an empty genotype
|
||||||
|
*
|
||||||
|
* @param genotypeFormatStrings genotype format strings
|
||||||
|
*
|
||||||
|
* @return a string
|
||||||
|
*/
|
||||||
|
public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
builder.append(EMPTY_GENOTYPE);
|
||||||
|
|
||||||
|
for ( String field : genotypeFormatStrings ) {
|
||||||
|
if ( field.equals(GENOTYPE_KEY) )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR);
|
||||||
|
builder.append(getMissingFieldValue(field));
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String getMissingFieldValue(String field) {
|
||||||
|
String result = "";
|
||||||
|
if ( field.equals(GENOTYPE_QUALITY_KEY) )
|
||||||
|
result = String.valueOf(MISSING_GENOTYPE_QUALITY);
|
||||||
|
else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) )
|
||||||
|
result = String.valueOf(MISSING_DEPTH);
|
||||||
|
else if ( field.equals(GENOTYPE_FILTER_KEY) )
|
||||||
|
result = UNFILTERED;
|
||||||
|
else if ( field.equals(GENOTYPE_LIKELIHOODS_KEY) )
|
||||||
|
result = "0,0,0";
|
||||||
|
// TODO -- support haplotype quality
|
||||||
|
//else if ( field.equals(HAPLOTYPE_QUALITY_KEY) )
|
||||||
|
// result = String.valueOf(MISSING_HAPLOTYPE_QUALITY);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings(VCFHeaderVersion version) {
|
||||||
|
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
|
||||||
|
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype"));
|
||||||
|
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality"));
|
||||||
|
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)"));
|
||||||
|
result.add(new VCFFormatHeaderLine(GENOTYPE_LIKELIHOODS_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic"));
|
||||||
|
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void replaceFields(HashMap<String,String> newFields) {
|
||||||
|
mFields.clear();
|
||||||
|
for ( String s : newFields.keySet() ) {
|
||||||
|
mFields.put(s,newFields.get(s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,137 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class VCFHeader
|
||||||
|
* <p/>
|
||||||
|
* A class representing the VCF header
|
||||||
|
*/
|
||||||
|
public class VCFHeader {
|
||||||
|
|
||||||
|
// the manditory header fields
|
||||||
|
public enum HEADER_FIELDS {
|
||||||
|
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||||
|
}
|
||||||
|
|
||||||
|
// the associated meta data
|
||||||
|
private final Set<VCFHeaderLine> mMetaData;
|
||||||
|
|
||||||
|
// the list of auxillary tags
|
||||||
|
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||||
|
|
||||||
|
// the character string that indicates meta data
|
||||||
|
public static final String METADATA_INDICATOR = "##";
|
||||||
|
|
||||||
|
// the header string indicator
|
||||||
|
public static final String HEADER_INDICATOR = "#";
|
||||||
|
|
||||||
|
// our header versionVCF
|
||||||
|
private VCFHeaderVersion versionVCF;
|
||||||
|
|
||||||
|
/** do we have genotying data? */
|
||||||
|
private boolean hasGenotypingData = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header, given a list of meta data and auxillary tags
|
||||||
|
*
|
||||||
|
* @param metaData the meta data associated with this header
|
||||||
|
*/
|
||||||
|
public VCFHeader(Set<VCFHeaderLine> metaData) {
|
||||||
|
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||||
|
checkVCFVersion();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header, given a list of meta data and auxillary tags
|
||||||
|
*
|
||||||
|
* @param metaData the meta data associated with this header
|
||||||
|
* @param genotypeSampleNames the genotype format field, and the sample names
|
||||||
|
*/
|
||||||
|
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
|
||||||
|
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||||
|
for (String col : genotypeSampleNames) {
|
||||||
|
if (!col.equals("FORMAT"))
|
||||||
|
mGenotypeSampleNames.add(col);
|
||||||
|
}
|
||||||
|
if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
|
||||||
|
checkVCFVersion();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* check our metadata for a VCF versionVCF tag, and throw an exception if the versionVCF is out of date
|
||||||
|
* or the versionVCF is not present
|
||||||
|
*/
|
||||||
|
// TODO: fix this function
|
||||||
|
public void checkVCFVersion() {
|
||||||
|
VCFHeaderVersion version;
|
||||||
|
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
|
||||||
|
for ( VCFHeaderLine line : mMetaData )
|
||||||
|
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
|
||||||
|
version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey());
|
||||||
|
if (version == null)
|
||||||
|
{
|
||||||
|
toRemove.add(line);
|
||||||
|
}
|
||||||
|
/**throw new RuntimeException("VCF version " + line.getValue() +
|
||||||
|
" is not supported; only versionVCF " + VCFHeaderVersion.VCF3_2 + " and greater can be used");*/
|
||||||
|
else return;
|
||||||
|
}
|
||||||
|
// remove old header lines for now,
|
||||||
|
mMetaData.removeAll(toRemove);
|
||||||
|
mMetaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString()));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the header fields in order they're presented in the input file (which is now required to be
|
||||||
|
* the order presented in the spec).
|
||||||
|
*
|
||||||
|
* @return a set of the header fields, in order
|
||||||
|
*/
|
||||||
|
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||||
|
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
|
||||||
|
for (HEADER_FIELDS field : HEADER_FIELDS.values())
|
||||||
|
fields.add(field);
|
||||||
|
return fields;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the meta data, associated with this header
|
||||||
|
*
|
||||||
|
* @return a set of the meta data
|
||||||
|
*/
|
||||||
|
public Set<VCFHeaderLine> getMetaData() {
|
||||||
|
return mMetaData;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the genotyping sample names
|
||||||
|
*
|
||||||
|
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
|
||||||
|
*/
|
||||||
|
public Set<String> getGenotypeSamples() {
|
||||||
|
return mGenotypeSampleNames;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* do we have genotyping data?
|
||||||
|
*
|
||||||
|
* @return true if we have genotyping columns, false otherwise
|
||||||
|
*/
|
||||||
|
public boolean hasGenotypingData() {
|
||||||
|
return hasGenotypingData;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @return the column count, */
|
||||||
|
public int getColumnCount() {
|
||||||
|
return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author ebanks
|
||||||
|
* <p/>
|
||||||
|
* Class VCFHeaderLine
|
||||||
|
* <p/>
|
||||||
|
* A class representing a key=value entry in the VCF header
|
||||||
|
*/
|
||||||
|
public class VCFHeaderLine implements Comparable {
|
||||||
|
|
||||||
|
private String stringRep = null;
|
||||||
|
private String mKey = null;
|
||||||
|
private String mValue = null;
|
||||||
|
protected VCFHeaderVersion mVersion = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header line
|
||||||
|
*
|
||||||
|
* @param key the key for this header line
|
||||||
|
* @param value the value for this header line
|
||||||
|
*/
|
||||||
|
public VCFHeaderLine(String key, String value, VCFHeaderVersion version) {
|
||||||
|
mKey = key;
|
||||||
|
mValue = value;
|
||||||
|
mVersion = version;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header line
|
||||||
|
*
|
||||||
|
* @param key the key for this header line
|
||||||
|
* @param value the value for this header line
|
||||||
|
*/
|
||||||
|
public VCFHeaderLine(String key, String value) {
|
||||||
|
mKey = key;
|
||||||
|
mValue = value;
|
||||||
|
mVersion = VCFHeaderVersion.VCF3_3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the key
|
||||||
|
*
|
||||||
|
* @return the key
|
||||||
|
*/
|
||||||
|
public String getKey() {
|
||||||
|
return mKey;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the key
|
||||||
|
*
|
||||||
|
* @param key the key for this header line
|
||||||
|
*/
|
||||||
|
public void setKey(String key) {
|
||||||
|
mKey = key;
|
||||||
|
stringRep = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the value
|
||||||
|
*
|
||||||
|
* @return the value
|
||||||
|
*/
|
||||||
|
public String getValue() {
|
||||||
|
return mValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the value
|
||||||
|
*
|
||||||
|
* @param value the value for this header line
|
||||||
|
*/
|
||||||
|
public void setValue(String value) {
|
||||||
|
mValue = value;
|
||||||
|
stringRep = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
if ( stringRep == null )
|
||||||
|
stringRep = makeStringRep();
|
||||||
|
return stringRep;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String makeStringRep() {
|
||||||
|
return mKey + "=" + mValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if ( !(o instanceof VCFHeaderLine) )
|
||||||
|
return false;
|
||||||
|
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
public int compareTo(Object other) {
|
||||||
|
return toString().compareTo(other.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,161 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: aaron
|
||||||
|
* Date: Jun 17, 2010
|
||||||
|
* Time: 12:28:46 PM
|
||||||
|
* To change this template use File | Settings | File Templates.
|
||||||
|
*/
|
||||||
|
public class VCFHeaderLineTranslator {
|
||||||
|
private static Map<VCFHeaderVersion,VCFLineParser> mapping;
|
||||||
|
|
||||||
|
static {
|
||||||
|
mapping = new HashMap<VCFHeaderVersion,VCFLineParser>();
|
||||||
|
mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser());
|
||||||
|
mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
|
||||||
|
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String toValue(VCFHeaderVersion version, Map<String,Object> keyValues) {
|
||||||
|
return mapping.get(version).toValue(keyValues);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
interface VCFLineParser {
|
||||||
|
public String toValue(Map<String,? extends Object> keyValues);
|
||||||
|
public Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* a class that handles the to and from disk for VCF 4 lines
|
||||||
|
*/
|
||||||
|
class VCF4Parser implements VCFLineParser {
|
||||||
|
Set<String> bracketed = new HashSet<String>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a string of a mapping pair for the target VCF version
|
||||||
|
* @param keyValues a mapping of the key->value pairs to output
|
||||||
|
* @return a string, correctly formatted
|
||||||
|
*/
|
||||||
|
public String toValue(Map<String, ? extends Object> keyValues) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
builder.append("<");
|
||||||
|
boolean start = true;
|
||||||
|
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||||
|
if (start) start = false;
|
||||||
|
else builder.append(",");
|
||||||
|
builder.append(entry.getKey());
|
||||||
|
builder.append("=");
|
||||||
|
builder.append(entry.getValue().toString().contains(",") ||
|
||||||
|
entry.getValue().toString().contains(" ") ||
|
||||||
|
entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||||
|
}
|
||||||
|
builder.append(">");
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* parse a VCF4 line
|
||||||
|
* @param valueLine the line
|
||||||
|
* @return a mapping of the tags parsed out
|
||||||
|
*/
|
||||||
|
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||||
|
// our return map
|
||||||
|
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
|
// a builder to store up characters as we go
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
|
||||||
|
// store the key when we're parsing out the values
|
||||||
|
String key = "";
|
||||||
|
|
||||||
|
// where are we in the stream of characters?
|
||||||
|
int index = 0;
|
||||||
|
|
||||||
|
// are we inside a quotation? we don't special case ',' then
|
||||||
|
boolean inQuote = false;
|
||||||
|
|
||||||
|
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
|
||||||
|
for (char c: valueLine.toCharArray()) {
|
||||||
|
switch (c) {
|
||||||
|
case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it
|
||||||
|
case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list
|
||||||
|
case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder
|
||||||
|
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
|
||||||
|
case (',') : if (!inQuote) { ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; } // drop the current key value to the return map
|
||||||
|
default: builder.append(c); // otherwise simply append to the current string
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// validate the tags against the expected list
|
||||||
|
index = 0;
|
||||||
|
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||||
|
for (String str : ret.keySet()) {
|
||||||
|
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class VCF3Parser implements VCFLineParser {
|
||||||
|
|
||||||
|
public String toValue(Map<String, ? extends Object> keyValues) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
boolean start = true;
|
||||||
|
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||||
|
if (start) start = false;
|
||||||
|
else builder.append(",");
|
||||||
|
builder.append(entry.getValue().toString().contains(",") || entry.getValue().toString().contains(" ")? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||||
|
}
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||||
|
// our return map
|
||||||
|
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||||
|
|
||||||
|
// a builder to store up characters as we go
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
|
||||||
|
// store the key when we're parsing out the values
|
||||||
|
String key = "";
|
||||||
|
|
||||||
|
// where are we in the stream of characters?
|
||||||
|
int index = 0;
|
||||||
|
// where in the expected tag order are we?
|
||||||
|
int tagIndex = 0;
|
||||||
|
|
||||||
|
// are we inside a quotation? we don't special case ',' then
|
||||||
|
boolean inQuote = false;
|
||||||
|
|
||||||
|
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly
|
||||||
|
for (char c: valueLine.toCharArray()) {
|
||||||
|
switch (c) {
|
||||||
|
case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
|
||||||
|
case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map
|
||||||
|
default: builder.append(c); // otherwise simply append to the current string
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
ret.put(expectedTagOrder.get(tagIndex++),builder.toString());
|
||||||
|
|
||||||
|
// validate the tags against the expected list
|
||||||
|
index = 0;
|
||||||
|
if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
|
||||||
|
for (String str : ret.keySet()){
|
||||||
|
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,78 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* information that identifies each header version
|
||||||
|
*/
|
||||||
|
public enum VCFHeaderVersion {
|
||||||
|
VCF3_2("VCRv3.2","format"),
|
||||||
|
VCF3_3("VCFv3.3","fileformat"),
|
||||||
|
VCF4_0("VCFv4.0","fileformat");
|
||||||
|
|
||||||
|
private final String versionString;
|
||||||
|
private final String formatString;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the enum, privately, using:
|
||||||
|
* @param vString the version string
|
||||||
|
* @param fString the format string
|
||||||
|
*/
|
||||||
|
VCFHeaderVersion(String vString, String fString) {
|
||||||
|
this.versionString = vString;
|
||||||
|
this.formatString = fString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the header version
|
||||||
|
* @param version the version string
|
||||||
|
* @param format the format string
|
||||||
|
* @return a VCFHeaderVersion object
|
||||||
|
*/
|
||||||
|
public static VCFHeaderVersion toHeaderVersion(String version, String format) {
|
||||||
|
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||||
|
if (hv.versionString.equals(version) && hv.formatString.equals(format))
|
||||||
|
return hv;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the header version
|
||||||
|
* @param version the version string
|
||||||
|
* @return a VCFHeaderVersion object
|
||||||
|
*/
|
||||||
|
public static VCFHeaderVersion toHeaderVersion(String version) {
|
||||||
|
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||||
|
if (hv.versionString.equals(version))
|
||||||
|
return hv;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* are we a valid version string of some type
|
||||||
|
* @param version the version string
|
||||||
|
* @return true if we're valid of some type, false otherwise
|
||||||
|
*/
|
||||||
|
public static boolean isVersionString(String version){
|
||||||
|
return toHeaderVersion(version) != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* are we a valid format string for some type
|
||||||
|
* @param format the format string
|
||||||
|
* @return true if we're valid of some type, false otherwise
|
||||||
|
*/
|
||||||
|
public static boolean isFormatString(String format){
|
||||||
|
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||||
|
if (hv.formatString.equals(format))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getVersionString() {
|
||||||
|
return versionString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFormatString() {
|
||||||
|
return formatString;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,121 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author ebanks
|
||||||
|
* <p/>
|
||||||
|
* Class VCFInfoHeaderLine
|
||||||
|
* <p/>
|
||||||
|
* A class representing a key=value entry for INFO fields in the VCF header
|
||||||
|
*/
|
||||||
|
public class VCFInfoHeaderLine extends VCFHeaderLine {
|
||||||
|
|
||||||
|
// the info field types
|
||||||
|
public enum INFO_TYPE {
|
||||||
|
Integer, Float, String, Character, Flag;
|
||||||
|
|
||||||
|
public Object convert(String value) {
|
||||||
|
switch (this) {
|
||||||
|
case Integer:
|
||||||
|
return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name
|
||||||
|
case Float:
|
||||||
|
return java.lang.Float.valueOf(value);
|
||||||
|
case String:
|
||||||
|
case Character:
|
||||||
|
return value;
|
||||||
|
case Flag:
|
||||||
|
return value.equals("0") ? false : true;
|
||||||
|
default:
|
||||||
|
throw new IllegalStateException("INFO_TYPE." + this + " doesn't have a set conversion approach");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String mName;
|
||||||
|
private int mCount;
|
||||||
|
private String mDescription;
|
||||||
|
private INFO_TYPE mType;
|
||||||
|
|
||||||
|
|
||||||
|
// info line numerical values are allowed to be unbounded (or unknown), which is
|
||||||
|
// marked with a dot (.)
|
||||||
|
public static int UNBOUNDED = Integer.MIN_VALUE;
|
||||||
|
public static String UNBOUNDED_ENCODING = ".";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF info header line
|
||||||
|
*
|
||||||
|
* @param name the name for this header line
|
||||||
|
* @param count the count for this header line
|
||||||
|
* @param type the type for this header line
|
||||||
|
* @param description the description for this header line
|
||||||
|
*/
|
||||||
|
public VCFInfoHeaderLine(String name, int count, INFO_TYPE type, String description) {
|
||||||
|
super("INFO", "");
|
||||||
|
mName = name;
|
||||||
|
mCount = count;
|
||||||
|
mType = type;
|
||||||
|
mDescription = description;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF info header line
|
||||||
|
*
|
||||||
|
* @param line the header line
|
||||||
|
* @param version the VCF version
|
||||||
|
*/
|
||||||
|
protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
|
||||||
|
super("INFO", "", version);
|
||||||
|
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||||
|
mName = mapping.get("ID");
|
||||||
|
mCount = mapping.get("Number").equals(UNBOUNDED_ENCODING) ? UNBOUNDED : Integer.valueOf(mapping.get("Number"));
|
||||||
|
mType = INFO_TYPE.valueOf(mapping.get("Type"));
|
||||||
|
mDescription = mapping.get("Description");
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String makeStringRep() {
|
||||||
|
if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2)
|
||||||
|
return String.format("INFO=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
|
||||||
|
else if (mVersion == VCFHeaderVersion.VCF4_0) {
|
||||||
|
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||||
|
map.put("ID",mName);
|
||||||
|
map.put("Number",mCount == UNBOUNDED ? UNBOUNDED_ENCODING : mCount);
|
||||||
|
map.put("Type",mType);
|
||||||
|
map.put("Description",mDescription);
|
||||||
|
return "INFO=" + VCFHeaderLineTranslator.toValue(this.mVersion,map);
|
||||||
|
}
|
||||||
|
else throw new RuntimeException("Unsupported VCFVersion " + mVersion);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if ( !(o instanceof VCFInfoHeaderLine) )
|
||||||
|
return false;
|
||||||
|
VCFInfoHeaderLine other = (VCFInfoHeaderLine)o;
|
||||||
|
return mName.equals(other.mName) &&
|
||||||
|
mCount == other.mCount &&
|
||||||
|
mDescription.equals(other.mDescription) &&
|
||||||
|
mType == other.mType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmName() {
|
||||||
|
return mName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getmCount() {
|
||||||
|
return mCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getmDescription() {
|
||||||
|
return mDescription;
|
||||||
|
}
|
||||||
|
|
||||||
|
public INFO_TYPE getmType() {
|
||||||
|
return mType;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,206 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */
|
||||||
|
public class VCFReaderUtils {
|
||||||
|
|
||||||
|
// our pattern matching for the genotype mFields
|
||||||
|
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF header, given an array of strings that all start with at least the # character. This function is
|
||||||
|
* package protected so that the VCFReaderUtils can access this function
|
||||||
|
*
|
||||||
|
* @param headerStrings a list of header strings
|
||||||
|
*
|
||||||
|
* @return a VCF Header created from the list of stinrgs
|
||||||
|
*/
|
||||||
|
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
|
||||||
|
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
|
||||||
|
Set<String> auxTags = new LinkedHashSet<String>();
|
||||||
|
// iterate over all the passed in strings
|
||||||
|
for ( String str : headerStrings ) {
|
||||||
|
if ( !str.startsWith("##") ) {
|
||||||
|
String[] strings = str.substring(1).split("\\t");
|
||||||
|
// the columns should be in order according to Richard Durbin
|
||||||
|
int arrayIndex = 0;
|
||||||
|
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||||
|
try {
|
||||||
|
if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
|
||||||
|
throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name.");
|
||||||
|
}
|
||||||
|
arrayIndex++;
|
||||||
|
}
|
||||||
|
while (arrayIndex < strings.length) {
|
||||||
|
if (!strings[arrayIndex].equals("FORMAT"))
|
||||||
|
auxTags.add(strings[arrayIndex]);
|
||||||
|
arrayIndex++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if ( str.startsWith("##INFO=") )
|
||||||
|
metaData.add(new VCFInfoHeaderLine(str.substring(7),version));
|
||||||
|
else if ( str.startsWith("##FILTER=") )
|
||||||
|
metaData.add(new VCFFilterHeaderLine(str.substring(9),version));
|
||||||
|
else if ( str.startsWith("##FORMAT=") )
|
||||||
|
metaData.add(new VCFFormatHeaderLine(str.substring(9),version));
|
||||||
|
else {
|
||||||
|
int equals = str.indexOf("=");
|
||||||
|
if ( equals != -1 )
|
||||||
|
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new VCFHeader(metaData, auxTags);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the next VCFRecord, given the input line
|
||||||
|
*
|
||||||
|
* @param line the line from the file
|
||||||
|
* @param mHeader the VCF header
|
||||||
|
*
|
||||||
|
* @return the VCFRecord
|
||||||
|
*/
|
||||||
|
public static VCFRecord createRecord(String line, VCFHeader mHeader) {
|
||||||
|
return createRecord(line, mHeader, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) {
|
||||||
|
// things we need to make a VCF record
|
||||||
|
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||||
|
String tokens[] = line.split("\\t");
|
||||||
|
|
||||||
|
// check to ensure that the column count of tokens is right
|
||||||
|
if (tokens.length != mHeader.getColumnCount()) {
|
||||||
|
throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line);
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields())
|
||||||
|
values.put(field, tokens[index++]);
|
||||||
|
// if we have genotyping data, we try and extract the genotype fields
|
||||||
|
if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) {
|
||||||
|
String mFormatString = tokens[index];
|
||||||
|
String keyStrings[] = mFormatString.split(":");
|
||||||
|
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||||
|
index++;
|
||||||
|
String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
|
||||||
|
for (String str : mHeader.getGenotypeSamples()) {
|
||||||
|
genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords);
|
||||||
|
// associate the genotypes with this new record
|
||||||
|
for ( VCFGenotypeRecord gr : genotypeRecords )
|
||||||
|
gr.setVCFRecord(vrec);
|
||||||
|
return vrec;
|
||||||
|
|
||||||
|
}
|
||||||
|
return new VCFRecord(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||||
|
*
|
||||||
|
* @param sampleName the sample name
|
||||||
|
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
||||||
|
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||||
|
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||||
|
* @param referenceBase the reference base
|
||||||
|
*
|
||||||
|
* @return a VCFGenotypeRecord
|
||||||
|
*/
|
||||||
|
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
||||||
|
return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||||
|
*
|
||||||
|
* @param sampleName the sample name
|
||||||
|
* @param keyStrings the split format string for this record, which contains the keys for the genotype parameters
|
||||||
|
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||||
|
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||||
|
* @param referenceBase the reference base
|
||||||
|
*
|
||||||
|
* @return a VCFGenotypeRecord
|
||||||
|
*/
|
||||||
|
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
|
||||||
|
// parameters to create the VCF genotype record
|
||||||
|
HashMap<String, String> tagToValue = new HashMap<String, String>();
|
||||||
|
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||||
|
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
|
||||||
|
|
||||||
|
for (String key : keyStrings) {
|
||||||
|
String parse;
|
||||||
|
int nextDivider;
|
||||||
|
if (!genotypeString.contains(":")) {
|
||||||
|
nextDivider = genotypeString.length();
|
||||||
|
parse = genotypeString;
|
||||||
|
} else {
|
||||||
|
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
|
||||||
|
parse = genotypeString.substring(0, nextDivider);
|
||||||
|
}
|
||||||
|
if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) {
|
||||||
|
Matcher m = gtPattern.matcher(parse);
|
||||||
|
if (!m.matches())
|
||||||
|
throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
|
||||||
|
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
||||||
|
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
||||||
|
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
||||||
|
} else {
|
||||||
|
if ( parse.length() == 0 )
|
||||||
|
parse = VCFGenotypeRecord.getMissingFieldValue(key);
|
||||||
|
tagToValue.put(key, parse);
|
||||||
|
}
|
||||||
|
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||||
|
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
|
||||||
|
}
|
||||||
|
if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) )
|
||||||
|
tagToValue.clear();
|
||||||
|
// catch some common errors, either there are too many field keys or there are two many field values
|
||||||
|
else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
|
||||||
|
throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected "
|
||||||
|
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
|
||||||
|
else if ( genotypeString.length() > 0 )
|
||||||
|
throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString
|
||||||
|
+ ". This most likely means that the format string is shorter then the value fields.");
|
||||||
|
|
||||||
|
VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase);
|
||||||
|
for ( Map.Entry<String, String> entry : tagToValue.entrySet() )
|
||||||
|
rec.setField(entry.getKey(), entry.getValue());
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add an alternate allele to the list of alleles we have for a VCF genotype record
|
||||||
|
*
|
||||||
|
* @param alleleNumber the allele number, as a string
|
||||||
|
* @param altAlleles the list of alternate alleles
|
||||||
|
* @param referenceBase the reference base
|
||||||
|
* @param bases the list of bases for this genotype call
|
||||||
|
*/
|
||||||
|
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
|
||||||
|
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
|
||||||
|
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE));
|
||||||
|
} else {
|
||||||
|
int alleleValue = Integer.valueOf(alleleNumber);
|
||||||
|
// check to make sure the allele value is within bounds
|
||||||
|
if (alleleValue < 0 || alleleValue > altAlleles.length)
|
||||||
|
throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
|
||||||
|
if (alleleValue == 0)
|
||||||
|
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
|
||||||
|
else
|
||||||
|
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,682 @@
|
||||||
|
package org.broad.tribble.vcf;
|
||||||
|
|
||||||
|
|
||||||
|
import org.broad.tribble.Feature;
|
||||||
|
import org.broad.tribble.util.ParsingUtils;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/** the basic VCF record type */
|
||||||
|
public class VCFRecord implements Feature {
|
||||||
|
|
||||||
|
// standard info field keys
|
||||||
|
public static final String ANCESTRAL_ALLELE_KEY = "AA";
|
||||||
|
public static final String ALLELE_COUNT_KEY = "AC";
|
||||||
|
public static final String ALLELE_FREQUENCY_KEY = "AF";
|
||||||
|
public static final String ALLELE_NUMBER_KEY = "AN";
|
||||||
|
public static final String RMS_BASE_QUALITY_KEY = "BQ";
|
||||||
|
public static final String DBSNP_KEY = "DB";
|
||||||
|
public static final String DEPTH_KEY = "DP";
|
||||||
|
public static final String HAPMAP2_KEY = "H2";
|
||||||
|
public static final String HAPMAP3_KEY = "H3";
|
||||||
|
public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
|
||||||
|
public static final String SAMPLE_NUMBER_KEY = "NS";
|
||||||
|
public static final String STRAND_BIAS_KEY = "SB";
|
||||||
|
|
||||||
|
// commonly used strings that are in the standard
|
||||||
|
public static final String FORMAT_FIELD_SEPERATOR = ":";
|
||||||
|
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
|
||||||
|
public static final String FIELD_SEPERATOR = "\t";
|
||||||
|
public static final String FILTER_CODE_SEPERATOR = ";";
|
||||||
|
public static final String INFO_FIELD_SEPERATOR = ";";
|
||||||
|
|
||||||
|
// default values
|
||||||
|
public static final String UNFILTERED = ".";
|
||||||
|
public static final String PASSES_FILTERS = "0";
|
||||||
|
public static final String EMPTY_INFO_FIELD = ".";
|
||||||
|
public static final String EMPTY_ID_FIELD = ".";
|
||||||
|
public static final String EMPTY_ALLELE_FIELD = ".";
|
||||||
|
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
|
||||||
|
public static final int MISSING_GENOTYPE_QUALITY = -1;
|
||||||
|
|
||||||
|
// the reference base
|
||||||
|
private String mReferenceBases;
|
||||||
|
// our location
|
||||||
|
private String mContig;
|
||||||
|
private int mPosition;
|
||||||
|
// our id
|
||||||
|
private String mID;
|
||||||
|
// the alternate bases
|
||||||
|
private final List<VCFGenotypeEncoding> mAlts = new ArrayList<VCFGenotypeEncoding>();
|
||||||
|
// our qual value
|
||||||
|
private double mQual;
|
||||||
|
// our filter string
|
||||||
|
private String mFilterString;
|
||||||
|
// our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests)
|
||||||
|
private final Map<String, String> mInfoFields = new TreeMap<String, String>();
|
||||||
|
|
||||||
|
// our genotype formatting string
|
||||||
|
private String mGenotypeFormatString;
|
||||||
|
|
||||||
|
// the vcf header we're associated with
|
||||||
|
private VCFHeader vcfHeader = null;
|
||||||
|
|
||||||
|
// our genotype sample fields
|
||||||
|
private final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* given a reference base, a location, and the format string, create a VCF record.
|
||||||
|
*
|
||||||
|
* @param referenceBases the reference bases to use
|
||||||
|
* @param contig our contig
|
||||||
|
* @param start the start location
|
||||||
|
* @param genotypeFormatString the format string
|
||||||
|
*/
|
||||||
|
public VCFRecord(String referenceBases, String contig, int start, String genotypeFormatString) {
|
||||||
|
setReferenceBase(referenceBases);
|
||||||
|
setLocation(contig, start);
|
||||||
|
mGenotypeFormatString = genotypeFormatString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* given the values for each of the columns, create a VCF record.
|
||||||
|
*
|
||||||
|
* @param columnValues a mapping of header strings to values
|
||||||
|
* @param genotypeFormatString the format string for the genotype records
|
||||||
|
* @param genotypeRecords the genotype records
|
||||||
|
*/
|
||||||
|
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues, String genotypeFormatString, List<VCFGenotypeRecord> genotypeRecords) {
|
||||||
|
extractFields(columnValues);
|
||||||
|
mGenotypeRecords.addAll(genotypeRecords);
|
||||||
|
mGenotypeFormatString = genotypeFormatString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* given the values for each of the columns, create a VCF record.
|
||||||
|
*
|
||||||
|
* @param columnValues a mapping of header strings to values
|
||||||
|
*/
|
||||||
|
public VCFRecord(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
|
||||||
|
extractFields(columnValues);
|
||||||
|
mGenotypeFormatString = "";
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a VCF record
|
||||||
|
*
|
||||||
|
* @param referenceBases the reference bases to use
|
||||||
|
* @param contig the contig this variant is on
|
||||||
|
* @param position our position
|
||||||
|
* @param ID our ID string
|
||||||
|
* @param altBases the list of alternate bases
|
||||||
|
* @param qual the qual field
|
||||||
|
* @param filters the filters used on this variant
|
||||||
|
* @param infoFields the information fields
|
||||||
|
* @param genotypeFormatString the format string
|
||||||
|
* @param genotypeObjects the genotype objects
|
||||||
|
*/
|
||||||
|
public VCFRecord(String referenceBases,
|
||||||
|
String contig,
|
||||||
|
long position,
|
||||||
|
String ID,
|
||||||
|
List<VCFGenotypeEncoding> altBases,
|
||||||
|
double qual,
|
||||||
|
String filters,
|
||||||
|
Map<String, String> infoFields,
|
||||||
|
String genotypeFormatString,
|
||||||
|
List<VCFGenotypeRecord> genotypeObjects) {
|
||||||
|
setReferenceBase(referenceBases);
|
||||||
|
setLocation(contig, position);
|
||||||
|
this.mID = ID;
|
||||||
|
for (VCFGenotypeEncoding alt : altBases)
|
||||||
|
this.addAlternateBase(alt);
|
||||||
|
this.setQual(qual);
|
||||||
|
this.setFilterString(filters);
|
||||||
|
this.mInfoFields.putAll(infoFields);
|
||||||
|
this.mGenotypeFormatString = genotypeFormatString;
|
||||||
|
this.mGenotypeRecords.addAll(genotypeObjects);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* extract the field values from the passed in array
|
||||||
|
*
|
||||||
|
* @param columnValues a map of the header fields to values
|
||||||
|
*/
|
||||||
|
private void extractFields(Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
|
||||||
|
String chrom = null;
|
||||||
|
long position = -1;
|
||||||
|
|
||||||
|
for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) {
|
||||||
|
switch (val) {
|
||||||
|
case CHROM:
|
||||||
|
chrom = columnValues.get(val);
|
||||||
|
break;
|
||||||
|
case POS:
|
||||||
|
position = Integer.valueOf(columnValues.get(val));
|
||||||
|
break;
|
||||||
|
case ID:
|
||||||
|
setID(columnValues.get(val));
|
||||||
|
break;
|
||||||
|
case REF:
|
||||||
|
if (columnValues.get(val).length() != 1)
|
||||||
|
throw new IllegalArgumentException("Reference base should be a single character");
|
||||||
|
setReferenceBase(columnValues.get(val));
|
||||||
|
break;
|
||||||
|
case ALT:
|
||||||
|
String values[] = columnValues.get(val).split(",");
|
||||||
|
for (String alt : values)
|
||||||
|
addAlternateBase(new VCFGenotypeEncoding(alt));
|
||||||
|
break;
|
||||||
|
case QUAL:
|
||||||
|
setQual(Double.valueOf(columnValues.get(val)));
|
||||||
|
break;
|
||||||
|
case FILTER:
|
||||||
|
setFilterString(columnValues.get(val));
|
||||||
|
break;
|
||||||
|
case INFO:
|
||||||
|
String vals[] = columnValues.get(val).split(";");
|
||||||
|
for (String alt : vals) {
|
||||||
|
if ( alt.equals(EMPTY_INFO_FIELD) )
|
||||||
|
continue;
|
||||||
|
String keyVal[] = alt.split("=");
|
||||||
|
if ( keyVal.length == 1 )
|
||||||
|
addInfoField(keyVal[0], "");
|
||||||
|
else if (keyVal.length == 2)
|
||||||
|
addInfoField(keyVal[0], keyVal[1]);
|
||||||
|
else
|
||||||
|
throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setLocation(chrom, position);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* do we have genotyping data
|
||||||
|
*
|
||||||
|
* @return true if we have genotyping data, false otherwise
|
||||||
|
*/
|
||||||
|
|
||||||
|
public boolean hasGenotypeData() {
|
||||||
|
return (mGenotypeRecords.size() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the ID value for this record
|
||||||
|
*/
|
||||||
|
public String getID() {
|
||||||
|
return mID == null ? EMPTY_ID_FIELD : mID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the reference base
|
||||||
|
*
|
||||||
|
* @return either A, T, C, G, or N
|
||||||
|
*/
|
||||||
|
public String getReference() {
|
||||||
|
return mReferenceBases;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the alternate allele strings
|
||||||
|
*
|
||||||
|
* @return an array of strings representing the alt alleles, or null if there are none
|
||||||
|
*/
|
||||||
|
public List<String> getAlternateAlleleList() {
|
||||||
|
ArrayList<String> alts = new ArrayList<String>();
|
||||||
|
for ( VCFGenotypeEncoding alt : mAlts )
|
||||||
|
alts.add(alt.getBases());
|
||||||
|
return alts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<VCFGenotypeEncoding> getAlternateAlleles() {
|
||||||
|
return mAlts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasAlternateAllele() {
|
||||||
|
for ( VCFGenotypeEncoding alt : mAlts ) {
|
||||||
|
if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED )
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isBiallelic() {
|
||||||
|
return getAlternateAlleles().size() == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isReference() {
|
||||||
|
return !hasAlternateAllele();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getAlleleList() {
|
||||||
|
ArrayList<String> list = new ArrayList<String>();
|
||||||
|
list.add(getReference());
|
||||||
|
list.addAll(getAlternateAlleleList());
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getNonRefAlleleFrequency() {
|
||||||
|
if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) {
|
||||||
|
return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY));
|
||||||
|
} else {
|
||||||
|
// this is the poor man's AF
|
||||||
|
if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) {
|
||||||
|
String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(",");
|
||||||
|
if ( splt.length > 0 ) {
|
||||||
|
return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFGenotypeEncoding.TYPE getType() {
|
||||||
|
VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType();
|
||||||
|
for (int i = 1; i < mAlts.size(); i++) {
|
||||||
|
if ( mAlts.get(i).getType() != type )
|
||||||
|
return VCFGenotypeEncoding.TYPE.MIXED; // if we have more than one type, return mixed
|
||||||
|
}
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isDeletion() {
|
||||||
|
return getType() == VCFGenotypeEncoding.TYPE.DELETION;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isInsertion() {
|
||||||
|
return getType() == VCFGenotypeEncoding.TYPE.INSERTION;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isIndel() {
|
||||||
|
return isDeletion() || isInsertion();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isSNP() {
|
||||||
|
return getType() == VCFGenotypeEncoding.TYPE.SINGLE_BASE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isNovel() {
|
||||||
|
return ( ! isInDBSNP() ) && ( ! isInHapmap() );
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isInDBSNP() {
|
||||||
|
return ( ( mID != null && ! mID.equals(".") ) || ( mInfoFields.get(DBSNP_KEY) != null && mInfoFields.get(DBSNP_KEY).equals("1") ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isInHapmap() {
|
||||||
|
if ( mInfoFields.get(HAPMAP2_KEY) != null && mInfoFields.get(HAPMAP2_KEY).equals("1") ) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return ( mInfoFields.get(HAPMAP3_KEY) != null && mInfoFields.get(HAPMAP3_KEY).equals("1") );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public char getAlternativeBaseForSNP() {
|
||||||
|
if ( !isSNP() && !isBiallelic() )
|
||||||
|
throw new IllegalStateException("This record does not represent a SNP");
|
||||||
|
return mAlts.get(0).getBases().charAt(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public char getReferenceForSNP() {
|
||||||
|
if ( !isSNP() )
|
||||||
|
throw new IllegalStateException("This record does not represent a SNP");
|
||||||
|
return getReference().charAt(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the phred-scaled quality score
|
||||||
|
*/
|
||||||
|
public double getQual() {
|
||||||
|
return mQual;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getPosition() {
|
||||||
|
return mPosition;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isMissingQual() {
|
||||||
|
return (int)mQual == MISSING_GENOTYPE_QUALITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the -log10PError
|
||||||
|
*/
|
||||||
|
public double getNegLog10PError() {
|
||||||
|
return mQual / 10.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the filter criteria
|
||||||
|
*
|
||||||
|
* @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied
|
||||||
|
*/
|
||||||
|
public String[] getFilteringCodes() {
|
||||||
|
if (mFilterString == null) return new String[]{UNFILTERED};
|
||||||
|
return mFilterString.split(FILTER_CODE_SEPERATOR);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isFiltered() {
|
||||||
|
String[] codes = getFilteringCodes();
|
||||||
|
return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// public boolean hasFilteringCodes() {
|
||||||
|
// return mFilterString != null;
|
||||||
|
// }
|
||||||
|
|
||||||
|
public String getFilterString() {
|
||||||
|
return mFilterString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the information key-value pairs as a Map<>
|
||||||
|
*
|
||||||
|
* @return a map, of the info key-value pairs
|
||||||
|
*/
|
||||||
|
public final Map<String, String> getInfoValues() {
|
||||||
|
return mInfoFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
|
||||||
|
return mGenotypeRecords;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return a List of the sample names
|
||||||
|
*/
|
||||||
|
public String[] getSampleNames() {
|
||||||
|
String names[] = new String[mGenotypeRecords.size()];
|
||||||
|
for (int i = 0; i < mGenotypeRecords.size(); i++) {
|
||||||
|
names[i] = mGenotypeRecords.get(i).getSampleName();
|
||||||
|
}
|
||||||
|
return names;
|
||||||
|
}
|
||||||
|
|
||||||
|
public VCFGenotypeRecord getGenotype(final String sampleName) {
|
||||||
|
for ( VCFGenotypeRecord rec : getVCFGenotypeRecords() ) {
|
||||||
|
if ( rec.getSampleName().equals(sampleName) ) {
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getGenotypeFormatString() {
|
||||||
|
return mGenotypeFormatString;
|
||||||
|
}// the formatting string for our genotype records
|
||||||
|
|
||||||
|
public void setGenotypeFormatString(String newFormatString) {
|
||||||
|
mGenotypeFormatString = newFormatString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setReferenceBase(String reference) {
|
||||||
|
mReferenceBases = reference.toUpperCase();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLocation(String chrom, long position) {
|
||||||
|
if ( chrom == null )
|
||||||
|
throw new IllegalArgumentException("Chromosomes cannot be missing");
|
||||||
|
if ( position < 0 )
|
||||||
|
throw new IllegalArgumentException("Position values must be greater than 0");
|
||||||
|
this.mContig = chrom;
|
||||||
|
this.mPosition = (int)position;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setID(String ID) {
|
||||||
|
mID = ID;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setQual(double qual) {
|
||||||
|
if ( qual < 0 && (int)qual != MISSING_GENOTYPE_QUALITY )
|
||||||
|
throw new IllegalArgumentException("Qual values cannot be negative unless they are " + MISSING_GENOTYPE_QUALITY + " ('unknown')");
|
||||||
|
mQual = qual;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFilterString(String filterString) {
|
||||||
|
mFilterString = filterString;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) {
|
||||||
|
mGenotypeRecords.add(mGenotypeRecord);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setGenotypeRecords(List<VCFGenotypeRecord> records) {
|
||||||
|
mGenotypeRecords.clear();
|
||||||
|
for ( VCFGenotypeRecord g : records )
|
||||||
|
addGenotypeRecord(g);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add an alternate base to our alternate base list. All bases are uppercased
|
||||||
|
* before being added to the list.
|
||||||
|
*
|
||||||
|
* @param base the base to add
|
||||||
|
*/
|
||||||
|
public void addAlternateBase(VCFGenotypeEncoding base) {
|
||||||
|
if (!mAlts.contains(base)) mAlts.add(base);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setAlternateBases(List<VCFGenotypeEncoding> bases) {
|
||||||
|
mAlts.clear();
|
||||||
|
for ( VCFGenotypeEncoding e : bases )
|
||||||
|
addAlternateBase(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add an info field to the record
|
||||||
|
*
|
||||||
|
* @param key the key, from the spec or a user created key
|
||||||
|
* @param value it's value as a string
|
||||||
|
*/
|
||||||
|
public void addInfoField(String key, String value) {
|
||||||
|
//System.out.printf("Adding info field %s=%s%n", key, value);
|
||||||
|
mInfoFields.put(key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void printInfoFields() {
|
||||||
|
for ( Map.Entry<String, String> e : mInfoFields.entrySet() ) {
|
||||||
|
System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add an info field to the record
|
||||||
|
*
|
||||||
|
* @param m A map from info keys to info values
|
||||||
|
*/
|
||||||
|
public void addInfoFields(Map<String,String> m) {
|
||||||
|
for ( Map.Entry<String, String> e : m.entrySet() )
|
||||||
|
addInfoField(e.getKey(), e.getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the generation of a string representation, which is used by the VCF writer
|
||||||
|
*
|
||||||
|
* @param header the VCF header for this VCF Record
|
||||||
|
* @return a string
|
||||||
|
*/
|
||||||
|
public String toStringEncoding(VCFHeader header) {
|
||||||
|
StringBuilder builder = new StringBuilder();
|
||||||
|
|
||||||
|
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
|
||||||
|
builder.append(mContig);
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
builder.append(mPosition);
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
builder.append(getID());
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
builder.append(getReference());
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
List<VCFGenotypeEncoding> alts = getAlternateAlleles();
|
||||||
|
if ( alts.size() > 0 ) {
|
||||||
|
builder.append(alts.get(0));
|
||||||
|
for ( int i = 1; i < alts.size(); i++ ) {
|
||||||
|
builder.append(",");
|
||||||
|
builder.append(alts.get(i));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.append(EMPTY_ALLELE_FIELD);
|
||||||
|
}
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
if ( (int)mQual == MISSING_GENOTYPE_QUALITY )
|
||||||
|
builder.append(MISSING_GENOTYPE_QUALITY);
|
||||||
|
else
|
||||||
|
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
builder.append(ParsingUtils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
|
||||||
|
builder.append(FIELD_SEPERATOR);
|
||||||
|
builder.append(createInfoString());
|
||||||
|
|
||||||
|
if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) {
|
||||||
|
// try {
|
||||||
|
addGenotypeData(builder, header);
|
||||||
|
// } catch (Exception e) {
|
||||||
|
// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT )
|
||||||
|
// throw new RuntimeException(e);
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create the info string
|
||||||
|
*
|
||||||
|
* @return a string representing the infomation fields
|
||||||
|
*/
|
||||||
|
protected String createInfoString() {
|
||||||
|
StringBuffer info = new StringBuffer();
|
||||||
|
boolean isFirst = true;
|
||||||
|
for (Map.Entry<String, String> entry : mInfoFields.entrySet()) {
|
||||||
|
if ( isFirst )
|
||||||
|
isFirst = false;
|
||||||
|
else
|
||||||
|
info.append(INFO_FIELD_SEPERATOR);
|
||||||
|
info.append(entry.getKey());
|
||||||
|
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
|
||||||
|
info.append("=");
|
||||||
|
info.append(entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* add the genotype data
|
||||||
|
*
|
||||||
|
* @param builder the string builder
|
||||||
|
* @param header the header object
|
||||||
|
*/
|
||||||
|
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
|
||||||
|
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getVCFGenotypeRecords());
|
||||||
|
|
||||||
|
StringBuffer tempStr = new StringBuffer();
|
||||||
|
if ( header.getGenotypeSamples().size() < getVCFGenotypeRecords().size() ) {
|
||||||
|
for ( String sample : gMap.keySet() ) {
|
||||||
|
if ( !header.getGenotypeSamples().contains(sample) )
|
||||||
|
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
|
||||||
|
else
|
||||||
|
header.getGenotypeSamples().remove(sample);
|
||||||
|
}
|
||||||
|
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
|
||||||
|
}
|
||||||
|
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
|
||||||
|
|
||||||
|
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
|
||||||
|
|
||||||
|
for ( String genotype : header.getGenotypeSamples() ) {
|
||||||
|
tempStr.append(FIELD_SEPERATOR);
|
||||||
|
if ( gMap.containsKey(genotype) ) {
|
||||||
|
VCFGenotypeRecord rec = gMap.get(genotype);
|
||||||
|
tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings));
|
||||||
|
gMap.remove(genotype);
|
||||||
|
} else {
|
||||||
|
tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( gMap.size() != 0 ) {
|
||||||
|
for ( String sample : gMap.keySet() )
|
||||||
|
System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
|
||||||
|
throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.append(tempStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* compare two VCF records
|
||||||
|
*
|
||||||
|
* @param other the other VCF record
|
||||||
|
* @return true if they're equal
|
||||||
|
*/
|
||||||
|
public boolean equals(VCFRecord other) {
|
||||||
|
if (!this.mAlts.equals(other.mAlts)) return false;
|
||||||
|
if (!this.mReferenceBases.equals(other.mReferenceBases)) return false;
|
||||||
|
if (!this.mContig.equals(other.mContig)) return false;
|
||||||
|
if (mPosition != other.mPosition) return false;
|
||||||
|
if (!this.mID.equals(other.mID)) return false;
|
||||||
|
if (this.mQual != other.mQual) return false;
|
||||||
|
if ( this.mFilterString == null ) {
|
||||||
|
if ( other.mFilterString != null ) return false;
|
||||||
|
} else if ( !this.mFilterString.equals(other.mFilterString) ) return false;
|
||||||
|
if (!this.mInfoFields.equals(other.mInfoFields)) return false;
|
||||||
|
if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a genotype mapping from a list and their sample names
|
||||||
|
*
|
||||||
|
* @param list a list of genotype samples
|
||||||
|
* @return a mapping of the sample name to VCF genotype record
|
||||||
|
*/
|
||||||
|
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
|
||||||
|
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
|
||||||
|
for (int i = 0; i < list.size(); i++) {
|
||||||
|
VCFGenotypeRecord rec = list.get(i);
|
||||||
|
map.put(rec.getSampleName(), rec);
|
||||||
|
}
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the features reference sequence name, e.g chromosome or contig */
|
||||||
|
public String getChr() {
|
||||||
|
return this.mContig;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return the start position in 1-based coordinates (first base is 1) */
|
||||||
|
public int getStart() {
|
||||||
|
return this.mPosition;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the end position following 1-based fully closed conventions. The length of a feature is
|
||||||
|
* end - start + 1;
|
||||||
|
*/
|
||||||
|
public int getEnd() {
|
||||||
|
return this.mPosition;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* set the VCF header we're associated with
|
||||||
|
* @param header the header
|
||||||
|
*/
|
||||||
|
void setHeader(VCFHeader header) {
|
||||||
|
vcfHeader = header;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the associated header
|
||||||
|
* @return the VCF Header
|
||||||
|
*/
|
||||||
|
public VCFHeader getHeader() {
|
||||||
|
return vcfHeader;
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue