diff --git a/java/src/org/broad/tribble/vcf/VCFCodec.java b/java/src/org/broad/tribble/vcf/VCFCodec.java new file mode 100644 index 000000000..cb400fff3 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFCodec.java @@ -0,0 +1,125 @@ +package org.broad.tribble.vcf; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.util.LineReader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + + +/** + * + * @author aaron + * + * Class VCFCodec + * + * The codec for VCF, which relies on VCFReaderUtils to do most of the processing + */ +public class VCFCodec implements FeatureCodec { + + // we have to store the list of strings that make up the header until they're needed + private List headerStrings = new ArrayList(); + private VCFHeader header = null; + private VCFHeaderVersion version = VCFHeaderVersion.VCF3_3; + + + // some classes need to transform the line before + private LineTransform transformer = null; + + /** + * Fast path to get the location of the Feature for indexing + * @param line the input line to decode + * @return + */ + public Feature decodeLoc(String line) { + return reallyDecode(line, true); + } + + /** + * Decode a line as a Feature. + * + * @param line + * + * @return Return the Feature encoded by the line, or null if the line does not represent a feature (e.g. is + * a comment) + */ + public Feature decode(String line) { + return reallyDecode(line, false); + } + + private Feature reallyDecode(String line, boolean justLocationPlease ) { + // transform the line, if we have a transform to do + if (transformer != null) line = transformer.lineTransform(line); + if (line.startsWith("#")) + return null; + + // make a VCFRecord of the line and return it + VCFRecord rec = VCFReaderUtils.createRecord(line, header, justLocationPlease); + if ( ! justLocationPlease ) rec.setHeader(header); + return rec; + } + + /** + * Return the # of header lines for this file. We use this to parse out the header + * + * @return 0 + */ + public int readHeader(LineReader reader) { + String line = ""; + try { + while ((line = reader.readLine()) != null) { + if (line.startsWith("##")) { + headerStrings.add(line); + } + else if (line.startsWith("#")) { + headerStrings.add(line); + header = VCFReaderUtils.createHeader(headerStrings,version); + return headerStrings.size(); + } + else { + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + } + } catch (IOException e) { + throw new RuntimeException("IO Exception ", e); + } + throw new CodecLineParsingException("We never saw the required header line (starting with one #) for the input VCF file"); + } + + /** + * @return VCFRecord.class + */ + public Class getFeatureType() { + return VCFRecord.class; + } + + public VCFHeader getHeader(Class clazz) throws ClassCastException { + if (!clazz.equals(VCFHeader.class)) + throw new ClassCastException("Unable to cast to expected type " + clazz + " from type " + VCFHeader.class); + return header; + } + + public static interface LineTransform { + public String lineTransform(String line); + } + + public LineTransform getTransformer() { + return transformer; + } + + public void setTransformer(LineTransform transformer) { + this.transformer = transformer; + } + + public VCFHeaderVersion getVersion() { + return version; + } + + public void setVersion(VCFHeaderVersion version) { + this.version = version; + } +} diff --git a/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java new file mode 100755 index 000000000..afd988878 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFFilterHeaderLine.java @@ -0,0 +1,72 @@ +package org.broad.tribble.vcf; + +import org.broad.tribble.util.ParsingUtils; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + * A class representing a key=value entry for FILTER fields in the VCF header + */ +public class VCFFilterHeaderLine extends VCFHeaderLine { + + private String mName; + private String mDescription; + + + /** + * create a VCF filter header line + * + * @param name the name for this header line + * @param description the description for this header line + */ + public VCFFilterHeaderLine(String name, String description) { + super("FILTER", ""); + mName = name; + mDescription = description; + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + */ + protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) { + super("FILTER", "", version); + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description")); + mName = mapping.get("ID"); + mDescription = mapping.get("Description"); + } + + protected String makeStringRep() { + if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2) + return String.format("FILTER=%s,\"%s\"", mName, mDescription); + else if (mVersion == VCFHeaderVersion.VCF4_0) { + Map map = new LinkedHashMap(); + map.put("ID",mName); + map.put("Description",mDescription); + return "FILTER=" + VCFHeaderLineTranslator.toValue(this.mVersion,map); + } + else throw new RuntimeException("Unsupported VCFVersion " + mVersion); + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFFilterHeaderLine) ) + return false; + VCFFilterHeaderLine other = (VCFFilterHeaderLine)o; + return mName.equals(other.mName) && + mDescription.equals(other.mDescription); + } + + public String getmName() { + return mName; + } + + public String getmDescription() { + return mDescription; + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java new file mode 100755 index 000000000..0997f4422 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFFormatHeaderLine.java @@ -0,0 +1,118 @@ +package org.broad.tribble.vcf; + +import org.broad.tribble.util.ParsingUtils; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + *

+ * Class VCFFormatHeaderLine + *

+ * A class representing a key=value entry for genotype FORMAT fields in the VCF header + */ +public class VCFFormatHeaderLine extends VCFHeaderLine { + + // the format field types + public enum FORMAT_TYPE { + Integer, Float, String; + public Object convert(String value) { + switch (this) { + case Integer: + return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name + case Float: + return java.lang.Float.valueOf(value); + case String: + return value; + default: + throw new IllegalStateException("field." + this + " doesn't have a set conversion approach"); + } + } + } + + private String mName; + private int mCount; + private String mDescription; + private FORMAT_TYPE mType; + + + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + public VCFFormatHeaderLine(String name, int count, FORMAT_TYPE type, String description) { + super("FORMAT", ""); + mName = name; + mCount = count; + mType = type; + mDescription = description; + } + + /** + * create a VCF format header line + * + * @param line the header line + * @param version the VCF header version + * + */ + protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) { + super("FORMAT", "", version); + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); + mName = mapping.get("ID"); + mCount = Integer.valueOf(mapping.get("Number")); + mType = FORMAT_TYPE.valueOf(mapping.get("Type")); + mDescription = mapping.get("Description"); + } + + protected String makeStringRep() { + if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2) + return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription); + else if (mVersion == VCFHeaderVersion.VCF4_0) { + Map map = new LinkedHashMap(); + map.put("ID",mName); + map.put("Number",mCount); + map.put("Type",mType); + map.put("Description",mDescription); + return "FORMAT=" + VCFHeaderLineTranslator.toValue(this.mVersion,map); + } + else throw new RuntimeException("Unsupported VCFVersion " + mVersion); + } + + public String getName() { return mName; } + public int getCount() { return mCount; } + public String getDescription() { return mDescription; } + public FORMAT_TYPE getType() { return mType; } + + public boolean equals(Object o) { + if ( !(o instanceof VCFFormatHeaderLine) ) + return false; + VCFFormatHeaderLine other = (VCFFormatHeaderLine)o; + return mName.equals(other.mName) && + mCount == other.mCount && + mDescription.equals(other.mDescription) && + mType == other.mType; + } + + public String getmName() { + return mName; + } + + public int getmCount() { + return mCount; + } + + public String getmDescription() { + return mDescription; + } + + public FORMAT_TYPE getmType() { + return mType; + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java new file mode 100644 index 000000000..520da467f --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFGenotypeEncoding.java @@ -0,0 +1,128 @@ +package org.broad.tribble.vcf; + + + +/** + * @author aaron + *

+ * Class VCFGenotypeEncoding + *

+ * basic encoding class for genotype fields in VCF + */ +public class VCFGenotypeEncoding { + public enum TYPE { + SINGLE_BASE, + INSERTION, + DELETION, + UNCALLED, + MIXED // this type is only valid in aggregate, not for a single VCFGenotypeEncoding + } + + // our length (0 for SINGLE_BASE), our bases, and our type + private final int mLength; + private final String mBases; + private final TYPE mType; + + // public constructor, that parses out the base string + public VCFGenotypeEncoding(String baseString) { + if ((baseString.length() == 1)) { + // are we an empty (no-call) genotype? + if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) { + mBases = VCFGenotypeRecord.EMPTY_ALLELE; + mLength = 0; + mType = TYPE.UNCALLED; + } else if (!validBases(baseString)) { + throw new IllegalArgumentException("Alleles of length 1 must be one of A,C,G,T, " + baseString + " was passed in"); + } else { // we're a valid base + mBases = baseString.toUpperCase(); + mLength = 0; + mType = TYPE.SINGLE_BASE; + } + } else { // deletion or insertion + if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) { + throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)"); + } + if (baseString.toUpperCase().charAt(0) == 'D') { + mLength = Integer.valueOf(baseString.substring(1, baseString.length())); + mBases = ""; + mType = TYPE.DELETION; + } else { // we're an I + mBases = baseString.substring(1, baseString.length()).toUpperCase(); + if (!validBases(mBases)) + throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString); + mLength = mBases.length(); + mType = TYPE.INSERTION; + } + } + } + + public int getLength() { + return mLength; + } + + public String getBases() { + return mBases; + } + + public TYPE getType() { + return mType; + } + + public boolean equals(Object obj) { + if ( obj == null ) + return false; + if ( obj instanceof VCFGenotypeEncoding ) { + VCFGenotypeEncoding d = (VCFGenotypeEncoding) obj; + return (mType == d.mType) && (mBases.equals(d.mBases)) && (mLength == d.mLength); + } + if ( mType == TYPE.UNCALLED && obj.toString().equals(VCFGenotypeRecord.EMPTY_ALLELE) ) + return true; + return false; + } + + public int hashCode() { + // our underlying data is immutable, so this is safe (we won't strand a value in a hashtable somewhere + // when the data changes underneath, altering this value). + String str = this.mBases + String.valueOf(this.mLength) + this.mType.toString(); + return str.hashCode(); + } + + /** + * dump the string representation of this genotype encoding + * + * @return string representation + */ + public String toString() { + StringBuilder builder = new StringBuilder(); + switch (mType) { + case SINGLE_BASE: + case UNCALLED: + builder.append(mBases); + break; + case INSERTION: + builder.append("I"); + builder.append(mBases); + break; + case DELETION: + builder.append("D"); + builder.append(mLength); + break; + } + return builder.toString(); + } + + /** + * ensure that string contains valid bases + * + * @param bases the bases to check + * + * @return true if they're all either A,C,G,T; false otherwise + */ + private static boolean validBases(String bases) { + for (char c : bases.toUpperCase().toCharArray()) { + if (c != 'A' && c != 'C' && c != 'G' && c != 'T') + return false; + } + return true; + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFGenotypeRecord.java b/java/src/org/broad/tribble/vcf/VCFGenotypeRecord.java new file mode 100644 index 000000000..a39a40ae1 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFGenotypeRecord.java @@ -0,0 +1,353 @@ +package org.broad.tribble.vcf; + +import java.util.*; + + +/** + * + * @author aaron + * + * Class VCFGenotypeRecord + * + * A descriptions should go here. Blame aaron if it's missing. + */ +public class VCFGenotypeRecord { + + // key names + public static final String GENOTYPE_KEY = "GT"; + public static final String GENOTYPE_QUALITY_KEY = "GQ"; + public static final String DEPTH_KEY = "DP"; + public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; + public static final String GENOTYPE_FILTER_KEY = "FT"; + public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; + public static final String OLD_DEPTH_KEY = "RD"; + + // the values for empty fields + public static final String EMPTY_GENOTYPE = "./."; + public static final String EMPTY_ALLELE = "."; + public static final int MISSING_GENOTYPE_QUALITY = -1; + public static final int MISSING_DEPTH = -1; + public static final int MISSING_HAPLOTYPE_QUALITY = -1; + public static final String PASSES_FILTERS = "0"; + public static final String UNFILTERED = "."; + + public static final double MAX_QUAL_VALUE = 99.0; + + // what kind of phasing this genotype has + public enum PHASE { + UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN + } + + // our record + private VCFRecord mRecord; + + // our phasing + private PHASE mPhaseType; + + // our bases(s) + private final List mGenotypeAlleles = new ArrayList(); + + // our mapping of the format mFields to values + private final Map mFields = new HashMap(); + + // our sample name + private String mSampleName; + + /** + * Create a VCF genotype record + * + * @param sampleName sample name + * @param genotypes list of genotypes + * @param phasing phasing + */ + public VCFGenotypeRecord(String sampleName, List genotypes, PHASE phasing) { + mSampleName = sampleName; + if (genotypes != null) + this.mGenotypeAlleles.addAll(genotypes); + mPhaseType = phasing; + } + + public void setVCFRecord(VCFRecord record) { + mRecord = record; + } + + public void setSampleName(String name) { + mSampleName = name; + } + + /** + * Adds a field to the genotype record. + * Throws an exception if the key is GT, as that's computed internally. + * + * @param key the field name (use static variables above for common fields) + * @param value the field value + */ + public void setField(String key, String value) { + // make sure the GT field isn't being set + if ( key.equals(GENOTYPE_KEY) ) + throw new IllegalArgumentException("Setting the GT field is not allowed as that's done internally"); + mFields.put(key, value); + } + + /** + * determine the phase of the genotype + * + * @param phase the string that contains the phase character + * + * @return the phase + */ + static PHASE determinePhase(String phase) { + // find the phasing information + if (phase.equals("/")) + return PHASE.UNPHASED; + else if (phase.equals("|")) + return PHASE.PHASED; + else if (phase.equals("\\")) + return PHASE.PHASED_SWITCH_PROB; + else + throw new IllegalArgumentException("Unknown genotype phasing parameter"); + } + + + public PHASE getPhaseType() { + return mPhaseType; + } + + public String getSampleName() { + return mSampleName; + } + + public List getAlleles() { + return mGenotypeAlleles; + } + + public Map getFields() { + return mFields; + } + + /** + * @return the phred-scaled quality score + */ + public double getQual() { + return ( mFields.containsKey(GENOTYPE_QUALITY_KEY) ? Double.valueOf(mFields.get(GENOTYPE_QUALITY_KEY)) : MISSING_GENOTYPE_QUALITY); + } + + public boolean isMissingQual() { + return (int)getQual() == MISSING_GENOTYPE_QUALITY; + } + + public double getNegLog10PError() { + double qual = getQual(); + return (qual == MISSING_GENOTYPE_QUALITY ? MISSING_GENOTYPE_QUALITY : qual / 10.0); + } + + public int getReadCount() { + return ( mFields.containsKey(DEPTH_KEY) ? Integer.valueOf(mFields.get(DEPTH_KEY)) : MISSING_DEPTH); + } + + public String getLocation() { + return mRecord != null ? mRecord.getChr() + ":" + mRecord.getPosition() : null; + } + + public String getReference() { + return mRecord != null ? mRecord.getReference() : "N"; + } + + public String getBases() { + String genotype = ""; + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) + genotype += encoding.getBases(); + return genotype; + } + + public boolean isVariant(char ref) { + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) { + if ( encoding.getType() == VCFGenotypeEncoding.TYPE.UNCALLED ) + continue; + if ( encoding.getType() != VCFGenotypeEncoding.TYPE.SINGLE_BASE || + encoding.getBases().charAt(0) != ref ) + return true; + } + return false; + } + + public boolean isPointGenotype() { + return (mRecord != null ? !mRecord.isIndel() : true); + } + + public boolean isHom() { + if ( mGenotypeAlleles.size() == 0 ) + return true; + + String bases = mGenotypeAlleles.get(0).getBases(); + for ( int i = 1; i < mGenotypeAlleles.size(); i++ ) { + if ( !bases.equals(mGenotypeAlleles.get(1).getBases()) ) + return false; + } + return true; + } + + public boolean isHet() { + return !isHom(); + } + + public boolean isNoCall() { + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) { + if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED ) + return false; + } + return true; + } + + public boolean isFiltered() { + return ( mFields.get(GENOTYPE_FILTER_KEY) != null && + !mFields.get(GENOTYPE_FILTER_KEY).equals(UNFILTERED) && + !mFields.get(GENOTYPE_FILTER_KEY).equals(PASSES_FILTERS)); + } + + public int getPloidy() { + return 2; + } + + public VCFRecord getRecord() { + return mRecord; + } + + private String toGenotypeString(List altAlleles) { + String str = ""; + boolean first = true; + for (VCFGenotypeEncoding allele : mGenotypeAlleles) { + if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED) + str += VCFGenotypeRecord.EMPTY_ALLELE; + else + str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0); + if (first) { + switch (mPhaseType) { + case UNPHASED: + str += "/"; + break; + case PHASED: + str += "|"; + break; + case PHASED_SWITCH_PROB: + str += "\\"; + break; + case UNKNOWN: + throw new UnsupportedOperationException("Unknown phase type"); + } + first = false; + } + } + return str; + + } + + @Override + public String toString() { + return String.format("[VCFGenotype %s %s %s %s]", getLocation(), mSampleName, this.mGenotypeAlleles, mFields); + } + + public boolean isEmptyGenotype() { + for ( VCFGenotypeEncoding encoding : mGenotypeAlleles ) { + if ( encoding.getType() != VCFGenotypeEncoding.TYPE.UNCALLED ) + return false; + } + return true; + } + + public boolean equals(Object other) { + if (other instanceof VCFGenotypeRecord) { + if (((VCFGenotypeRecord) other).mPhaseType != this.mPhaseType) return false; + if (!((VCFGenotypeRecord) other).mGenotypeAlleles.equals(this.mGenotypeAlleles)) return false; + if (!((VCFGenotypeRecord) other).mFields.equals(mFields)) return false; + if (!((VCFGenotypeRecord) other).mSampleName.equals(this.mSampleName)) return false; + return true; + } + return false; + } + + /** + * output a string representation of the VCFGenotypeRecord, given the alternate alleles + * + * @param altAlleles the alternate alleles, needed for toGenotypeString() + * @param genotypeFormatStrings genotype format strings + * + * @return a string + */ + public String toStringEncoding(List altAlleles, String[] genotypeFormatStrings) { + StringBuilder builder = new StringBuilder(); + builder.append(toGenotypeString(altAlleles)); + + for ( String field : genotypeFormatStrings ) { + if ( field.equals(GENOTYPE_KEY) ) + continue; + + String value = mFields.get(field); + if ( value == null && field.equals(OLD_DEPTH_KEY) ) + value = mFields.get(DEPTH_KEY); + + builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); + if ( value == null || value.equals("") ) + builder.append(getMissingFieldValue(field)); + else + builder.append(value); + } + + return builder.toString(); + } + + /** + * output a string representation of an empty genotype + * + * @param genotypeFormatStrings genotype format strings + * + * @return a string + */ + public static String stringEncodingForEmptyGenotype(String[] genotypeFormatStrings) { + StringBuilder builder = new StringBuilder(); + builder.append(EMPTY_GENOTYPE); + + for ( String field : genotypeFormatStrings ) { + if ( field.equals(GENOTYPE_KEY) ) + continue; + + builder.append(VCFRecord.GENOTYPE_FIELD_SEPERATOR); + builder.append(getMissingFieldValue(field)); + } + + return builder.toString(); + } + + public static String getMissingFieldValue(String field) { + String result = ""; + if ( field.equals(GENOTYPE_QUALITY_KEY) ) + result = String.valueOf(MISSING_GENOTYPE_QUALITY); + else if ( field.equals(DEPTH_KEY) || field.equals(OLD_DEPTH_KEY) ) + result = String.valueOf(MISSING_DEPTH); + else if ( field.equals(GENOTYPE_FILTER_KEY) ) + result = UNFILTERED; + else if ( field.equals(GENOTYPE_LIKELIHOODS_KEY) ) + result = "0,0,0"; + // TODO -- support haplotype quality + //else if ( field.equals(HAPLOTYPE_QUALITY_KEY) ) + // result = String.valueOf(MISSING_HAPLOTYPE_QUALITY); + return result; + } + + public static Set getSupportedHeaderStrings(VCFHeaderVersion version) { + Set result = new HashSet(); + result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.String, "Genotype")); + result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Genotype Quality")); + result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.FORMAT_TYPE.Integer, "Read Depth (only filtered reads used for calling)")); + result.add(new VCFFormatHeaderLine(GENOTYPE_LIKELIHOODS_KEY, 3, VCFFormatHeaderLine.FORMAT_TYPE.Float, "Log-scaled likelihoods for AA,AB,BB genotypes where A=ref and B=alt; not applicable if site is not biallelic")); + //result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality")); + return result; + } + + public void replaceFields(HashMap newFields) { + mFields.clear(); + for ( String s : newFields.keySet() ) { + mFields.put(s,newFields.get(s)); + } + } +} diff --git a/java/src/org/broad/tribble/vcf/VCFHeader.java b/java/src/org/broad/tribble/vcf/VCFHeader.java new file mode 100644 index 000000000..c462a7a43 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFHeader.java @@ -0,0 +1,137 @@ +package org.broad.tribble.vcf; + + +import java.util.*; + + +/** + * @author aaron + *

+ * Class VCFHeader + *

+ * A class representing the VCF header + */ +public class VCFHeader { + + // the manditory header fields + public enum HEADER_FIELDS { + CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO + } + + // the associated meta data + private final Set mMetaData; + + // the list of auxillary tags + private final Set mGenotypeSampleNames = new LinkedHashSet(); + + // the character string that indicates meta data + public static final String METADATA_INDICATOR = "##"; + + // the header string indicator + public static final String HEADER_INDICATOR = "#"; + + // our header versionVCF + private VCFHeaderVersion versionVCF; + + /** do we have genotying data? */ + private boolean hasGenotypingData = false; + + /** + * create a VCF header, given a list of meta data and auxillary tags + * + * @param metaData the meta data associated with this header + */ + public VCFHeader(Set metaData) { + mMetaData = new TreeSet(metaData); + checkVCFVersion(); + } + + /** + * create a VCF header, given a list of meta data and auxillary tags + * + * @param metaData the meta data associated with this header + * @param genotypeSampleNames the genotype format field, and the sample names + */ + public VCFHeader(Set metaData, Set genotypeSampleNames) { + mMetaData = new TreeSet(metaData); + for (String col : genotypeSampleNames) { + if (!col.equals("FORMAT")) + mGenotypeSampleNames.add(col); + } + if (genotypeSampleNames.size() > 0) hasGenotypingData = true; + checkVCFVersion(); + } + + /** + * check our metadata for a VCF versionVCF tag, and throw an exception if the versionVCF is out of date + * or the versionVCF is not present + */ + // TODO: fix this function + public void checkVCFVersion() { + VCFHeaderVersion version; + List toRemove = new ArrayList(); + for ( VCFHeaderLine line : mMetaData ) + if ( VCFHeaderVersion.isFormatString(line.getKey())) { + version = VCFHeaderVersion.toHeaderVersion(line.getValue(),line.getKey()); + if (version == null) + { + toRemove.add(line); + } + /**throw new RuntimeException("VCF version " + line.getValue() + + " is not supported; only versionVCF " + VCFHeaderVersion.VCF3_2 + " and greater can be used");*/ + else return; + } + // remove old header lines for now, + mMetaData.removeAll(toRemove); + mMetaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF3_3.getFormatString(), VCFHeaderVersion.VCF3_3.getVersionString())); + + } + + /** + * get the header fields in order they're presented in the input file (which is now required to be + * the order presented in the spec). + * + * @return a set of the header fields, in order + */ + public Set getHeaderFields() { + Set fields = new LinkedHashSet(); + for (HEADER_FIELDS field : HEADER_FIELDS.values()) + fields.add(field); + return fields; + } + + /** + * get the meta data, associated with this header + * + * @return a set of the meta data + */ + public Set getMetaData() { + return mMetaData; + } + + /** + * get the genotyping sample names + * + * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false + */ + public Set getGenotypeSamples() { + return mGenotypeSampleNames; + } + + /** + * do we have genotyping data? + * + * @return true if we have genotyping columns, false otherwise + */ + public boolean hasGenotypingData() { + return hasGenotypingData; + } + + /** @return the column count, */ + public int getColumnCount() { + return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); + } +} + + + diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java new file mode 100644 index 000000000..ccb8d2095 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLine.java @@ -0,0 +1,100 @@ +package org.broad.tribble.vcf; + + + +/** + * @author ebanks + *

+ * Class VCFHeaderLine + *

+ * A class representing a key=value entry in the VCF header + */ +public class VCFHeaderLine implements Comparable { + + private String stringRep = null; + private String mKey = null; + private String mValue = null; + protected VCFHeaderVersion mVersion = null; + + /** + * create a VCF header line + * + * @param key the key for this header line + * @param value the value for this header line + */ + public VCFHeaderLine(String key, String value, VCFHeaderVersion version) { + mKey = key; + mValue = value; + mVersion = version; + } + + /** + * create a VCF header line + * + * @param key the key for this header line + * @param value the value for this header line + */ + public VCFHeaderLine(String key, String value) { + mKey = key; + mValue = value; + mVersion = VCFHeaderVersion.VCF3_3; + } + + /** + * Get the key + * + * @return the key + */ + public String getKey() { + return mKey; + } + + /** + * Set the key + * + * @param key the key for this header line + */ + public void setKey(String key) { + mKey = key; + stringRep = null; + } + + /** + * Get the value + * + * @return the value + */ + public String getValue() { + return mValue; + } + + /** + * Set the value + * + * @param value the value for this header line + */ + public void setValue(String value) { + mValue = value; + stringRep = null; + } + + public String toString() { + if ( stringRep == null ) + stringRep = makeStringRep(); + return stringRep; + } + + protected String makeStringRep() { + return mKey + "=" + mValue; + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFHeaderLine) ) + return false; + return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue()); + } + + public int compareTo(Object other) { + return toString().compareTo(other.toString()); + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java b/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java new file mode 100644 index 000000000..84b5ca1c9 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFHeaderLineTranslator.java @@ -0,0 +1,161 @@ +package org.broad.tribble.vcf; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: aaron + * Date: Jun 17, 2010 + * Time: 12:28:46 PM + * To change this template use File | Settings | File Templates. + */ +public class VCFHeaderLineTranslator { + private static Map mapping; + + static { + mapping = new HashMap(); + mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser()); + mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser()); + } + + public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { + return mapping.get(version).parseLine(valueLine,expectedTagOrder); + } + + public static String toValue(VCFHeaderVersion version, Map keyValues) { + return mapping.get(version).toValue(keyValues); + } +} + + +interface VCFLineParser { + public String toValue(Map keyValues); + public Map parseLine(String valueLine, List expectedTagOrder); +} + + +/** + * a class that handles the to and from disk for VCF 4 lines + */ +class VCF4Parser implements VCFLineParser { + Set bracketed = new HashSet(); + + /** + * create a string of a mapping pair for the target VCF version + * @param keyValues a mapping of the key->value pairs to output + * @return a string, correctly formatted + */ + public String toValue(Map keyValues) { + StringBuilder builder = new StringBuilder(); + builder.append("<"); + boolean start = true; + for (Map.Entry entry : keyValues.entrySet()) { + if (start) start = false; + else builder.append(","); + builder.append(entry.getKey()); + builder.append("="); + builder.append(entry.getValue().toString().contains(",") || + entry.getValue().toString().contains(" ") || + entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue()); + } + builder.append(">"); + return builder.toString(); + } + + /** + * parse a VCF4 line + * @param valueLine the line + * @return a mapping of the tags parsed out + */ + public Map parseLine(String valueLine, List expectedTagOrder) { + // our return map + Map ret = new LinkedHashMap(); + + // a builder to store up characters as we go + StringBuilder builder = new StringBuilder(); + + // store the key when we're parsing out the values + String key = ""; + + // where are we in the stream of characters? + int index = 0; + + // are we inside a quotation? we don't special case ',' then + boolean inQuote = false; + + // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly + for (char c: valueLine.toCharArray()) { + switch (c) { + case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it + case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list + case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder + case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it + case (',') : if (!inQuote) { ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; } // drop the current key value to the return map + default: builder.append(c); // otherwise simply append to the current string + } + index++; + } + + // validate the tags against the expected list + index = 0; + if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size()); + for (String str : ret.keySet()) { + if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + index++; + } + return ret; + } +} + +class VCF3Parser implements VCFLineParser { + + public String toValue(Map keyValues) { + StringBuilder builder = new StringBuilder(); + boolean start = true; + for (Map.Entry entry : keyValues.entrySet()) { + if (start) start = false; + else builder.append(","); + builder.append(entry.getValue().toString().contains(",") || entry.getValue().toString().contains(" ")? "\""+ entry.getValue() + "\"" : entry.getValue()); + } + return builder.toString(); + } + + public Map parseLine(String valueLine, List expectedTagOrder) { + // our return map + Map ret = new LinkedHashMap(); + + // a builder to store up characters as we go + StringBuilder builder = new StringBuilder(); + + // store the key when we're parsing out the values + String key = ""; + + // where are we in the stream of characters? + int index = 0; + // where in the expected tag order are we? + int tagIndex = 0; + + // are we inside a quotation? we don't special case ',' then + boolean inQuote = false; + + // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly + for (char c: valueLine.toCharArray()) { + switch (c) { + case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it + case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map + default: builder.append(c); // otherwise simply append to the current string + } + index++; + } + ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); + + // validate the tags against the expected list + index = 0; + if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + for (String str : ret.keySet()){ + if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + index++; + } + return ret; + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFHeaderVersion.java b/java/src/org/broad/tribble/vcf/VCFHeaderVersion.java new file mode 100644 index 000000000..9803cf04e --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFHeaderVersion.java @@ -0,0 +1,78 @@ +package org.broad.tribble.vcf; + +/** + * information that identifies each header version + */ +public enum VCFHeaderVersion { + VCF3_2("VCRv3.2","format"), + VCF3_3("VCFv3.3","fileformat"), + VCF4_0("VCFv4.0","fileformat"); + + private final String versionString; + private final String formatString; + + /** + * create the enum, privately, using: + * @param vString the version string + * @param fString the format string + */ + VCFHeaderVersion(String vString, String fString) { + this.versionString = vString; + this.formatString = fString; + } + + /** + * get the header version + * @param version the version string + * @param format the format string + * @return a VCFHeaderVersion object + */ + public static VCFHeaderVersion toHeaderVersion(String version, String format) { + for (VCFHeaderVersion hv : VCFHeaderVersion.values()) + if (hv.versionString.equals(version) && hv.formatString.equals(format)) + return hv; + return null; + } + + /** + * get the header version + * @param version the version string + * @return a VCFHeaderVersion object + */ + public static VCFHeaderVersion toHeaderVersion(String version) { + for (VCFHeaderVersion hv : VCFHeaderVersion.values()) + if (hv.versionString.equals(version)) + return hv; + return null; + } + + /** + * are we a valid version string of some type + * @param version the version string + * @return true if we're valid of some type, false otherwise + */ + public static boolean isVersionString(String version){ + return toHeaderVersion(version) != null; + } + + /** + * are we a valid format string for some type + * @param format the format string + * @return true if we're valid of some type, false otherwise + */ + public static boolean isFormatString(String format){ + for (VCFHeaderVersion hv : VCFHeaderVersion.values()) + if (hv.formatString.equals(format)) + return true; + return false; + } + + + public String getVersionString() { + return versionString; + } + + public String getFormatString() { + return formatString; + } +} diff --git a/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java b/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java new file mode 100755 index 000000000..4d8d2c400 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFInfoHeaderLine.java @@ -0,0 +1,121 @@ +package org.broad.tribble.vcf; + +import org.broad.tribble.util.ParsingUtils; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; + + +/** + * @author ebanks + *

+ * Class VCFInfoHeaderLine + *

+ * A class representing a key=value entry for INFO fields in the VCF header + */ +public class VCFInfoHeaderLine extends VCFHeaderLine { + + // the info field types + public enum INFO_TYPE { + Integer, Float, String, Character, Flag; + + public Object convert(String value) { + switch (this) { + case Integer: + return java.lang.Integer.valueOf(value); // the java.lang is needed since we use Integer as a enum name + case Float: + return java.lang.Float.valueOf(value); + case String: + case Character: + return value; + case Flag: + return value.equals("0") ? false : true; + default: + throw new IllegalStateException("INFO_TYPE." + this + " doesn't have a set conversion approach"); + } + } + } + + private String mName; + private int mCount; + private String mDescription; + private INFO_TYPE mType; + + + // info line numerical values are allowed to be unbounded (or unknown), which is + // marked with a dot (.) + public static int UNBOUNDED = Integer.MIN_VALUE; + public static String UNBOUNDED_ENCODING = "."; + + /** + * create a VCF info header line + * + * @param name the name for this header line + * @param count the count for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + public VCFInfoHeaderLine(String name, int count, INFO_TYPE type, String description) { + super("INFO", ""); + mName = name; + mCount = count; + mType = type; + mDescription = description; + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the VCF version + */ + protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) { + super("INFO", "", version); + Map mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description")); + mName = mapping.get("ID"); + mCount = mapping.get("Number").equals(UNBOUNDED_ENCODING) ? UNBOUNDED : Integer.valueOf(mapping.get("Number")); + mType = INFO_TYPE.valueOf(mapping.get("Type")); + mDescription = mapping.get("Description"); + } + + protected String makeStringRep() { + if (mVersion == VCFHeaderVersion.VCF3_3 || mVersion == VCFHeaderVersion.VCF3_2) + return String.format("INFO=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription); + else if (mVersion == VCFHeaderVersion.VCF4_0) { + Map map = new LinkedHashMap(); + map.put("ID",mName); + map.put("Number",mCount == UNBOUNDED ? UNBOUNDED_ENCODING : mCount); + map.put("Type",mType); + map.put("Description",mDescription); + return "INFO=" + VCFHeaderLineTranslator.toValue(this.mVersion,map); + } + else throw new RuntimeException("Unsupported VCFVersion " + mVersion); + } + + public boolean equals(Object o) { + if ( !(o instanceof VCFInfoHeaderLine) ) + return false; + VCFInfoHeaderLine other = (VCFInfoHeaderLine)o; + return mName.equals(other.mName) && + mCount == other.mCount && + mDescription.equals(other.mDescription) && + mType == other.mType; + } + + public String getmName() { + return mName; + } + + public int getmCount() { + return mCount; + } + + public String getmDescription() { + return mDescription; + } + + public INFO_TYPE getmType() { + return mType; + } +} \ No newline at end of file diff --git a/java/src/org/broad/tribble/vcf/VCFReaderUtils.java b/java/src/org/broad/tribble/vcf/VCFReaderUtils.java new file mode 100644 index 000000000..3466ce0b3 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFReaderUtils.java @@ -0,0 +1,206 @@ +package org.broad.tribble.vcf; + + + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */ +public class VCFReaderUtils { + + // our pattern matching for the genotype mFields + private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)"); + + /** + * create a VCF header, given an array of strings that all start with at least the # character. This function is + * package protected so that the VCFReaderUtils can access this function + * + * @param headerStrings a list of header strings + * + * @return a VCF Header created from the list of stinrgs + */ + public static VCFHeader createHeader(List headerStrings, VCFHeaderVersion version) { + Set metaData = new TreeSet(); + Set auxTags = new LinkedHashSet(); + // iterate over all the passed in strings + for ( String str : headerStrings ) { + if ( !str.startsWith("##") ) { + String[] strings = str.substring(1).split("\\t"); + // the columns should be in order according to Richard Durbin + int arrayIndex = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) + throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]); + } catch (IllegalArgumentException e) { + throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name."); + } + arrayIndex++; + } + while (arrayIndex < strings.length) { + if (!strings[arrayIndex].equals("FORMAT")) + auxTags.add(strings[arrayIndex]); + arrayIndex++; + } + } else { + if ( str.startsWith("##INFO=") ) + metaData.add(new VCFInfoHeaderLine(str.substring(7),version)); + else if ( str.startsWith("##FILTER=") ) + metaData.add(new VCFFilterHeaderLine(str.substring(9),version)); + else if ( str.startsWith("##FORMAT=") ) + metaData.add(new VCFFormatHeaderLine(str.substring(9),version)); + else { + int equals = str.indexOf("="); + if ( equals != -1 ) + metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version)); + } + } + } + + return new VCFHeader(metaData, auxTags); + } + + /** + * create the next VCFRecord, given the input line + * + * @param line the line from the file + * @param mHeader the VCF header + * + * @return the VCFRecord + */ + public static VCFRecord createRecord(String line, VCFHeader mHeader) { + return createRecord(line, mHeader, false); + } + + public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) { + // things we need to make a VCF record + Map values = new HashMap(); + String tokens[] = line.split("\\t"); + + // check to ensure that the column count of tokens is right + if (tokens.length != mHeader.getColumnCount()) { + throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line); + } + + int index = 0; + for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) + values.put(field, tokens[index++]); + // if we have genotyping data, we try and extract the genotype fields + if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) { + String mFormatString = tokens[index]; + String keyStrings[] = mFormatString.split(":"); + List genotypeRecords = new ArrayList(); + index++; + String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(","); + for (String str : mHeader.getGenotypeSamples()) { + genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0))); + index++; + } + VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords); + // associate the genotypes with this new record + for ( VCFGenotypeRecord gr : genotypeRecords ) + gr.setVCFRecord(vrec); + return vrec; + + } + return new VCFRecord(values); + } + + /** + * generate a VCF genotype record, given it's format string, the genotype string, and allele info + * + * @param sampleName the sample name + * @param formatString the format string for this record, which contains the keys for the genotype parameters + * @param genotypeString contains the phasing information, allele information, and values for genotype parameters + * @param altAlleles the alternate allele string array, which we index into based on the field parameters + * @param referenceBase the reference base + * + * @return a VCFGenotypeRecord + */ + public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) { + return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase); + } + + /** + * generate a VCF genotype record, given it's format string, the genotype string, and allele info + * + * @param sampleName the sample name + * @param keyStrings the split format string for this record, which contains the keys for the genotype parameters + * @param genotypeString contains the phasing information, allele information, and values for genotype parameters + * @param altAlleles the alternate allele string array, which we index into based on the field parameters + * @param referenceBase the reference base + * + * @return a VCFGenotypeRecord + */ + public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) { + // parameters to create the VCF genotype record + HashMap tagToValue = new HashMap(); + VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN; + List bases = new ArrayList(); + + for (String key : keyStrings) { + String parse; + int nextDivider; + if (!genotypeString.contains(":")) { + nextDivider = genotypeString.length(); + parse = genotypeString; + } else { + nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":"); + parse = genotypeString.substring(0, nextDivider); + } + if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) { + Matcher m = gtPattern.matcher(parse); + if (!m.matches()) + throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse); + phase = VCFGenotypeRecord.determinePhase(m.group(2)); + addAllele(m.group(1), altAlleles, referenceBase, bases); + if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases); + } else { + if ( parse.length() == 0 ) + parse = VCFGenotypeRecord.getMissingFieldValue(key); + tagToValue.put(key, parse); + } + if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1; + genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length()); + } + if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) ) + tagToValue.clear(); + // catch some common errors, either there are too many field keys or there are two many field values + else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0)) + throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected " + + keyStrings.length + " but saw " + tagToValue.size() + ")"); + else if ( genotypeString.length() > 0 ) + throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString + + ". This most likely means that the format string is shorter then the value fields."); + + VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase); + for ( Map.Entry entry : tagToValue.entrySet() ) + rec.setField(entry.getKey(), entry.getValue()); + return rec; + } + + + /** + * add an alternate allele to the list of alleles we have for a VCF genotype record + * + * @param alleleNumber the allele number, as a string + * @param altAlleles the list of alternate alleles + * @param referenceBase the reference base + * @param bases the list of bases for this genotype call + */ + private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List bases) { + if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) { + bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); + } else { + int alleleValue = Integer.valueOf(alleleNumber); + // check to make sure the allele value is within bounds + if (alleleValue < 0 || alleleValue > altAlleles.length) + throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list."); + if (alleleValue == 0) + bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase))); + else + bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1])); + } + } +} diff --git a/java/src/org/broad/tribble/vcf/VCFRecord.java b/java/src/org/broad/tribble/vcf/VCFRecord.java new file mode 100644 index 000000000..5b9bff8f8 --- /dev/null +++ b/java/src/org/broad/tribble/vcf/VCFRecord.java @@ -0,0 +1,682 @@ +package org.broad.tribble.vcf; + + +import org.broad.tribble.Feature; +import org.broad.tribble.util.ParsingUtils; + +import java.util.*; + +/** the basic VCF record type */ +public class VCFRecord implements Feature { + + // standard info field keys + public static final String ANCESTRAL_ALLELE_KEY = "AA"; + public static final String ALLELE_COUNT_KEY = "AC"; + public static final String ALLELE_FREQUENCY_KEY = "AF"; + public static final String ALLELE_NUMBER_KEY = "AN"; + public static final String RMS_BASE_QUALITY_KEY = "BQ"; + public static final String DBSNP_KEY = "DB"; + public static final String DEPTH_KEY = "DP"; + public static final String HAPMAP2_KEY = "H2"; + public static final String HAPMAP3_KEY = "H3"; + public static final String RMS_MAPPING_QUALITY_KEY = "MQ"; + public static final String SAMPLE_NUMBER_KEY = "NS"; + public static final String STRAND_BIAS_KEY = "SB"; + + // commonly used strings that are in the standard + public static final String FORMAT_FIELD_SEPERATOR = ":"; + public static final String GENOTYPE_FIELD_SEPERATOR = ":"; + public static final String FIELD_SEPERATOR = "\t"; + public static final String FILTER_CODE_SEPERATOR = ";"; + public static final String INFO_FIELD_SEPERATOR = ";"; + + // default values + public static final String UNFILTERED = "."; + public static final String PASSES_FILTERS = "0"; + public static final String EMPTY_INFO_FIELD = "."; + public static final String EMPTY_ID_FIELD = "."; + public static final String EMPTY_ALLELE_FIELD = "."; + public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f"; + public static final int MISSING_GENOTYPE_QUALITY = -1; + + // the reference base + private String mReferenceBases; + // our location + private String mContig; + private int mPosition; + // our id + private String mID; + // the alternate bases + private final List mAlts = new ArrayList(); + // our qual value + private double mQual; + // our filter string + private String mFilterString; + // our info fields -- use a TreeMap to ensure they can be pulled out in order (so it passes integration tests) + private final Map mInfoFields = new TreeMap(); + + // our genotype formatting string + private String mGenotypeFormatString; + + // the vcf header we're associated with + private VCFHeader vcfHeader = null; + + // our genotype sample fields + private final List mGenotypeRecords = new ArrayList(); + + /** + * given a reference base, a location, and the format string, create a VCF record. + * + * @param referenceBases the reference bases to use + * @param contig our contig + * @param start the start location + * @param genotypeFormatString the format string + */ + public VCFRecord(String referenceBases, String contig, int start, String genotypeFormatString) { + setReferenceBase(referenceBases); + setLocation(contig, start); + mGenotypeFormatString = genotypeFormatString; + } + + /** + * given the values for each of the columns, create a VCF record. + * + * @param columnValues a mapping of header strings to values + * @param genotypeFormatString the format string for the genotype records + * @param genotypeRecords the genotype records + */ + public VCFRecord(Map columnValues, String genotypeFormatString, List genotypeRecords) { + extractFields(columnValues); + mGenotypeRecords.addAll(genotypeRecords); + mGenotypeFormatString = genotypeFormatString; + } + + /** + * given the values for each of the columns, create a VCF record. + * + * @param columnValues a mapping of header strings to values + */ + public VCFRecord(Map columnValues) { + extractFields(columnValues); + mGenotypeFormatString = ""; + } + + /** + * create a VCF record + * + * @param referenceBases the reference bases to use + * @param contig the contig this variant is on + * @param position our position + * @param ID our ID string + * @param altBases the list of alternate bases + * @param qual the qual field + * @param filters the filters used on this variant + * @param infoFields the information fields + * @param genotypeFormatString the format string + * @param genotypeObjects the genotype objects + */ + public VCFRecord(String referenceBases, + String contig, + long position, + String ID, + List altBases, + double qual, + String filters, + Map infoFields, + String genotypeFormatString, + List genotypeObjects) { + setReferenceBase(referenceBases); + setLocation(contig, position); + this.mID = ID; + for (VCFGenotypeEncoding alt : altBases) + this.addAlternateBase(alt); + this.setQual(qual); + this.setFilterString(filters); + this.mInfoFields.putAll(infoFields); + this.mGenotypeFormatString = genotypeFormatString; + this.mGenotypeRecords.addAll(genotypeObjects); + } + + /** + * extract the field values from the passed in array + * + * @param columnValues a map of the header fields to values + */ + private void extractFields(Map columnValues) { + String chrom = null; + long position = -1; + + for (VCFHeader.HEADER_FIELDS val : columnValues.keySet()) { + switch (val) { + case CHROM: + chrom = columnValues.get(val); + break; + case POS: + position = Integer.valueOf(columnValues.get(val)); + break; + case ID: + setID(columnValues.get(val)); + break; + case REF: + if (columnValues.get(val).length() != 1) + throw new IllegalArgumentException("Reference base should be a single character"); + setReferenceBase(columnValues.get(val)); + break; + case ALT: + String values[] = columnValues.get(val).split(","); + for (String alt : values) + addAlternateBase(new VCFGenotypeEncoding(alt)); + break; + case QUAL: + setQual(Double.valueOf(columnValues.get(val))); + break; + case FILTER: + setFilterString(columnValues.get(val)); + break; + case INFO: + String vals[] = columnValues.get(val).split(";"); + for (String alt : vals) { + if ( alt.equals(EMPTY_INFO_FIELD) ) + continue; + String keyVal[] = alt.split("="); + if ( keyVal.length == 1 ) + addInfoField(keyVal[0], ""); + else if (keyVal.length == 2) + addInfoField(keyVal[0], keyVal[1]); + else + throw new IllegalArgumentException("info field key-value pair did not parse into key->value pair: " + alt); + } + break; + } + } + setLocation(chrom, position); + } + + /** + * do we have genotyping data + * + * @return true if we have genotyping data, false otherwise + */ + + public boolean hasGenotypeData() { + return (mGenotypeRecords.size() > 0); + } + + /** + * @return the ID value for this record + */ + public String getID() { + return mID == null ? EMPTY_ID_FIELD : mID; + } + + /** + * get the reference base + * + * @return either A, T, C, G, or N + */ + public String getReference() { + return mReferenceBases; + } + + /** + * get the alternate allele strings + * + * @return an array of strings representing the alt alleles, or null if there are none + */ + public List getAlternateAlleleList() { + ArrayList alts = new ArrayList(); + for ( VCFGenotypeEncoding alt : mAlts ) + alts.add(alt.getBases()); + return alts; + } + + public List getAlternateAlleles() { + return mAlts; + } + + public boolean hasAlternateAllele() { + for ( VCFGenotypeEncoding alt : mAlts ) { + if ( alt.getType() != VCFGenotypeEncoding.TYPE.UNCALLED ) + return true; + } + + return false; + } + + public boolean isBiallelic() { + return getAlternateAlleles().size() == 1; + } + + public boolean isReference() { + return !hasAlternateAllele(); + } + + public List getAlleleList() { + ArrayList list = new ArrayList(); + list.add(getReference()); + list.addAll(getAlternateAlleleList()); + return list; + } + + public double getNonRefAlleleFrequency() { + if ( mInfoFields.containsKey(ALLELE_FREQUENCY_KEY) ) { + return Double.valueOf(mInfoFields.get(ALLELE_FREQUENCY_KEY)); + } else { + // this is the poor man's AF + if ( mInfoFields.containsKey(ALLELE_COUNT_KEY) && mInfoFields.containsKey(ALLELE_NUMBER_KEY)) { + String splt[] = mInfoFields.get(ALLELE_COUNT_KEY).split(","); + if ( splt.length > 0 ) { + return (Double.valueOf(splt[0]) / Double.valueOf(mInfoFields.get(ALLELE_NUMBER_KEY))); + } + } + } + + return 0.0; + } + + public VCFGenotypeEncoding.TYPE getType() { + VCFGenotypeEncoding.TYPE type = mAlts.get(0).getType(); + for (int i = 1; i < mAlts.size(); i++) { + if ( mAlts.get(i).getType() != type ) + return VCFGenotypeEncoding.TYPE.MIXED; // if we have more than one type, return mixed + } + return type; + } + + public boolean isDeletion() { + return getType() == VCFGenotypeEncoding.TYPE.DELETION; + } + + public boolean isInsertion() { + return getType() == VCFGenotypeEncoding.TYPE.INSERTION; + } + + public boolean isIndel() { + return isDeletion() || isInsertion(); + } + + public boolean isSNP() { + return getType() == VCFGenotypeEncoding.TYPE.SINGLE_BASE; + } + + public boolean isNovel() { + return ( ! isInDBSNP() ) && ( ! isInHapmap() ); + } + + public boolean isInDBSNP() { + return ( ( mID != null && ! mID.equals(".") ) || ( mInfoFields.get(DBSNP_KEY) != null && mInfoFields.get(DBSNP_KEY).equals("1") ) ); + } + + public boolean isInHapmap() { + if ( mInfoFields.get(HAPMAP2_KEY) != null && mInfoFields.get(HAPMAP2_KEY).equals("1") ) { + return true; + } else { + return ( mInfoFields.get(HAPMAP3_KEY) != null && mInfoFields.get(HAPMAP3_KEY).equals("1") ); + } + } + + public char getAlternativeBaseForSNP() { + if ( !isSNP() && !isBiallelic() ) + throw new IllegalStateException("This record does not represent a SNP"); + return mAlts.get(0).getBases().charAt(0); + } + + public char getReferenceForSNP() { + if ( !isSNP() ) + throw new IllegalStateException("This record does not represent a SNP"); + return getReference().charAt(0); + } + + /** + * @return the phred-scaled quality score + */ + public double getQual() { + return mQual; + } + + public int getPosition() { + return mPosition; + } + + public boolean isMissingQual() { + return (int)mQual == MISSING_GENOTYPE_QUALITY; + } + + /** + * @return the -log10PError + */ + public double getNegLog10PError() { + return mQual / 10.0; + } + + /** + * get the filter criteria + * + * @return an array of strings representing the filtering criteria, or UNFILTERED if none are applied + */ + public String[] getFilteringCodes() { + if (mFilterString == null) return new String[]{UNFILTERED}; + return mFilterString.split(FILTER_CODE_SEPERATOR); + } + + public boolean isFiltered() { + String[] codes = getFilteringCodes(); + return !codes[0].equals(UNFILTERED) && !codes[0].equals(PASSES_FILTERS); + } + +// public boolean hasFilteringCodes() { +// return mFilterString != null; +// } + + public String getFilterString() { + return mFilterString; + } + + /** + * get the information key-value pairs as a Map<> + * + * @return a map, of the info key-value pairs + */ + public final Map getInfoValues() { + return mInfoFields; + } + + public List getVCFGenotypeRecords() { + return mGenotypeRecords; + } + + /** + * @return a List of the sample names + */ + public String[] getSampleNames() { + String names[] = new String[mGenotypeRecords.size()]; + for (int i = 0; i < mGenotypeRecords.size(); i++) { + names[i] = mGenotypeRecords.get(i).getSampleName(); + } + return names; + } + + public VCFGenotypeRecord getGenotype(final String sampleName) { + for ( VCFGenotypeRecord rec : getVCFGenotypeRecords() ) { + if ( rec.getSampleName().equals(sampleName) ) { + return rec; + } + } + + return null; + } + + public String getGenotypeFormatString() { + return mGenotypeFormatString; + }// the formatting string for our genotype records + + public void setGenotypeFormatString(String newFormatString) { + mGenotypeFormatString = newFormatString; + } + + public void setReferenceBase(String reference) { + mReferenceBases = reference.toUpperCase(); + } + + public void setLocation(String chrom, long position) { + if ( chrom == null ) + throw new IllegalArgumentException("Chromosomes cannot be missing"); + if ( position < 0 ) + throw new IllegalArgumentException("Position values must be greater than 0"); + this.mContig = chrom; + this.mPosition = (int)position; + } + + public void setID(String ID) { + mID = ID; + } + + public void setQual(double qual) { + if ( qual < 0 && (int)qual != MISSING_GENOTYPE_QUALITY ) + throw new IllegalArgumentException("Qual values cannot be negative unless they are " + MISSING_GENOTYPE_QUALITY + " ('unknown')"); + mQual = qual; + } + + public void setFilterString(String filterString) { + mFilterString = filterString; + } + + public void addGenotypeRecord(VCFGenotypeRecord mGenotypeRecord) { + mGenotypeRecords.add(mGenotypeRecord); + } + + public void setGenotypeRecords(List records) { + mGenotypeRecords.clear(); + for ( VCFGenotypeRecord g : records ) + addGenotypeRecord(g); + } + + /** + * add an alternate base to our alternate base list. All bases are uppercased + * before being added to the list. + * + * @param base the base to add + */ + public void addAlternateBase(VCFGenotypeEncoding base) { + if (!mAlts.contains(base)) mAlts.add(base); + } + + public void setAlternateBases(List bases) { + mAlts.clear(); + for ( VCFGenotypeEncoding e : bases ) + addAlternateBase(e); + } + + /** + * add an info field to the record + * + * @param key the key, from the spec or a user created key + * @param value it's value as a string + */ + public void addInfoField(String key, String value) { + //System.out.printf("Adding info field %s=%s%n", key, value); + mInfoFields.put(key, value); + } + + public void printInfoFields() { + for ( Map.Entry e : mInfoFields.entrySet() ) { + System.out.printf(" Current info field %s=%s this=%s%n", e.getKey(), e.getValue(), this); + } + } + + + /** + * add an info field to the record + * + * @param m A map from info keys to info values + */ + public void addInfoFields(Map m) { + for ( Map.Entry e : m.entrySet() ) + addInfoField(e.getKey(), e.getValue()); + } + + + /** + * the generation of a string representation, which is used by the VCF writer + * + * @param header the VCF header for this VCF Record + * @return a string + */ + public String toStringEncoding(VCFHeader header) { + StringBuilder builder = new StringBuilder(); + + // CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO + builder.append(mContig); + builder.append(FIELD_SEPERATOR); + builder.append(mPosition); + builder.append(FIELD_SEPERATOR); + builder.append(getID()); + builder.append(FIELD_SEPERATOR); + builder.append(getReference()); + builder.append(FIELD_SEPERATOR); + List alts = getAlternateAlleles(); + if ( alts.size() > 0 ) { + builder.append(alts.get(0)); + for ( int i = 1; i < alts.size(); i++ ) { + builder.append(","); + builder.append(alts.get(i)); + } + } else { + builder.append(EMPTY_ALLELE_FIELD); + } + builder.append(FIELD_SEPERATOR); + if ( (int)mQual == MISSING_GENOTYPE_QUALITY ) + builder.append(MISSING_GENOTYPE_QUALITY); + else + builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual)); + builder.append(FIELD_SEPERATOR); + builder.append(ParsingUtils.join(FILTER_CODE_SEPERATOR, getFilteringCodes())); + builder.append(FIELD_SEPERATOR); + builder.append(createInfoString()); + + if ( mGenotypeFormatString != null && mGenotypeFormatString.length() > 0 ) { +// try { + addGenotypeData(builder, header); +// } catch (Exception e) { +// if ( validationStringency == VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT ) +// throw new RuntimeException(e); +// } + } + + return builder.toString(); + } + + /** + * create the info string + * + * @return a string representing the infomation fields + */ + protected String createInfoString() { + StringBuffer info = new StringBuffer(); + boolean isFirst = true; + for (Map.Entry entry : mInfoFields.entrySet()) { + if ( isFirst ) + isFirst = false; + else + info.append(INFO_FIELD_SEPERATOR); + info.append(entry.getKey()); + if ( entry.getValue() != null && !entry.getValue().equals("") ) { + info.append("="); + info.append(entry.getValue()); + } + } + return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString(); + } + + /** + * add the genotype data + * + * @param builder the string builder + * @param header the header object + */ + private void addGenotypeData(StringBuilder builder, VCFHeader header) { + Map gMap = genotypeListToMap(getVCFGenotypeRecords()); + + StringBuffer tempStr = new StringBuffer(); + if ( header.getGenotypeSamples().size() < getVCFGenotypeRecords().size() ) { + for ( String sample : gMap.keySet() ) { + if ( !header.getGenotypeSamples().contains(sample) ) + System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header"); + else + header.getGenotypeSamples().remove(sample); + } + throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated"); + } + tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString); + + String[] genotypeFormatStrings = mGenotypeFormatString.split(":"); + + for ( String genotype : header.getGenotypeSamples() ) { + tempStr.append(FIELD_SEPERATOR); + if ( gMap.containsKey(genotype) ) { + VCFGenotypeRecord rec = gMap.get(genotype); + tempStr.append(rec.toStringEncoding(mAlts, genotypeFormatStrings)); + gMap.remove(genotype); + } else { + tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings)); + } + } + if ( gMap.size() != 0 ) { + for ( String sample : gMap.keySet() ) + System.err.println("Sample " + sample + " is being genotyped but isn't in the header."); + throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records"); + } + + builder.append(tempStr); + } + + /** + * compare two VCF records + * + * @param other the other VCF record + * @return true if they're equal + */ + public boolean equals(VCFRecord other) { + if (!this.mAlts.equals(other.mAlts)) return false; + if (!this.mReferenceBases.equals(other.mReferenceBases)) return false; + if (!this.mContig.equals(other.mContig)) return false; + if (mPosition != other.mPosition) return false; + if (!this.mID.equals(other.mID)) return false; + if (this.mQual != other.mQual) return false; + if ( this.mFilterString == null ) { + if ( other.mFilterString != null ) return false; + } else if ( !this.mFilterString.equals(other.mFilterString) ) return false; + if (!this.mInfoFields.equals(other.mInfoFields)) return false; + if (!this.mGenotypeRecords.equals(other.mGenotypeRecords)) return false; + return true; + } + + /** + * create a genotype mapping from a list and their sample names + * + * @param list a list of genotype samples + * @return a mapping of the sample name to VCF genotype record + */ + private static Map genotypeListToMap(List list) { + Map map = new HashMap(); + for (int i = 0; i < list.size(); i++) { + VCFGenotypeRecord rec = list.get(i); + map.put(rec.getSampleName(), rec); + } + return map; + } + + /** Return the features reference sequence name, e.g chromosome or contig */ + public String getChr() { + return this.mContig; + } + + /** Return the start position in 1-based coordinates (first base is 1) */ + public int getStart() { + return this.mPosition; + } + + /** + * Return the end position following 1-based fully closed conventions. The length of a feature is + * end - start + 1; + */ + public int getEnd() { + return this.mPosition; + } + + /** + * set the VCF header we're associated with + * @param header the header + */ + void setHeader(VCFHeader header) { + vcfHeader = header; + } + + /** + * get the associated header + * @return the VCF Header + */ + public VCFHeader getHeader() { + return vcfHeader; + } +}