gatk-3.8/java/src/org/broad/tribble/vcf/VCFReaderUtils.java

207 lines
10 KiB
Java
Raw Normal View History

package org.broad.tribble.vcf;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */
public class VCFReaderUtils {
// our pattern matching for the genotype mFields
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
/**
* create a VCF header, given an array of strings that all start with at least the # character. This function is
* package protected so that the VCFReaderUtils can access this function
*
* @param headerStrings a list of header strings
First step in major cleanup/redo of VCF functionality. Specifically, now: a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version. b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted. c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format. d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved. e) Several VCF 4.0 writer bugs are now solved. f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0. g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension. Pending issues: a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes. b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
* @param version Header version to parse
* @return a VCF Header created from the list of stinrgs
*/
public static VCFHeader createHeader(List<String> headerStrings, VCFHeaderVersion version) {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> auxTags = new LinkedHashSet<String>();
// iterate over all the passed in strings
for ( String str : headerStrings ) {
if ( !str.startsWith("##") ) {
String[] strings = str.substring(1).split("\\t");
// the columns should be in order according to Richard Durbin
int arrayIndex = 0;
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
try {
if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]);
} catch (IllegalArgumentException e) {
throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name.");
}
arrayIndex++;
}
while (arrayIndex < strings.length) {
if (!strings[arrayIndex].equals("FORMAT"))
auxTags.add(strings[arrayIndex]);
arrayIndex++;
}
} else {
if ( str.startsWith("##INFO=") )
metaData.add(new VCFInfoHeaderLine(str.substring(7),version));
else if ( str.startsWith("##FILTER=") )
metaData.add(new VCFFilterHeaderLine(str.substring(9),version));
else if ( str.startsWith("##FORMAT=") )
metaData.add(new VCFFormatHeaderLine(str.substring(9),version));
else {
int equals = str.indexOf("=");
if ( equals != -1 )
First step in major cleanup/redo of VCF functionality. Specifically, now: a) VCF track name can work again with 3.3 or 4.0 VCF's when specifying -B name,VCF,file. Code will read header and parse automatically the version. b) Old VCF codec is deprecated. Reader goes now direct from parsing VCF lines into producing VariantContext objects, with no intermediate VCF records. If anyone can't resist the urge to still input files using the old method, a new VCF3Codec is in place with the old code, but it will be eventually deleted. c) VCF headers and VCF info fields no longer keep track of the version. They are parsed into an internal representation and will be output only in VCF4.0 format. d) As a consequence, the existing GATK bug where files are produced with VCF4 body but VCF3.3 headers is solved. e) Several VCF 4.0 writer bugs are now solved. f) Integration test MD5's are changed, mostly because of corrected VCF4.0 headers and because validation data mostly uses now VCF4.0. g) Several VCF files in the ValidationData/ directory have been converted to VCF 4.0 format. I kept the old versions, and the new versions have a .vcf4 extension. Pending issues: a) We are still not dealing with indels consistently or correctly when representing them. This will be a second part of the changes. b) The VCF writer doesn't use VCFRecord but it does still use a lot of leftovers like VCFGenotypeEncoding, VCFGenotypeRecord, etc. This needs to be simplified and cleaned. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3813 348d0f76-0448-11de-a6fe-93d51630548a
2010-07-17 06:49:16 +08:00
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
}
}
}
return new VCFHeader(metaData, auxTags);
}
/**
* create the next VCFRecord, given the input line
*
* @param line the line from the file
* @param mHeader the VCF header
*
* @return the VCFRecord
*/
public static VCFRecord createRecord(String line, VCFHeader mHeader) {
return createRecord(line, mHeader, false);
}
public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) {
// things we need to make a VCF record
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
String tokens[] = line.split("\\t");
// check to ensure that the column count of tokens is right
if (tokens.length != mHeader.getColumnCount()) {
throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line);
}
int index = 0;
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields())
values.put(field, tokens[index++]);
// if we have genotyping data, we try and extract the genotype fields
if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) {
String mFormatString = tokens[index];
String keyStrings[] = mFormatString.split(":");
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
index++;
String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
for (String str : mHeader.getGenotypeSamples()) {
genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
index++;
}
VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords);
// associate the genotypes with this new record
for ( VCFGenotypeRecord gr : genotypeRecords )
gr.setVCFRecord(vrec);
return vrec;
}
return new VCFRecord(values);
}
/**
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
*
* @param sampleName the sample name
* @param formatString the format string for this record, which contains the keys for the genotype parameters
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
* @param referenceBase the reference base
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase);
}
/**
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
*
* @param sampleName the sample name
* @param keyStrings the split format string for this record, which contains the keys for the genotype parameters
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
* @param referenceBase the reference base
*
* @return a VCFGenotypeRecord
*/
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) {
// parameters to create the VCF genotype record
HashMap<String, String> tagToValue = new HashMap<String, String>();
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNPHASED;
List<VCFGenotypeEncoding> bases = new ArrayList<VCFGenotypeEncoding>();
for (String key : keyStrings) {
String parse;
int nextDivider;
if (!genotypeString.contains(":")) {
nextDivider = genotypeString.length();
parse = genotypeString;
} else {
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
parse = genotypeString.substring(0, nextDivider);
}
if (key.equals(VCFConstants.GENOTYPE_KEY)) {
Matcher m = gtPattern.matcher(parse);
if (!m.matches())
throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse);
phase = VCFGenotypeRecord.determinePhase(m.group(2));
addAllele(m.group(1), altAlleles, referenceBase, bases);
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
} else {
if ( parse.length() == 0 )
parse = VCFGenotypeRecord.getMissingFieldValue(key);
tagToValue.put(key, parse);
}
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length());
}
if ( bases.size() > 0 && bases.get(0).equals(VCFConstants.EMPTY_ALLELE) )
tagToValue.clear();
// catch some common errors, either there are too many field keys or there are two many field values
else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0))
throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected "
+ keyStrings.length + " but saw " + tagToValue.size() + ")");
else if ( genotypeString.length() > 0 )
throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString
+ ". This most likely means that the format string is shorter then the value fields.");
VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase);
for ( Map.Entry<String, String> entry : tagToValue.entrySet() )
rec.setField(entry.getKey(), entry.getValue());
return rec;
}
/**
* add an alternate allele to the list of alleles we have for a VCF genotype record
*
* @param alleleNumber the allele number, as a string
* @param altAlleles the list of alternate alleles
* @param referenceBase the reference base
* @param bases the list of bases for this genotype call
*/
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
if (alleleNumber.equals(VCFConstants.EMPTY_ALLELE)) {
bases.add(new VCFGenotypeEncoding(VCFConstants.EMPTY_ALLELE));
} else {
int alleleValue = Integer.valueOf(alleleNumber);
// check to make sure the allele value is within bounds
if (alleleValue < 0 || alleleValue > altAlleles.length)
throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
if (alleleValue == 0)
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
else
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
}
}
}