package org.broad.tribble.vcf; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /** The VCFReaderUtils class, which contains a collection of utilities for working with VCF files */ public class VCFReaderUtils { // our pattern matching for the genotype mFields private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)"); /** * create a VCF header, given an array of strings that all start with at least the # character. This function is * package protected so that the VCFReaderUtils can access this function * * @param headerStrings a list of header strings * * @return a VCF Header created from the list of stinrgs */ public static VCFHeader createHeader(List headerStrings, VCFHeaderVersion version) { Set metaData = new TreeSet(); Set auxTags = new LinkedHashSet(); // iterate over all the passed in strings for ( String str : headerStrings ) { if ( !str.startsWith("##") ) { String[] strings = str.substring(1).split("\\t"); // the columns should be in order according to Richard Durbin int arrayIndex = 0; for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { try { if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) throw new RuntimeException("VCFReaderUtils: we were expecting column name " + field + " but we saw " + strings[arrayIndex]); } catch (IllegalArgumentException e) { throw new RuntimeException("VCFReaderUtils: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name."); } arrayIndex++; } while (arrayIndex < strings.length) { if (!strings[arrayIndex].equals("FORMAT")) auxTags.add(strings[arrayIndex]); arrayIndex++; } } else { if ( str.startsWith("##INFO=") ) metaData.add(new VCFInfoHeaderLine(str.substring(7),version)); else if ( str.startsWith("##FILTER=") ) metaData.add(new VCFFilterHeaderLine(str.substring(9),version)); else if ( str.startsWith("##FORMAT=") ) metaData.add(new VCFFormatHeaderLine(str.substring(9),version)); else { int equals = str.indexOf("="); if ( equals != -1 ) metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1),version)); } } } return new VCFHeader(metaData, auxTags); } /** * create the next VCFRecord, given the input line * * @param line the line from the file * @param mHeader the VCF header * * @return the VCFRecord */ public static VCFRecord createRecord(String line, VCFHeader mHeader) { return createRecord(line, mHeader, false); } public static VCFRecord createRecord(String line, VCFHeader mHeader, boolean ignoreGenotypes) { // things we need to make a VCF record Map values = new HashMap(); String tokens[] = line.split("\\t"); // check to ensure that the column count of tokens is right if (tokens.length != mHeader.getColumnCount()) { throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length + ". Line = " + line); } int index = 0; for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) values.put(field, tokens[index++]); // if we have genotyping data, we try and extract the genotype fields if ( ! ignoreGenotypes && mHeader.hasGenotypingData()) { String mFormatString = tokens[index]; String keyStrings[] = mFormatString.split(":"); List genotypeRecords = new ArrayList(); index++; String[] alt_alleles = values.get(VCFHeader.HEADER_FIELDS.ALT).split(","); for (String str : mHeader.getGenotypeSamples()) { genotypeRecords.add(getVCFGenotype(str, keyStrings, tokens[index], alt_alleles, values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0))); index++; } VCFRecord vrec = new VCFRecord(values, mFormatString, genotypeRecords); // associate the genotypes with this new record for ( VCFGenotypeRecord gr : genotypeRecords ) gr.setVCFRecord(vrec); return vrec; } return new VCFRecord(values); } /** * generate a VCF genotype record, given it's format string, the genotype string, and allele info * * @param sampleName the sample name * @param formatString the format string for this record, which contains the keys for the genotype parameters * @param genotypeString contains the phasing information, allele information, and values for genotype parameters * @param altAlleles the alternate allele string array, which we index into based on the field parameters * @param referenceBase the reference base * * @return a VCFGenotypeRecord */ public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) { return getVCFGenotype(sampleName, formatString.split(":"), genotypeString, altAlleles, referenceBase); } /** * generate a VCF genotype record, given it's format string, the genotype string, and allele info * * @param sampleName the sample name * @param keyStrings the split format string for this record, which contains the keys for the genotype parameters * @param genotypeString contains the phasing information, allele information, and values for genotype parameters * @param altAlleles the alternate allele string array, which we index into based on the field parameters * @param referenceBase the reference base * * @return a VCFGenotypeRecord */ public static VCFGenotypeRecord getVCFGenotype(String sampleName, String[] keyStrings, String genotypeString, String altAlleles[], char referenceBase) { // parameters to create the VCF genotype record HashMap tagToValue = new HashMap(); VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN; List bases = new ArrayList(); for (String key : keyStrings) { String parse; int nextDivider; if (!genotypeString.contains(":")) { nextDivider = genotypeString.length(); parse = genotypeString; } else { nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":"); parse = genotypeString.substring(0, nextDivider); } if (key.equals(VCFGenotypeRecord.GENOTYPE_KEY)) { Matcher m = gtPattern.matcher(parse); if (!m.matches()) throw new RuntimeException("VCFReaderUtils: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse); phase = VCFGenotypeRecord.determinePhase(m.group(2)); addAllele(m.group(1), altAlleles, referenceBase, bases); if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases); } else { if ( parse.length() == 0 ) parse = VCFGenotypeRecord.getMissingFieldValue(key); tagToValue.put(key, parse); } if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1; genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length()); } if ( bases.size() > 0 && bases.get(0).equals(VCFGenotypeRecord.EMPTY_ALLELE) ) tagToValue.clear(); // catch some common errors, either there are too many field keys or there are two many field values else if ( keyStrings.length != tagToValue.size() + ((bases.size() > 0) ? 1 : 0)) throw new RuntimeException("VCFReaderUtils: genotype value count doesn't match the key count (expected " + keyStrings.length + " but saw " + tagToValue.size() + ")"); else if ( genotypeString.length() > 0 ) throw new RuntimeException("VCFReaderUtils: genotype string contained additional unprocessed fields: " + genotypeString + ". This most likely means that the format string is shorter then the value fields."); VCFGenotypeRecord rec = new VCFGenotypeRecord(sampleName, bases, phase); for ( Map.Entry entry : tagToValue.entrySet() ) rec.setField(entry.getKey(), entry.getValue()); return rec; } /** * add an alternate allele to the list of alleles we have for a VCF genotype record * * @param alleleNumber the allele number, as a string * @param altAlleles the list of alternate alleles * @param referenceBase the reference base * @param bases the list of bases for this genotype call */ private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List bases) { if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_ALLELE)) { bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); } else { int alleleValue = Integer.valueOf(alleleNumber); // check to make sure the allele value is within bounds if (alleleValue < 0 || alleleValue > altAlleles.length) throw new IllegalArgumentException("VCFReaderUtils: the allele value of " + alleleValue + " is out of bounds given the alternate allele list."); if (alleleValue == 0) bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase))); else bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1])); } } }