diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 1052998d7..7becf3cfc 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -4,8 +4,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** @@ -26,77 +24,28 @@ public class VCFGenotypeRecord { private GT_GENOTYPE phaseType; // our reference bases(s) - private final char reference; + private final char mReferenceBase; // our bases(s) - private final List bases = new ArrayList(); + private final List mAlleleBases = new ArrayList(); - // our mapping of the format fields to values - private final Map fields = new HashMap(); - - // our pattern matching for the genotype fields - private static final Pattern basicSplit = Pattern.compile("([0-9]*)([\\\\|\\/])([0-9]*):(\\S*)"); + // our mapping of the format mFields to values + private final Map mFields = new HashMap(); /** - * generate a VCF genotype record, given it's format string, the genotype string, and allele info + * create a VCF record * - * @param formatString the format string for this record, which contains the keys for the genotype parameters - * @param genotypeString contains the phasing information, allele information, and values for genotype parameters - * @param altAlleles the alternate allele string array, which we index into based on the field parameters - * @param referenceBase the reference base - */ - protected VCFGenotypeRecord(String formatString, String genotypeString, String altAlleles[], char referenceBase) { - reference = referenceBase; - // check that the first format field is GT, which is required - String keys[] = formatString.split(":"); - if (keys.length < 0 || !keys[0].equals("GT")) - 
throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)"); - - // find the values for each of the keys, of which the GT field should be the first - Matcher match = basicSplit.matcher(genotypeString); - if (!match.matches() || match.groupCount() < 3) - throw new IllegalArgumentException("Unable to match genotype string to expected regex"); - - // add the alternate base (which can be ref by specifying 0) - addAllele(match.group(1), altAlleles, referenceBase); - - determinePhase(match.group(2)); - - // do we have a second alt allele? - if (match.group(3).length() > 0) { - addAllele(match.group(3), altAlleles, referenceBase); - } - - // check to see what other records we have - if (match.groupCount() == 4) { - // make sure we'll have enough occurances - String tokens[] = match.group(4).split(":{1}"); // the {1} was required, since string.split does a greedy match of the specified regex, like :+ - int keyIndex = 1; - for (String token: tokens) { - this.fields.put(keys[keyIndex],token); - keyIndex++; - } - if (keyIndex + 1 == tokens.length) fields.put(keys[++keyIndex],""); // if the last value is blank, split will leave it off - if (keyIndex == 1 && match.group(4).contains(":")) { - // there was a string of all semicolons, split doesn't handle this well (or at all) - while(keyIndex < keys.length) this.fields.put(keys[keyIndex++],""); - } - } - - } - - /** - * add an alternate allele to the list of alleles we have - * - * @param alleleNumber the allele number, as a string - * @param altAlleles the list of alternate alleles + * @param keyValues the key values + * @param Alleles the alleles, one if we're halpoid, two if we're diploid + * @param phasing the phasing of the the genotype * @param referenceBase the reference base */ - private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase) { - if (Integer.valueOf(alleleNumber) == 0) - bases.add(String.valueOf(referenceBase)); - else - 
bases.add(altAlleles[Integer.valueOf(alleleNumber) - 1]); + public VCFGenotypeRecord(Map keyValues, List Alleles, GT_GENOTYPE phasing, char referenceBase) { + // validate + this.mReferenceBase = referenceBase; + this.mFields.putAll(keyValues); + this.mAlleleBases.addAll(Alleles); + this.phaseType = phasing; } /** @@ -104,14 +53,14 @@ public class VCFGenotypeRecord { * * @param phase the string that contains the phase character */ - private void determinePhase(String phase) { + static GT_GENOTYPE determinePhase(String phase) { // find the phasing information if (phase.equals("/")) - phaseType = GT_GENOTYPE.UNPHASED; + return GT_GENOTYPE.UNPHASED; else if (phase.equals("|")) - phaseType = GT_GENOTYPE.PHASED; + return GT_GENOTYPE.PHASED; else if (phase.equals("\\")) - phaseType = GT_GENOTYPE.PHASED_SWITCH_PROB; + return GT_GENOTYPE.PHASED_SWITCH_PROB; else throw new IllegalArgumentException("Unknown genotype phasing parameter"); } @@ -123,14 +72,14 @@ public class VCFGenotypeRecord { } public char getReference() { - return reference; + return mReferenceBase; } public List getAllele() { - return bases; + return mAlleleBases; } public Map getFields() { - return fields; + return mFields; } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index 0660e1d13..459bcd161 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -20,10 +20,13 @@ public class VCFReader implements Iterator, Iterable { // our next record private VCFRecord mNextRecord = null; - + // a pattern we use for detecting meta data and header lines private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$"); + // our pattern matching for the genotype mFields + private static final Pattern basicSplit = 
Pattern.compile("([0-9]*)([\\\\|\\/])([0-9]*):(\\S*)"); + /** * Create a VCF reader, given a VCF file * @@ -49,7 +52,7 @@ public class VCFReader implements Iterator, Iterable { line = mReader.readLine(); } mHeader = this.createHeader(lines); - mNextRecord = new VCFRecord(mHeader, line); + mNextRecord = (line == null) ? null : createRecord(line); // line already holds the first record read by the header loop; reading again would skip it } catch (IOException e) { throw new StingException("VCFReader: Failed to parse VCF File on line: " + line, e); } @@ -71,7 +74,7 @@ public class VCFReader implements Iterator, Iterable { try { String line = mReader.readLine(); if (line == null) mNextRecord = null; - else mNextRecord = new VCFRecord(mHeader, line); + else mNextRecord = createRecord(line); } catch (IOException e) { mNextRecord = null; } @@ -91,7 +94,7 @@ public class VCFReader implements Iterator, Iterable { */ protected VCFHeader createHeader(List headerStrings) { - Map metaData = new HashMap(); + Map metaData = new HashMap(); Set headerFields = new LinkedHashSet(); List auxTags = new ArrayList(); // iterate over all the passed in strings @@ -112,12 +115,13 @@ public class VCFReader implements Iterator, Iterable { if (str.startsWith("#") && !str.startsWith("##")) { String[] strings = str.substring(1).split("\\s+"); for (String s : strings) { - if (headerFields.contains(s)) throw new StingException("VCFReader: Header field duplication is not allowed"); + if (headerFields.contains(s)) + throw new StingException("VCFReader: Header field duplication is not allowed"); try { headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s)); } catch (IllegalArgumentException e) { - if (!s.equals("FORMAT")) - auxTags.add(s); + if (!s.equals("FORMAT")) + auxTags.add(s); } } } @@ -126,13 +130,117 @@ public class VCFReader implements Iterator, Iterable { throw new StingException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size()) + " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields"); } - return new 
VCFHeader(headerFields,metaData,auxTags); + return new VCFHeader(headerFields, metaData, auxTags); } /** + * create the next VCFRecord, given the input line * - * @return get the header associated with this reader + * @param line the line from the file + * + * @return the VCFRecord */ + public VCFRecord createRecord(String line) { + // things we need to make a VCF record + Map values = new HashMap(); + String tokens[] = line.split("\\s+"); + + // check to ensure that the column count of tokens is right + if (tokens.length != mHeader.getColumnCount()) { + throw new StingException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length); // report the parsed token count; values is still empty here + } + + int index = 0; + for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) + values.put(field, tokens[index++]); + // if we have genotyping data, we try and extract the genotype fields + if (mHeader.hasGenotypingData()) { + String mFormatString = tokens[index]; + List genotypeRecords = new ArrayList(); + index++; + for (String str : mHeader.getGenotypeSamples()) { + genotypeRecords.add(getVCFGenotype(mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0))); + index++; + } + return new VCFRecord(mHeader,values,mFormatString,genotypeRecords); + } + return new VCFRecord(mHeader, values); + } + + /** + * generate a VCF genotype record, given it's format string, the genotype string, and allele info + * + * @param formatString the format string for this record, which contains the keys for the genotype parameters + * @param genotypeString contains the phasing information, allele information, and values for genotype parameters + * @param altAlleles the alternate allele string array, which we index into based on the field parameters + * @param referenceBase the reference base + */ + public VCFGenotypeRecord getVCFGenotype(String formatString, String genotypeString, String 
altAlleles[], char referenceBase) { + // check that the first format field is GT, which is required + String keys[] = formatString.split(":"); + List alleles = new ArrayList(); + if (keys.length < 1 || !keys[0].equals("GT")) // split(":") returns an empty array when the format string is only colons + throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)"); + + // find the values for each of the keys, of which the GT field should be the first + Matcher match = basicSplit.matcher(genotypeString); + if (!match.matches() || match.groupCount() < 3) + throw new IllegalArgumentException("Unable to match genotype string to expected regex"); + + // add the alternate base (which can be ref by specifying 0) + addAllele(match.group(1), altAlleles, referenceBase, alleles); + + VCFGenotypeRecord.GT_GENOTYPE phase = VCFGenotypeRecord.determinePhase(match.group(2)); + + // do we have a second alt allele? + if (match.group(3).length() > 0) { + addAllele(match.group(3), altAlleles, referenceBase, alleles); + } + + Map fields = new HashMap(); + // check to see what other records we have + if (match.groupCount() == 4) { + // make sure we'll have enough occurances + String tokens[] = match.group(4).split(":{1}"); // the {1} was required, since string.split does a greedy match of the specified regex, like :+ + int keyIndex = 1; + try { + for (String token : tokens) { + fields.put(keys[keyIndex], token); + keyIndex++; + } + } + // we catch the follow exception. What this generally means is that the format string specified less mFields then the genotype string contains + catch (ArrayIndexOutOfBoundsException e) { + throw new StingException("VCFGenotypeRecord: ArrayIndexOutOfBoundsException, most likely the field list was less then the genotype " + "" + + "values provided. 
Format String = " + formatString + ", genotype value string = " + genotypeString, e); + } + + // you're allowed to leave out mFields, if any field doesn't have a value fill it in + // String.split drops trailing empty tokens, so the loop above can stop short of keys.length + for (; keyIndex < keys.length; keyIndex++) { + if (!fields.containsKey(keys[keyIndex])) + fields.put(keys[keyIndex], ""); + } + } + return new VCFGenotypeRecord(fields, alleles, phase, referenceBase); + } + + /** + * add an alternate allele to the list of alleles we have for a VCF genotype record + * + * @param alleleNumber the allele number, as a string + * @param altAlleles the list of alternate alleles + * @param referenceBase the reference base + */ + private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List bases) { + if (Integer.valueOf(alleleNumber) == 0) + bases.add(String.valueOf(referenceBase)); + else + bases.add(altAlleles[Integer.valueOf(alleleNumber) - 1]); + } + + + /** @return get the header associated with this reader */ public VCFHeader getHeader() { return this.mHeader; } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java index e92089215..4fa4d0de4 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFRecord.java @@ -13,76 +13,60 @@ public class VCFRecord { private final Map mValues = new HashMap(); // our genotype sample fields - private final Map mGenotypeFields = new HashMap(); + private final List mGenotypeFields; // the format String, which specifies what each genotype can contain for values - private String formatString; + private final String mFormatString; + + // the associated header + private final VCFHeader mHeader; /** - * create a VCFRecord, given a VCF header and the the values in this field. 
THis is protected, so that the reader is - * the only accessing object + * given a VCF header, and the values for each of the columns, create a VCF record. * - * @param header the VCF header - * @param line the line to parse into individual fields + * @param header the VCF header + * @param columnValues a mapping of header strings to values + * @param formatString the format string for the genotype records + * @param genotypeRecords the genotype records */ - protected VCFRecord(VCFHeader header, String line) { - String tokens[] = line.split("\\s+"); - List values = new ArrayList(); - for (String str : tokens) values.add(str); - initialize(header, values); + public VCFRecord(VCFHeader header, Map columnValues, String formatString, List genotypeRecords) { + mHeader = header; + mValues.putAll(columnValues); + mFormatString = formatString; + mGenotypeFields = new ArrayList(); + mGenotypeFields.addAll(genotypeRecords); } /** - * given a VCF header, and the values for each of the columns, create a VCF record + * given a VCF header, and the values for each of the columns, create a VCF record. 
* - * @param header the VCF header - * @param values the values, as a list, for each of the columns + * @param header the VCF header + * @param columnValues a mapping of header strings to values */ - public VCFRecord(VCFHeader header, List values) { - initialize(header, values); + public VCFRecord(VCFHeader header, Map columnValues) { + mHeader = header; + mValues.putAll(columnValues); + mGenotypeFields = null; + mFormatString = null; } /** - * create the VCFRecord - * - * @param header the VCF header - * @param values the list of strings that make up the columns of the record + * do we have genotyping data + * @return true if we have genotyping data, false otherwise */ - private void initialize(VCFHeader header, List values) { - if (values.size() != header.getColumnCount()) { - throw new StingException("The input list doesn't contain enough fields, it should have " + header.getColumnCount() + " fields"); - } - int index = 0; - for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) { - mValues.put(field, values.get(index)); - index++; - } - if (header.hasGenotypingData()) { - formatString = values.get(index); - index++; - for (String str : header.getGenotypeSamples()) { - mGenotypeFields.put(str, values.get(index)); - index++; - } + public boolean hasGenotypeData() { + if (mGenotypeFields==null) { + return false; } + return true; } /** - * lookup a value, given it's column name - * - * @param key the column name, which is looked up in both the set columns and the auxillary columns - * - * @return a String representing the column values, or null if the field doesn't exist in this record + * get the format string + * @return the format sting, null if it doesn't exist */ - public String getValue(String key) { - try { - return mValues.get(VCFHeader.HEADER_FIELDS.valueOf(key)); - } catch (IllegalArgumentException e) { - if (this.mGenotypeFields.containsKey(key)) { - return mGenotypeFields.get(key); - } - return null; - } + public String getFormatString() { + 
return mFormatString; } /** @@ -98,7 +82,7 @@ public class VCFRecord { /** @return the string for the chromosome that this VCF record is associated with */ public String getChromosome() { - return this.mValues.get(VCFHeader.HEADER_FIELDS.CHROM); + return mValues.get(VCFHeader.HEADER_FIELDS.CHROM); } /** @return this VCF records position on the specified chromosome */ @@ -108,7 +92,7 @@ public class VCFRecord { /** @return the ID value for this record */ public String getID() { - return this.mValues.get(VCFHeader.HEADER_FIELDS.ID); + return mValues.get(VCFHeader.HEADER_FIELDS.ID); } /** @@ -118,7 +102,7 @@ public class VCFRecord { */ public char getReferenceBase() { // TODO: this field isn't validated correctly - return this.mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0); + return mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0); } /** @@ -127,10 +111,10 @@ public class VCFRecord { * @return an array of strings representing the alt alleles, or null if there are none */ public String[] getAlternateAlleles() { - if (this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) { + if (mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) { return null; } - return this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(","); + return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(","); } public boolean hasAlternateAllele() { @@ -139,7 +123,7 @@ public class VCFRecord { /** @return the phred-scaled quality score */ public int getQual() { - return Integer.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.QUAL)); + return Integer.valueOf(mValues.get(VCFHeader.HEADER_FIELDS.QUAL)); } /** @@ -148,10 +132,10 @@ public class VCFRecord { * @return an array of strings representing the filtering criteria, or null if none were applied */ public String[] getFilteringCodes() { - if (this.mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) { + if (mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) { return null; } - return 
this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";"); + return mValues.get(VCFHeader.HEADER_FIELDS.FILTER).split(";"); // filter codes live in the FILTER column, the same one tested above } public boolean hasFilteringCodes() { @@ -177,20 +161,22 @@ public class VCFRecord { /** @return the number of columnsof data we're storing */ public int getColumnCount() { - return this.mGenotypeFields.size() + this.mValues.size(); + return ((mGenotypeFields == null) ? 0 : mGenotypeFields.size()) + mValues.size(); // mGenotypeFields is null when the record was built without genotype data } /** * return the mapping of the format tags to the specified sample's values - * @param sampleName the sample name to get the genotyping tags for * @return a VCFGenotypeRecord */ - public VCFGenotypeRecord getVCFGenotypeRecord(String sampleName) { - if (!this.mGenotypeFields.containsKey(sampleName)) { - throw new IllegalArgumentException("Sample Name: " + sampleName + " doesn't exist in this VCF record"); - } - return new VCFGenotypeRecord(formatString,mGenotypeFields.get(sampleName),this.getAlternateAlleles(),this.getReferenceBase()); + public List getVCFGenotypeRecords() { + return this.mGenotypeFields; + } + /** @return a List of the sample names */ + public String[] getSampleNames() { + String ret[] = new String[mHeader.getGenotypeSamples().size()]; + mHeader.getGenotypeSamples().toArray(ret); + return ret; } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java index b37bb3929..df79f57e2 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.genotype.vcf; import java.io.File; +import java.util.Map; +import java.util.TreeMap; /** @@ -23,34 +25,83 @@ public class VCFValidator { * @param args the vcf file is the only parameter */ public static void main(String[] args) { - if (args.length != 1) { + boolean catchAll = false; + + if (args.length == 2 && args[0].equals("-A")) + catchAll = true; + else if 
(args.length == 1) + catchAll = false; + else { printUsage(); return; } - File vcfFile = new File(args[0]); + File vcfFile = new File(args[(catchAll) ? 1 : 0]); if (!vcfFile.exists()) { System.err.println("Specified VCF file doesn't exist, please check the input file\n"); printUsage(); return; } - int counter = 0; + // count hom many records we see + int recordCount = 0; + Map problems = new TreeMap(); + try { + // open up our reader VCFReader reader = new VCFReader(vcfFile); + while (reader.hasNext()) { - counter++; - reader.next(); + try { + recordCount++; + VCFRecord rec = reader.next(); + // if the header indicates we have genotyping data, try to extract it for all samples + if (reader.getHeader().hasGenotypingData()) { + for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) { + // just cycle through them, more checks go here + } + } + } catch (Exception e) { + if (catchAll) + problems.put(recordCount,e); + else { + validationFailed(e, recordCount); + return; + } + } } } catch (Exception e) { - System.err.println("VCF Validation failed, after parsing " + counter + " entries."); - System.err.println("The reason given was: " + e.getMessage()); + if (catchAll) + problems.put(new Integer(0),e); + else + validationFailed(e, recordCount); + } + System.err.println("Viewed " + recordCount + " VCF record entries."); + if (problems.size() > 0) { + System.err.println("Encountered " + problems.size() + " number of issues. 
(record zero indicates a header problem)"); + for (Integer e : problems.keySet()) { + System.err.println("\tProblem at record " + e + " : " + problems.get(e)); + } } - System.err.println("Viewed " + counter + " VCF record entries."); } + /** + * validation failed + * + * @param e the exception + * @param count the current record count + */ + public static void validationFailed(Exception e, int count) { + System.err.println("VCF Validation failed, after parsing " + count + " entries."); + System.err.println("The reason given was: " + e.getMessage()); + e.printStackTrace(); + } + + /** print the usage information for the VCF validator */ public static void printUsage() { System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")"); System.err.println("Usage:"); - System.err.println("vcfvalidator "); + System.err.println("vcfvalidator <-A> "); + System.err.println(""); + System.err.println("\t-A\tTell the validator to attempt to catch all the problems, and not stop at the first. Some may be too fatal to continue."); System.err.println(""); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index c848ece17..4b07aed88 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -51,6 +51,7 @@ public class VCFWriter { /** * output a record to the VCF file + * * @param record the record to output */ public void addRecord(VCFRecord record) { @@ -59,14 +60,19 @@ public class VCFWriter { " columns, when is should have " + mHeader.getColumnCount()); } StringBuilder builder = new StringBuilder(); + // first output the required fields in order boolean first = true; for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) { - if (first) { first = false; builder.append(record.getValue(field)); } - else builder.append("\t" + record.getValue(field)); + if (first) { + 
first = false; + builder.append(record.getValue(field)); + } else builder.append("\t" + record.getValue(field)); } - for (String auxTag : mHeader.getGenotypeSamples()) { - builder.append("\t" + record.getValue(auxTag)); + for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) { + builder.append("\t"); + for (String s : rec.getFields().keySet()) + builder.append(":" + rec.getFields().get(s)); } try { mWriter.write(builder.toString() + "\n"); @@ -75,9 +81,7 @@ public class VCFWriter { } } - /** - * attempt to close the VCF file - */ + /** attempt to close the VCF file */ public void close() { try { mWriter.close();