diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java index f75e2f6d9..7cc3392d2 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFHeader.java @@ -19,9 +19,6 @@ public class VCFHeader { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - // our header field ordering, as a linked hash set to guarantee ordering - private Set mHeaderFields = new LinkedHashSet(); - // the associated meta data private final Map mMetaData = new HashMap(); @@ -46,11 +43,9 @@ public class VCFHeader { /** * create a VCF header, given a list of meta data and auxillary tags * - * @param headerFields the required header fields, in order they're presented * @param metaData the meta data associated with this header */ - protected VCFHeader(Set headerFields, Map metaData) { - for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field); + protected VCFHeader(Map metaData) { for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key)); checkVCFVersion(); } @@ -58,18 +53,16 @@ public class VCFHeader { /** * create a VCF header, given a list of meta data and auxillary tags * - * @param headerFields the required header fields, in order they're presented * @param metaData the meta data associated with this header * @param genotypeSampleNames the genotype format field, and the sample names */ - protected VCFHeader(Set headerFields, Map metaData, List genotypeSampleNames) { - for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field); + protected VCFHeader(Map metaData, List genotypeSampleNames) { for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key)); for (String col : genotypeSampleNames) { if (!col.equals("FORMAT")) mGenotypeSampleNames.add(col); } - hasGenotypingData = true; + if (genotypeSampleNames.size() > 0) hasGenotypingData = true; checkVCFVersion(); } @@ -88,12 +81,16 @@ public class VCFHeader { } /** - * get the header fields in order they're presented in the input file + * get the header fields in order they're presented in the input file (which is now required to be + * the order presented in the spec). * * @return a set of the header fields, in order */ public Set getHeaderFields() { - return mHeaderFields; + Set fields = new LinkedHashSet(); + for (HEADER_FIELDS field : HEADER_FIELDS.values()) + fields.add(field); + return fields; } /** @@ -125,7 +122,7 @@ public class VCFHeader { /** @return the column count, */ public int getColumnCount() { - return mHeaderFields.size() + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); + return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0); } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java index 0349982ea..bf536d849 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFReader.java @@ -48,7 +48,7 @@ public class VCFReader implements Iterator, Iterable { lines.add(line); line = mReader.readLine(); } - mHeader = this.createHeader(lines); + mHeader = this.createHeader(lines); mNextRecord = createRecord(line, mHeader); } catch (IOException e) { throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e); @@ -126,7 +126,6 @@ public class VCFReader implements Iterator, Iterable { protected VCFHeader createHeader(List headerStrings) { Map metaData = new HashMap(); - Set headerFields = new LinkedHashSet(); List auxTags = new ArrayList(); // iterate over all the passed in strings for (String str : headerStrings) { @@ -142,32 +141,28 @@ public class VCFReader implements Iterator, Iterable { } // iterate over all the passed in strings - for (String str : headerStrings) { + for (String str : headerStrings) { // TODO: fix, we shouldn't loop over every line if (str.startsWith("#") && !str.startsWith("##")) { String[] strings = str.substring(1).split("\\s+"); - for (String s : strings) { - VCFHeader.HEADER_FIELDS field; + // the columns should be in order according to Richard Durbin + int arrayIndex = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { try { - field = VCFHeader.HEADER_FIELDS.valueOf(s); + if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) + throw new RuntimeException("VCFReader: we were expecting column name " + field + " but we saw " + strings[arrayIndex]); } catch (IllegalArgumentException e) { - throw new RuntimeException("VCFReader: Unknown column name \"" + s + "\", it does not match a known column header name."); - } - if (headerFields.contains(field)) - throw new RuntimeException("VCFReader: Header field duplication is not allowed"); - try { - headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s)); - } catch (IllegalArgumentException e) { - if (!s.equals("FORMAT")) - auxTags.add(s); + throw new RuntimeException("VCFReader: Unknown column name \"" + strings[arrayIndex] + "\", it does not match a known column header name."); } + arrayIndex++; + } + while (arrayIndex < strings.length) { + if (!strings[arrayIndex].equals("FORMAT")) + auxTags.add(strings[arrayIndex]); + arrayIndex++; } } } - if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) { - throw new RuntimeException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size()) - + " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields"); - } - return new VCFHeader(headerFields, metaData, auxTags); + return new VCFHeader(metaData, auxTags); } /** @@ -221,7 +216,7 @@ public class VCFReader implements Iterator, Iterable { Map tagToValue = new HashMap(); VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN; List bases = new ArrayList(); - + int addedCount = 0; String keyStrings[] = formatString.split(":"); for (String key : keyStrings) { String parse; @@ -236,17 +231,23 @@ public class VCFReader implements Iterator, Iterable { if (key.equals("GT")) { Matcher m = gtPattern.matcher(parse); if (!m.matches()) - throw new RuntimeException("Ubable to match GT genotype flag to it's regular expression"); + throw new RuntimeException("VCFReader: Unable to match GT genotype flag to it's expected pattern, the field was: " + parse); phase = VCFGenotypeRecord.determinePhase(m.group(2)); addAllele(m.group(1), altAlleles, referenceBase, bases); if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases); } tagToValue.put(key, parse); + addedCount++; if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1; genotypeString = genotypeString.substring(nextDivider + 1, genotypeString.length()); } - if (keyStrings.length != tagToValue.size() || genotypeString.length() > 0) - throw new RuntimeException("genotype value count doesn't match the key count"); + // catch some common errors, either there are too many field keys or there are two many field values + if (keyStrings.length != tagToValue.size()) + throw new RuntimeException("VCFReader: genotype value count doesn't match the key count (expected " + + keyStrings.length + " but saw " + tagToValue.size() + ")"); + else if (genotypeString.length() > 0) + throw new RuntimeException("VCFReader: genotype string contained additional unprocessed fields: " + genotypeString + + ". This most likely means that the format string is shorter then the value fields."); return new VCFGenotypeRecord(sampleName, tagToValue, bases, phase, referenceBase); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java index 30b875072..f5f14b04c 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFValidator.java @@ -2,6 +2,9 @@ package org.broadinstitute.sting.utils.genotype.vcf; import java.io.File; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Date; import java.util.Map; import java.util.TreeMap; @@ -24,7 +27,7 @@ public class VCFValidator { * and if no errors pop up in processing, well hey, looks good to us. * * @param args the vcf file is the only required parameter, with the optional -A indicating that errors - * should be held until the end of processing + * should be held until the end of processing */ public static void main(String[] args) { boolean catchAll = false; @@ -37,13 +40,14 @@ public class VCFValidator { printUsage(); return; } + printHeader(args[(catchAll) ? 1 : 0]); File vcfFile = new File(args[(catchAll) ? 1 : 0]); if (!vcfFile.exists()) { System.err.println("Specified VCF file doesn't exist, please check the input file\n"); printUsage(); return; } - // count hom many records we see + // count hom many records we've see int recordCount = 0; Map problems = new TreeMap(); @@ -53,24 +57,28 @@ public class VCFValidator { // the number of samples should be set in the header and consistant over all records final int sampleCount = reader.getHeader().getGenotypeSamples().size(); - while (reader.hasNext()) { + boolean keepGoing = true; + while (keepGoing) { try { - recordCount++; - VCFRecord rec = reader.next(); - // if the header indicates we have genotyping data, try to extract it for all samples - if (reader.getHeader().hasGenotypingData()) { - int sampleCounter = 0; - for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) { - sampleCounter++; - /** - * just cycle through the records right now; any additional checks for - * the records should go in this block. - **/ + recordCount++; + keepGoing = reader.hasNext(); + if (keepGoing) { + VCFRecord rec = reader.next(); + // if the header indicates we have genotyping data, try to extract it for all samples + if (reader.getHeader().hasGenotypingData()) { + int sampleCounter = 0; + for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) { + sampleCounter++; + /** + * just cycle through the records right now; any additional checks for + * the records should go in this block. + **/ + } + if (sampleCounter != sampleCount) + throw new RuntimeException("Record " + recordCount + " does not have the required number " + + "of records (" + sampleCounter + " in the record, " + sampleCount + " in the header)"); + } - if (sampleCounter != sampleCount) - throw new RuntimeException("Record " + recordCount + " does not have the required number " + - "of records (" + sampleCounter + " in the record, " + sampleCount + " in the header)"); - } } catch (Exception e) { if (catchAll) @@ -82,9 +90,10 @@ public class VCFValidator { } } } catch (Exception e) { - if (catchAll) + if (catchAll) { problems.put(new Integer(0), e); - else + e.printStackTrace(); + } else validationFailed(e, recordCount); } System.err.println("Viewed " + recordCount + " VCF record entries."); @@ -108,9 +117,7 @@ public class VCFValidator { e.printStackTrace(); } - /** - * print the usage information for the VCF validator - */ + /** print the usage information for the VCF validator */ public static void printUsage() { System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")"); System.err.println("Usage:"); @@ -121,4 +128,16 @@ public class VCFValidator { System.err.println(""); } + public static void printHeader(String file) { + System.err.println("-------------------------------------------"); + System.err.println("VCF Validator v1.0\n"); + System.err.println("Run on file " + file + " at " + getDateTime()); + System.err.println("-------------------------------------------"); + } + + private static String getDateTime() { + DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); + Date date = new Date(); + return dateFormat.format(date); + } } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java index fb94d085c..06b36a472 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFHeaderTest.java @@ -26,16 +26,13 @@ public class VCFHeaderTest extends BaseTest { */ @Test public void testHeaderConstructor() { - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - headerFields.add(field); - } metaData.put("format","VCRv3.2"); metaData.put("two","2"); additionalColumns.add("extra1"); additionalColumns.add("extra2"); // this should create a header that is valid - VCFHeader header = new VCFHeader(headerFields, metaData, additionalColumns); + VCFHeader header = new VCFHeader(metaData, additionalColumns); // check the fields int index = 0; diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java index 37059a38b..f6b3921e4 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFReaderTest.java @@ -16,6 +16,11 @@ public class VCFReaderTest extends BaseTest { @Test public void testVCFInput() { + try { + Thread.sleep(5000); + } catch (InterruptedException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } VCFReader reader = new VCFReader(vcfFile); int counter = 0; while (reader.hasNext()) { diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java index 1ab7a8c4d..2241646ec 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterTest.java @@ -46,15 +46,12 @@ public class VCFWriterTest extends BaseTest { * @return a fake VCF header */ private VCFHeader createFakeHeader() { - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - headerFields.add(field); - } metaData.put("format", "VCRv3.2"); // required metaData.put("two", "2"); additionalColumns.add("FORMAT"); additionalColumns.add("extra1"); additionalColumns.add("extra2"); - return new VCFHeader(headerFields, metaData, additionalColumns); + return new VCFHeader(metaData, additionalColumns); } /**