My interpretation of the VCF spec is that the FORMAT field should only be present if there is genotype/sample data, so the VCFCodec now throws an exception when it encounters a header that declares the FORMAT column but no sample columns. I had to fix one of the integration test VCFs that violated this.

2011-09-21 22:23:28 -04:00 · 2011-09-21 22:23:28 -04:00 · 8f8b59a932
parent e53cb79d42
commit 8f8b59a932
3 changed files with 32 additions and 29 deletions
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@ -115,15 +115,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
                    }
                    arrayIndex++;
                }
+
+                boolean sawFormatTag = false;
                if ( arrayIndex < strings.length ) {
                    if ( !strings[arrayIndex].equals("FORMAT") )
                        throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
+                    sawFormatTag = true;
                    arrayIndex++;
                }

-                while (arrayIndex < strings.length)
+                while ( arrayIndex < strings.length )
                    auxTags.add(strings[arrayIndex++]);

+                if ( sawFormatTag && auxTags.size() == 0 )
+                    throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
+
            } else {
                if ( str.startsWith("##INFO=") ) {
                    VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
@ -200,28 +206,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
     * @return a VariantContext
     */
    public Feature decode(String line) {
-        return reallyDecode(line);
-    }
+        // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
+        if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;

-    private Feature reallyDecode(String line) {
-            // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
-            if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
+        // our header cannot be null, we need the genotype sample names and counts
+        if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");

-            // our header cannot be null, we need the genotype sample names and counts
-            if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
+        if (parts == null)
+            parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];

-            if (parts == null)
-                parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
+        int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);

-            int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
+        // if we don't have a header, or we have a header with no genotyping data, check that we have eight columns.  Otherwise check that we have nine (normal columns + genotyping data)
+        if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
+             (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
+            throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
+                    " tokens, and saw " + nParts + " )", lineNo);

-            // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns.  Otherwise check that we have nine (normal colummns + genotyping data)
-            if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
-                 (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
-                throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
-                        " tokens, and saw " + nParts + " )", lineNo);
-
-            return parseVCFLine(parts);
+        return parseVCFLine(parts);
    }

    protected void generateException(String message) {
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
@ -35,9 +35,6 @@ public class VCFHeader {
    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

-    /** do we have genotying data? */
-    private boolean hasGenotypingData = false;
-
    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

@ -57,17 +54,15 @@ public class VCFHeader {
     * create a VCF header, given a list of meta data and auxillary tags
     *
     * @param metaData            the meta data associated with this header
-     * @param genotypeSampleNames the genotype format field, and the sample names
+     * @param genotypeSampleNames the sample names
     */
    public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
        mMetaData = new TreeSet<VCFHeaderLine>();
        if ( metaData != null )
            mMetaData.addAll(metaData);
-        for (String col : genotypeSampleNames) {
-            if (!col.equals("FORMAT"))
-                mGenotypeSampleNames.add(col);
-        }
-        if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
+
+        mGenotypeSampleNames.addAll(genotypeSampleNames);
+
        loadVCFVersion();
        loadMetaDataMaps();

@ -157,7 +152,7 @@ public class VCFHeader {
     * @return true if we have genotyping columns, false otherwise
     */
    public boolean hasGenotypingData() {
-        return hasGenotypingData;
+        return mGenotypeSampleNames.size() > 0;
    }

    /**
@ -171,7 +166,7 @@ public class VCFHeader {

    /** @return the column count */
    public int getColumnCount() {
-        return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
+        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@ -174,6 +174,12 @@ public class UserException extends ReviewedStingException {
        }
    }

+    public static class MalformedVCFHeader extends UserException {
+        public MalformedVCFHeader(String message) {
+            super(String.format("The provided VCF file has a malformed header: %s", message));
+        }
+    }
+
    public static class ReadMissingReadGroup extends MalformedBAM {
        public ReadMissingReadGroup(SAMRecord read) {
            super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK.  Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));