From 8f8b59a932923d29fdeb91a7ef90d1eef24ca1fd Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Wed, 21 Sep 2011 22:23:28 -0400
Subject: [PATCH] My interpretation of the VCF spec is that the FORMAT field
 should only be present if there is genotype/sample data. So the VCFCodec now
 throws an exception when it encounters such a case.  I had to fix one of the
 integration test VCFs.

---
 .../utils/codecs/vcf/AbstractVCFCodec.java    | 38 ++++++++++---------
 .../sting/utils/codecs/vcf/VCFHeader.java     | 17 +++------
 .../sting/utils/exceptions/UserException.java |  6 +++
 3 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index 83c7083d0..43b07476d 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -115,15 +115,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
                     }
                     arrayIndex++;
                 }
+
+                boolean sawFormatTag = false;
                 if ( arrayIndex < strings.length ) {
                     if ( !strings[arrayIndex].equals("FORMAT") )
                         throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
+                    sawFormatTag = true;
                     arrayIndex++;
                 }
 
-                while (arrayIndex < strings.length)
+                while ( arrayIndex < strings.length )
                     auxTags.add(strings[arrayIndex++]);
 
+                if ( sawFormatTag && auxTags.size() == 0 )
+                    throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
+
             } else {
                 if ( str.startsWith("##INFO=") ) {
                     VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
@@ -200,28 +206,24 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
      * @return a VariantContext
      */
     public Feature decode(String line) {
-        return reallyDecode(line);
-    }
+        // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
+        if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
 
-    private Feature reallyDecode(String line) {
-            // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
-            if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
+        // our header cannot be null, we need the genotype sample names and counts
+        if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
 
-            // our header cannot be null, we need the genotype sample names and counts
-            if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
+        if (parts == null)
+            parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
 
-            if (parts == null)
-                parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
+        int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
 
-            int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
+        // the header is non-null here: without genotyping data we require exactly the eight standard columns, otherwise nine (standard columns + genotyping data)
+        final int expectedParts = header.hasGenotypingData() ? NUM_STANDARD_FIELDS + 1 : NUM_STANDARD_FIELDS;
+        if ( nParts != expectedParts )
+            throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + expectedParts +
+                    " tokens, and saw " + nParts + " )", lineNo);
 
-            // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns.  Otherwise check that we have nine (normal colummns + genotyping data)
-            if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
-                 (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
-                throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
-                        " tokens, and saw " + nParts + " )", lineNo);
-
-            return parseVCFLine(parts);
+        return parseVCFLine(parts);
     }
 
     protected void generateException(String message) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
index fd1c74993..66e11bc1e 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
@@ -35,9 +35,6 @@ public class VCFHeader {
     // the header string indicator
     public static final String HEADER_INDICATOR = "#";
 
-    /** do we have genotying data? */
-    private boolean hasGenotypingData = false;
-
     // were the input samples sorted originally (or are we sorting them)?
     private boolean samplesWereAlreadySorted = true;
 
@@ -57,17 +54,15 @@ public class VCFHeader {
      * create a VCF header, given a list of meta data and auxillary tags
      *
      * @param metaData            the meta data associated with this header
-     * @param genotypeSampleNames the genotype format field, and the sample names
+     * @param genotypeSampleNames the sample names
      */
     public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
         mMetaData = new TreeSet<VCFHeaderLine>();
         if ( metaData != null )
             mMetaData.addAll(metaData);
-        for (String col : genotypeSampleNames) {
-            if (!col.equals("FORMAT"))
-                mGenotypeSampleNames.add(col);
-        }
-        if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
+
+        mGenotypeSampleNames.addAll(genotypeSampleNames);
+
         loadVCFVersion();
         loadMetaDataMaps();
 
@@ -157,7 +152,7 @@ public class VCFHeader {
      * @return true if we have genotyping columns, false otherwise
      */
     public boolean hasGenotypingData() {
-        return hasGenotypingData;
+        return !mGenotypeSampleNames.isEmpty();
     }
 
     /**
@@ -171,7 +166,7 @@ public class VCFHeader {
 
     /** @return the column count */
     public int getColumnCount() {
-        return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
+        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); // fixed header fields, plus one column per sample and one more (presumably the FORMAT column -- confirm) when samples exist
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
index 274c64f42..70f7387f4 100755
--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@@ -174,6 +174,12 @@ public class UserException extends ReviewedStingException {
         }
     }
 
+    public static class MalformedVCFHeader extends UserException { // user-facing error for a VCF file whose header is malformed
+        public MalformedVCFHeader(String message) { // message: the specific header problem, appended after a fixed prefix
+            super(String.format("The provided VCF file has a malformed header: %s", message));
+        }
+    }
+
     public static class ReadMissingReadGroup extends MalformedBAM {
         public ReadMissingReadGroup(SAMRecord read) {
             super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK.  Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));