Phase I commit to get shadowBCFs passing tests

-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header.  This helps avoid some of the low-level errors I saw in SelectVariants.  This behavior can be disabled in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP (an INFO field) as a FORMAT field in the header, and likewise for the original chromosome-count fields AC_Orig, AF_Orig, and AN_Orig
-- Performance optimizations in BCF2 codec and writer
    -- using arrays not lists for intermediate data structures
    -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
    -- Warn about, and fix on the fly, Flag-type header lines declared with counts > 0 (Flag fields must have a count of 0)
    -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow.  Duplicates are detected once at header creation.
    -- Explicitly track FILTER fields for efficient lookup in their own hashmap
    -- Automatically add PL field when we see a GL field and no PL field
    -- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing.  Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 significant digit when comparing doubles between BCF and VCF
This commit is contained in:
Mark DePristo 2012-06-15 14:25:00 -04:00
parent ab53220635
commit 9c81f45c9f
25 changed files with 344 additions and 133 deletions

View File

@ -347,6 +347,9 @@ public class GATKArgumentCollection {
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="allowMissingVCFHeaders",shortName = "allowMissingVCFHeaders",doc="If provided, the GATK will write out VCF files that contain INFO, FILTER, and FORMAT fields not found in the VCF header",required=false)
public boolean allowMissingVCFHeaders = false;
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -74,7 +74,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = VariantContextWriterFactory.create(stream, stub.getMasterSequenceDictionary(), stub.getWriterOptions(false));
writer = VariantContextWriterFactory.create(stream,
stub.getMasterSequenceDictionary(), stub.getWriterOptions(false));
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");

View File

@ -183,6 +183,7 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
List<Options> options = new ArrayList<Options>();
if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES);
if ( engine.getArguments().allowMissingVCFHeaders ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);

View File

@ -68,9 +68,10 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa
}
public static final String[] keyNames = {STR_PRESENT, REPEAT_UNIT_KEY,REPEATS_PER_ALLELE_KEY };
public static final VCFInfoHeaderLine[] descriptions = { new VCFInfoHeaderLine(STR_PRESENT, 1, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
public static final VCFInfoHeaderLine[] descriptions = {
new VCFInfoHeaderLine(STR_PRESENT, 0, VCFHeaderLineType.Flag, "Variant is a short tandem repeat"),
new VCFInfoHeaderLine(REPEAT_UNIT_KEY, 1, VCFHeaderLineType.String, "Tandem repeat unit (bases)"),
new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
new VCFInfoHeaderLine(REPEATS_PER_ALLELE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of times tandem repeat unit is repeated, for each allele (including reference)") };
public List<String> getKeyNames() {
return Arrays.asList(keyNames);

View File

@ -203,8 +203,9 @@ public class VariantAnnotatorEngine {
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
Map<String, Object> annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc);
if ( annotationsFromCurrentType != null )
if ( annotationsFromCurrentType != null ) {
infoAnnotations.putAll(annotationsFromCurrentType);
}
}
// generate a new annotated VC

View File

@ -198,8 +198,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
private Set<String> sampleNamesForStratification = new TreeSet<String>();
private boolean isSubsettingSamples;
private Set<String> sampleNamesForEvaluation = new LinkedHashSet<String>();
private Set<String> sampleNamesForStratification = new LinkedHashSet<String>();
// important stratifications
private boolean byFilterIsEnabled = false;
@ -249,8 +250,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evals);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
// Load the sample list
sampleNamesForEvaluation.addAll(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS));
// Load the sample list, using an intermediate tree set to sort the samples
final Set<String> allSampleNames = SampleUtils.getSamplesFromCommandLineInput(vcfSamples);
sampleNamesForEvaluation.addAll(new TreeSet<String>(SampleUtils.getSamplesFromCommandLineInput(vcfSamples, SAMPLE_EXPRESSIONS)));
isSubsettingSamples = ! sampleNamesForEvaluation.containsAll(allSampleNames);
if (Arrays.asList(STRATIFICATIONS_TO_USE).contains("Sample")) {
sampleNamesForStratification.addAll(sampleNamesForEvaluation);
@ -571,6 +574,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public List<RodBinding<VariantContext>> getEvals() { return evals; }
public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; }
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }

View File

@ -28,8 +28,6 @@ import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.StandardEval;
import org.broadinstitute.sting.gatk.walkers.varianteval.evaluators.VariantEvaluator;
@ -37,13 +35,13 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.Require
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.StandardStratification;
import org.broadinstitute.sting.gatk.walkers.varianteval.stratifications.VariantStratifier;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.lang.reflect.Field;
import java.util.*;
public class VariantEvalUtils {
@ -199,18 +197,32 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
VariantContext vcsub = vc.subContextFromSamples(sampleNames, false);
VariantContextBuilder builder = new VariantContextBuilder(vcsub);
return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false));
}
public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) {
final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount();
final int newAlleleCount = vcsub.getHetCount() + 2 * vcsub.getHomVarCount();
final boolean isSingleton = originalAlleleCount == newAlleleCount && newAlleleCount == 1;
final boolean hasChrCountAnnotations = vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) &&
vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) &&
vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY);
if (originalAlleleCount == newAlleleCount && newAlleleCount == 1) {
builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true);
if ( ! isSingleton && hasChrCountAnnotations ) {
// nothing to update
return vc;
} else {
// have to do the work
VariantContextBuilder builder = new VariantContextBuilder(vc);
if ( isSingleton )
builder.attribute(VariantEvalWalker.IS_SINGLETON_KEY, true);
if ( ! hasChrCountAnnotations )
VariantContextUtils.calculateChromosomeCounts(builder, true);
return builder.make();
}
VariantContextUtils.calculateChromosomeCounts(builder, true);
return builder.make();
}
/**
@ -250,8 +262,11 @@ public class VariantEvalUtils {
// First, filter the VariantContext to represent only the samples for evaluation
VariantContext vcsub = vc;
if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) {
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
if (subsetBySample && vc.hasGenotypes()) {
if ( variantEvalWalker.isSubsettingToSpecificSamples() )
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
else
vcsub = ensureAnnotations(vc, vc);
}
if ((byFilter || !vcsub.isFiltered())) {

View File

@ -427,12 +427,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
if (KEEP_ORIGINAL_CHR_COUNTS) {
headerLines.add(new VCFFormatHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
headerLines.add(new VCFFormatHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
headerLines.add(new VCFFormatHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
headerLines.add(new VCFInfoHeaderLine("AC_Orig", 1, VCFHeaderLineType.Integer, "Original AC"));
headerLines.add(new VCFInfoHeaderLine("AF_Orig", 1, VCFHeaderLineType.Float, "Original AF"));
headerLines.add(new VCFInfoHeaderLine("AN_Orig", 1, VCFHeaderLineType.Integer, "Original AN"));
}
headerLines.addAll(Arrays.asList(ChromosomeCounts.descriptions));
headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage"));
headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Depth of coverage"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
for (int i = 0; i < SELECT_EXPRESSIONS.size(); i++) {

View File

@ -150,8 +150,7 @@ public class SampleUtils {
// iterate to get all of the sample names
for ( Map.Entry<String, VCFHeader> pair : VCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) {
Set<String> vcfSamples = pair.getValue().getGenotypeSamples();
for ( String sample : vcfSamples )
for ( String sample : pair.getValue().getGenotypeSamples() )
addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey());
}
}

View File

@ -79,6 +79,14 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
/**
* A cached array of GenotypeBuilders for efficient genotype decoding.
*
* Caching it allows us to avoid recreating this intermediate data
* structure each time we decode genotypes
*/
private GenotypeBuilder[] builders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
@ -168,6 +176,13 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// create and initialize the genotype builder array
final int nSamples = header.getNGenotypeSamples();
builders = new GenotypeBuilder[nSamples];
for ( int i = 0; i < nSamples; i++ ) {
builders[i] = new GenotypeBuilder(header.getGenotypeSamples().get(i));
}
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@ -256,6 +271,11 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x00FFFFF;
if ( header.getNGenotypeSamples() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + header.getNGenotypeSamples() +
" samples in header but have a record with " + nSamples + " samples");
decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder);
@ -416,11 +436,11 @@ public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceD
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
nGenotypes);
header.getNGenotypeSamples());
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )

View File

@ -99,21 +99,21 @@ public class BCF2GenotypeFieldDecoders {
*/
public interface Decoder {
@Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
"field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
"field != null", "decoder != null", "gbs != null", "gbs.length != 0"})
public void decode(final List<Allele> siteAlleles,
final String field,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs);
final GenotypeBuilder[] gbs);
}
private class GTDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
// we have to do a bit of low-level processing here as we want to know the size upfronta
final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.length >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
else {
generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
@ -137,7 +137,7 @@ public class BCF2GenotypeFieldDecoders {
private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final GenotypeBuilder[] gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
final int nPossibleGenotypes = 3 * 3;
@ -176,7 +176,7 @@ public class BCF2GenotypeFieldDecoders {
final int ploidy,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs) {
final GenotypeBuilder[] gbs) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
// a single cache for the encoded genotypes, since we don't actually need this vector
@ -213,7 +213,7 @@ public class BCF2GenotypeFieldDecoders {
private class DPDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
// the -1 is for missing
gb.DP(decoder.decodeInt(typeDescriptor, -1));
@ -223,7 +223,7 @@ public class BCF2GenotypeFieldDecoders {
private class GQDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
// the -1 is for missing
gb.GQ(decoder.decodeInt(typeDescriptor, -1));
@ -233,7 +233,7 @@ public class BCF2GenotypeFieldDecoders {
private class ADDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
gb.AD(decoder.decodeIntArray(typeDescriptor));
}
@ -242,7 +242,7 @@ public class BCF2GenotypeFieldDecoders {
private class PLDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
gb.PL(decoder.decodeIntArray(typeDescriptor));
}
@ -251,7 +251,7 @@ public class BCF2GenotypeFieldDecoders {
private class GenericDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor);
if ( value != null ) { // don't add missing values
@ -270,7 +270,7 @@ public class BCF2GenotypeFieldDecoders {
private class FTDecoder implements Decoder {
@Override
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final GenotypeBuilder[] gbs) {
for ( final GenotypeBuilder gb : gbs ) {
Object value = decoder.decodeTypedValue(typeDescriptor);
if ( value != null ) { // don't add missing values

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -46,12 +47,16 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
private final ArrayList<Allele> siteAlleles;
private final int nSamples;
private final int nFields;
private final GenotypeBuilder[] builders;
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
@Requires("codec.getHeader().getNGenotypeSamples() == builders.length")
BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples,
final int nFields, final GenotypeBuilder[] builders) {
this.codec = codec;
this.siteAlleles = alleles;
this.nSamples = nSamples;
this.nFields = nFields;
this.builders = builders;
}
@Override
@ -62,21 +67,8 @@ class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
// load our byte[] data into the decoder
final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);
// TODO -- fast path for sites only
// go ahead and decode everyone
final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
// create and initialize the genotypes array
final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
builders.add(new GenotypeBuilder(samples.get(i)));
}
for ( int i = 0; i < nSamples; i++ )
builders[i].reset(true);
for ( int i = 0; i < nFields; i++ ) {
// get the field name

View File

@ -344,7 +344,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// do we have genotyping data
if (parts.length > NUM_STANDARD_FIELDS) {
final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos);
final int nGenotypes = header.getGenotypeSamples().size();
final int nGenotypes = header.getNGenotypeSamples();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes);
// did we resort the sample names? If so, we need to load the genotype data

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -35,6 +36,8 @@ import java.util.Map;
* a base class for compound header lines, which include info lines and format lines (so far)
*/
public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine {
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
public enum SupportedHeaderLineType {
INFO(true), FORMAT(false);
@ -172,6 +175,11 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
if ( name == null || type == null || description == null || lineType == null )
throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
super.getKey(), name, type, description, lineType ));
if ( type == VCFHeaderLineType.Flag && count != 0 ) {
count = 0;
logger.warn("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". Changing it to 0 inside the code");
}
}
/**

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -54,11 +55,12 @@ public class VCFHeader {
private final Set<VCFHeaderLine> mMetaData = new TreeSet<VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
// the list of auxillary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
private final List<String> mGenotypeSampleNames = new ArrayList<String>();
// the character string that indicates meta data
public static final String METADATA_INDICATOR = "##";
@ -106,7 +108,15 @@ public class VCFHeader {
* @param genotypeSampleNames the sample names
*/
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
this(metaData, new ArrayList<String>(genotypeSampleNames));
}
public VCFHeader(Set<VCFHeaderLine> metaData, List<String> genotypeSampleNames) {
this(metaData);
if ( genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size() )
throw new ReviewedStingException("BUG: VCF header has duplicate sample names");
mGenotypeSampleNames.addAll(genotypeSampleNames);
samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
buildVCFReaderMaps(genotypeSampleNames);
@ -175,12 +185,23 @@ public class VCFHeader {
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
addMetaDataMapBinding(mFormatMetaData, formatLine);
} else if ( line instanceof VCFFilterHeaderLine ) {
VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line;
mFilterMetaData.put(filterLine.getID(), filterLine);
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
mOtherMetaData.put(line.getKey(), line);
}
}
if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
+ VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY + " field. As the GATK now only manages PL fields internally"
+ " automatically adding a corresponding PL field to your VCF header");
addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
loadMetaDataMaps();
}
}
/**
@ -239,7 +260,7 @@ public class VCFHeader {
*
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
*/
public Set<String> getGenotypeSamples() {
public List<String> getGenotypeSamples() {
return mGenotypeSampleNames;
}
@ -294,6 +315,26 @@ public class VCFHeader {
return mFormatMetaData.get(id);
}
/**
* @param id the header key name
* @return the meta data line, or null if there is none
*/
public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
return mFilterMetaData.get(id);
}
public boolean hasInfoLine(final String id) {
return getInfoHeaderLine(id) != null;
}
public boolean hasFormatLine(final String id) {
return getFormatHeaderLine(id) != null;
}
public boolean hasFilterLine(final String id) {
return getFilterHeaderLine(id) != null;
}
/**
* @param key the header key name
* @return the meta data line, or null if there is none

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -49,6 +50,7 @@ import java.util.*;
* @author Mark DePristo
* @since 06/12
*/
@Invariant({"alleles != null"})
public final class GenotypeBuilder {
public static boolean MAKE_FAST_BY_DEFAULT = true;
@ -154,9 +156,9 @@ public final class GenotypeBuilder {
* function you must provide sampleName and alleles before trying to
* make more Genotypes.
*/
public final void reset() {
sampleName = null;
alleles = null;
public final void reset(final boolean keepSampleName) {
if ( ! keepSampleName ) sampleName = null;
alleles = Collections.emptyList();
isPhased = false;
GQ = -1;
DP = -1;

View File

@ -119,10 +119,6 @@ public class VariantContextUtils {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
}
else {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, 0);
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, 0.0);
}
}
return attributes;

View File

@ -260,7 +260,7 @@ public abstract class BCF2FieldEncoder {
@Requires("isDynamicallyTyped()")
@Ensures("result != null")
public BCF2Type getDynamicType(final Object value) {
throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field");
throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field " + getField());
}
// ----------------------------------------------------------------------
@ -367,7 +367,7 @@ public abstract class BCF2FieldEncoder {
public Flag(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
super(headerLine, dict, BCF2Type.INT8);
if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 )
throw new ReviewedStingException("Flag encoder only suppports atomic flags!");
throw new ReviewedStingException("Flag encoder only suppports atomic flags for field " + getField());
}
@Override

View File

@ -33,5 +33,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
public enum Options {
INDEX_ON_THE_FLY,
DO_NOT_WRITE_GENOTYPES,
ALLOW_MISSING_FIELDS_IN_HEADER,
FORCE_BCF
}

View File

@ -27,9 +27,9 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
@ -54,12 +54,17 @@ class VCFWriter extends IndexingVariantContextWriter {
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
final private boolean allowMissingFieldsInHeader;
private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes,
final boolean allowMissingFieldsInHeader ) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;
}
// --------------------------------------------------------------------------------
@ -222,6 +227,10 @@ class VCFWriter extends IndexingVariantContextWriter {
Map<String, String> infoFields = new TreeMap<String, String>();
for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
String key = field.getKey();
if ( ! mHeader.hasInfoLine(key) )
fieldIsMissingFromHeaderError(vc, key, "INFO");
String outputValue = formatVCFField(field.getValue());
if ( outputValue != null )
infoFields.put(key, outputValue);
@ -236,6 +245,10 @@ class VCFWriter extends IndexingVariantContextWriter {
} else {
List<String> genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader);
if ( ! genotypeAttributeKeys.isEmpty() ) {
for ( final String format : genotypeAttributeKeys )
if ( ! mHeader.hasFormatLine(format) )
fieldIsMissingFromHeaderError(vc, format, "FORMAT");
final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -270,12 +283,18 @@ class VCFWriter extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
public static final String getFilterString(final VariantContext vc) {
return getFilterString(vc, false);
}
private final String getFilterString(final VariantContext vc, boolean forcePASS) {
if ( vc.isFiltered() ) {
for ( final String filter : vc.getFilters() )
if ( ! mHeader.hasFilterLine(filter) )
fieldIsMissingFromHeaderError(vc, filter, "FILTER");
public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()));
}
else if ( forcePASS || vc.filtersWereApplied() )
return VCFConstants.PASSES_FILTERS_v4;
else
return VCFConstants.UNFILTERED;
}
private static final String QUAL_FORMAT_STRING = "%.2f";
@ -330,13 +349,13 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
final List<String> badSampleNames = new ArrayList<String>();
for ( final Genotype g : vc.getGenotypes() )
if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
badSampleNames.add(g.getSampleName());
throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
}
// if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
// final List<String> badSampleNames = new ArrayList<String>();
// for ( final Genotype g : vc.getGenotypes() )
// if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
// badSampleNames.add(g.getSampleName());
// throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
// }
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -553,4 +572,13 @@ class VCFWriter extends IndexingVariantContextWriter {
}
return count;
}
private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) {
if ( !allowMissingFieldsInHeader)
throw new UserException.MalformedVCFHeader("Key " + id + " found in VariantContext field " + field
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have"
+ " complete VCF headers by default. This error can be disabled with the engine argument"
+ " --allowMissingVCFHeaders");
}
}

View File

@ -79,7 +79,8 @@ public class VariantContextWriterFactory {
else {
return new VCFWriter(location, output, refDict,
options.contains(Options.INDEX_ON_THE_FLY),
options.contains(Options.DO_NOT_WRITE_GENOTYPES));
options.contains(Options.DO_NOT_WRITE_GENOTYPES),
options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
}
}

View File

@ -277,7 +277,7 @@ public abstract class BaseTest {
Reporter.log(message, true);
}
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-4;
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1;
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) {
Assert.assertTrue(actual instanceof Double);

View File

@ -51,7 +51,7 @@ import java.text.SimpleDateFormat;
import java.util.*;
public class WalkerTest extends BaseTest {
private static final boolean GENERATE_SHADOW_BCF = false;
private static final boolean GENERATE_SHADOW_BCF = true;
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import java.util.Arrays;
@ -15,9 +16,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders --allowMissingVCFHeaders",
1,
Arrays.asList("133fd0ded0bb213097cbe68995afbb7e")
Arrays.asList("18ca4d3e69874c5e55377ca2ad7c6014")
);
spec.disableShadowBCF();
@ -31,7 +34,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn B -sn C --variant " + testfile),
1,
Arrays.asList("b2ee12588ebda200727762a903b8c972")
Arrays.asList("ddb082f5f15944b7545b45c20440a474")
);
executeTest("testRepeatedLineSelection--" + testfile, spec);
@ -42,9 +45,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant "
+ b37hapmapGenotypes + " -disc " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
1,
Arrays.asList("f64c90c4cca470f1095d9fa2062eac3e")
Arrays.asList("2e0eaed94e2ab14349b967dc89a09823")
);
spec.disableShadowBCF();
@ -57,9 +62,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
executeTest("testComplexSelection--" + testfile, spec);
@ -71,9 +76,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("b24f31db48d254d8fe15295955173486")
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("338bfc635667793b89c349446982a36d")
);
spec.disableShadowBCF();
@ -86,9 +91,11 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s --no_cmdline_in_header",
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc "
+ b37hapmapGenotypes + " --variant " + testFile
+ " -o %s --no_cmdline_in_header --allowMissingVCFHeaders",
1,
Arrays.asList("9da5dab3d344c1c0a5987b15e60fa082")
Arrays.asList("7d2c7deeacf307d140a85a34629c6539")
);
spec.disableShadowBCF();
@ -102,7 +109,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("30b89b3a6706f7f46b23bfb3be69cc8e")
Arrays.asList("f0509f4c2c6c9eff3bec8171f00762b3")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
@ -115,7 +122,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("8bf557aaa07eccb294c81f491225bf9e")
Arrays.asList("9162a67ccb4201c0542f30d14967f2d5")
);
executeTest("testUsingDbsnpName--" + testFile, spec);
@ -141,7 +148,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("cb9932f9a7aa2e53af605b30d88ad43f")
Arrays.asList("7dabe6910a40a9db5006c6af66b4617c")
);
executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec);
@ -154,7 +161,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("920605cc2182026e3f54c009f6a04141")
Arrays.asList("034965918283d5597bb443e65cd20cf8")
);
executeTest("testNoGTs--" + testFile, spec);
@ -167,9 +174,9 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
executeTest("testParallelization (2 threads)--" + testfile, spec);
@ -177,13 +184,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
@Test(enabled = false)
public void testParallelization4() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
WalkerTestSpec spec;
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("88c142737080847e44dc8e8c982cfc74")
);
spec.disableShadowBCF();
@ -197,8 +204,32 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("2f2a342812ba914bcce666e42ef761d7")
Arrays.asList("3cbabcd256070ea113ddc04fef3b6900")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}
@Test()
public void testFileWithoutInfoLineInHeader() {
testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeader", UserException.class);
}
@Test()
public void testFileWithoutInfoLineInHeaderWithOverride() {
testFileWithoutInfoLineInHeader("testFileWithoutInfoLineInHeaderWithOverride", null);
}
private void testFileWithoutInfoLineInHeader(final String name, final Class expectedException) {
final String testFile = testDir + "missingHeaderLine.vcf";
final String cmd = "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp "
+ testFile + " -o %s --no_cmdline_in_header"
+ (expectedException == null ? " -allowMissingVCFHeaders" : "");
WalkerTestSpec spec =
expectedException != null
? new WalkerTestSpec(cmd, 1, expectedException)
: new WalkerTestSpec(cmd, 1, Arrays.asList(""));
spec.disableShadowBCF();
executeTest(name, spec);
}
}

View File

@ -142,16 +142,16 @@ public class VariantContextTestProvider {
if ( ENABLE_SOURCE_VCF_TESTS ) {
for ( final File file : testSourceVCFs ) {
VCFCodec codec = new VCFCodec();
Pair<VCFHeader, List<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>(x.getSecond().size());
int i = 0;
Pair<VCFHeader, Iterable<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>();
logger.warn("Reading records from " + file);
for ( final VariantContext raw : x.getSecond() ) {
fullyDecoded.add(raw.fullyDecode(x.getFirst()));
logger.warn("\t" + i++);
}
logger.warn("Done reading " + file);
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), x.getSecond()));
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), fullyDecoded));
}
}
}
@ -326,6 +326,10 @@ public class VariantContextTestProvider {
final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1));
addGenotypeTests(site, homRef, het);
addGenotypeTests(site, homRef, het, homVar);
// test no GT at all
addGenotypeTests(site);
final List<Allele> noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
// ploidy
@ -511,7 +515,7 @@ public class VariantContextTestProvider {
writer.add(vc);
writer.close();
final List<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
final Iterable<VariantContext> actual = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
assertEquals(actual, expected);
}
@ -524,7 +528,7 @@ public class VariantContextTestProvider {
* @return
* @throws IOException
*/
private final static Pair<VCFHeader, List<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
private final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
// read in the features
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
@ -533,27 +537,79 @@ public class VariantContextTestProvider {
pbs = new PositionalBufferedStream(new FileInputStream(source));
pbs.skip(header.getHeaderEnd());
final List<VariantContext> read = new ArrayList<VariantContext>();
while ( ! pbs.isDone() ) {
final VariantContext vc = codec.decode(pbs);
if ( vc != null ) read.add(vc.fullyDecode((VCFHeader)header.getHeaderValue()));
};
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
return new Pair<VCFHeader, Iterable<VariantContext>>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
}
return new Pair<VCFHeader, List<VariantContext>>((VCFHeader)header.getHeaderValue(), read);
private static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
final PositionalBufferedStream pbs;
final FeatureCodec<VariantContext> codec;
final VCFHeader header;
private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec<VariantContext> codec, final VCFHeader header) {
this.pbs = pbs;
this.codec = codec;
this.header = header;
}
@Override
public Iterator<VariantContext> iterator() {
return this;
}
@Override
public boolean hasNext() {
try {
return ! pbs.isDone();
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
@Override
public VariantContext next() {
try {
final VariantContext vc = codec.decode(pbs);
return vc == null ? null : vc.fullyDecode(header);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
@Override
public void remove() {
//To change body of implemented methods use File | Settings | File Templates.
}
}
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
final Pair<VCFHeader, List<VariantContext>> vcfData = readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, List<VariantContext>> bcfData = readAllVCs(bcfFile, new BCF2Codec());
final Pair<VCFHeader, Iterable<VariantContext>> vcfData = readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, Iterable<VariantContext>> bcfData = readAllVCs(bcfFile, new BCF2Codec());
assertEquals(bcfData.getFirst(), vcfData.getFirst());
assertEquals(bcfData.getSecond(), vcfData.getSecond());
}
public static void assertEquals(final List<VariantContext> actual, final List<VariantContext> expected) {
Assert.assertEquals(actual.size(), expected.size());
public static void assertEquals(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
final Iterator<VariantContext> actualIT = actual.iterator();
final Iterator<VariantContext> expectedIT = expected.iterator();
for ( int i = 0; i < expected.size(); i++ )
assertEquals(actual.get(i), expected.get(i));
while ( expectedIT.hasNext() ) {
final VariantContext expectedVC = expectedIT.next();
if ( expectedVC == null )
continue;
VariantContext actualVC;
do {
Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
actualVC = actualIT.next();
} while ( actualIT.hasNext() && actualVC == null );
if ( actualVC == null )
Assert.fail("Too few records in actual");
assertEquals(actualVC, expectedVC);
}
Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
/**
@ -730,4 +786,14 @@ public class VariantContextTestProvider {
Assert.assertEquals(actualLines.get(i), expectedLines.get(i));
}
}
public static void main( String argv[] ) {
final File variants1 = new File(argv[0]);
final File variants2 = new File(argv[1]);
try {
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
}