BCF2 optimizations; parallel CombineVariants
-- BCF2 now determines whether it can safely write out raw genotype blocks, which is true in the case where the VCF header of the input is a complete, ordered subset of the output header. Added utilities to determine this and extensive unit tests (headerLinesAreOrderedConsistently) -- Cleanup collapseStringList and exploreStringList for new unit tests of BCF2Utils. Fixed bug in edge case that never occurred in practice -- VCFContigHeaderLine now provides its own key (VCFHeader.CONTIG_KEY) directly instead of requiring the user to provide it (and hoping it's right) -- More ways to access the data in VCFHeader -- BCF2Writer uses a cache to avoid recomputing unnecessarily whether raw genotype blocks can be emitted directly into the output -- Optimization of fullyDecodeAttributes -- attributes.size() is expensive and unnecessary. We just guess that on average we need ~10 elements for the attribute map -- CombineVariants optimization -- filters are now a HashSet but are sorted at the end by creating a TreeSet -- makeCombinations is now makePermutations, and you can request to create the permutations with or without replacement
This commit is contained in:
parent
dafa7e3885
commit
669c43031a
|
|
@ -67,6 +67,12 @@ import java.util.*;
|
|||
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
|
||||
* in the detailed example on the wiki.
|
||||
*
|
||||
* Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
|
||||
* when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
|
||||
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
|
||||
* efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
|
||||
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more variant sets to combine.
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import net.sf.samtools.util.StringUtil;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.net.InetAddress;
|
||||
|
|
@ -742,19 +741,23 @@ public class Utils {
|
|||
* @param objects
|
||||
* @param n
|
||||
* @param <T>
|
||||
* @param withReplacement if false, the resulting permutations will only contain unique objects from objects
|
||||
* @return
|
||||
*/
|
||||
public static <T> List<List<T>> makeCombinations(final List<T> objects, final int n) {
|
||||
public static <T> List<List<T>> makePermutations(final List<T> objects, final int n, final boolean withReplacement) {
|
||||
final List<List<T>> combinations = new ArrayList<List<T>>();
|
||||
|
||||
if ( n == 1 ) {
|
||||
if ( n <= 0 )
|
||||
;
|
||||
else if ( n == 1 ) {
|
||||
for ( final T o : objects )
|
||||
combinations.add(Collections.singletonList(o));
|
||||
} else {
|
||||
final List<List<T>> sub = makeCombinations(objects, n - 1);
|
||||
final List<List<T>> sub = makePermutations(objects, n - 1, withReplacement);
|
||||
for ( List<T> subI : sub ) {
|
||||
for ( final T a : objects ) {
|
||||
combinations.add(Utils.cons(a, subI));
|
||||
if ( withReplacement || ! subI.contains(a) )
|
||||
combinations.add(Utils.cons(a, subI));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -423,9 +423,8 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
final LazyGenotypesContext.LazyParser lazyParser =
|
||||
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
|
||||
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
|
||||
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
|
||||
header.getNGenotypeSamples());
|
||||
final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
|
||||
final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());
|
||||
|
||||
// did we resort the sample names? If so, we need to load the genotype data
|
||||
if ( !header.samplesWereAlreadySorted() )
|
||||
|
|
@ -436,11 +435,13 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
}
|
||||
|
||||
public static class LazyData {
|
||||
final public VCFHeader header;
|
||||
final public int nGenotypeFields;
|
||||
final public byte[] bytes;
|
||||
|
||||
@Requires({"nGenotypeFields > 0", "bytes != null"})
|
||||
public LazyData(final int nGenotypeFields, final byte[] bytes) {
|
||||
public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
|
||||
this.header = header;
|
||||
this.nGenotypeFields = nGenotypeFields;
|
||||
this.bytes = bytes;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ import java.util.*;
|
|||
* @author Mark DePristo
|
||||
* @since 5/12
|
||||
*/
|
||||
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);
|
||||
|
||||
// the essential information for us to use to decode the genotypes data
|
||||
|
|
|
|||
|
|
@ -131,17 +131,21 @@ public final class BCF2Utils {
|
|||
* @param strings size > 1 list of strings
|
||||
* @return
|
||||
*/
|
||||
@Requires({"strings != null", "strings.size() > 1"})
|
||||
@Requires({"strings != null"})
|
||||
@Ensures("result != null")
|
||||
public static String collapseStringList(final List<String> strings) {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
for ( final String s : strings ) {
|
||||
if ( s != null ) {
|
||||
assert s.indexOf(",") == -1; // no commas in individual strings
|
||||
b.append(",").append(s);
|
||||
if ( strings.isEmpty() ) return "";
|
||||
else if ( strings.size() == 1 ) return strings.get(0);
|
||||
else {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
for ( final String s : strings ) {
|
||||
if ( s != null ) {
|
||||
assert s.indexOf(",") == -1; // no commas in individual strings
|
||||
b.append(",").append(s);
|
||||
}
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -163,7 +167,7 @@ public final class BCF2Utils {
|
|||
|
||||
@Requires("s != null")
|
||||
public static boolean isCollapsedString(final String s) {
|
||||
return s.charAt(0) == ',';
|
||||
return s.length() > 0 && s.charAt(0) == ',';
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -280,4 +284,49 @@ public final class BCF2Utils {
|
|||
else if ( o instanceof List ) return (List<Object>)o;
|
||||
else return Collections.singletonList(o);
|
||||
}
|
||||
|
||||
/**
|
||||
* Are the elements and their order in the output and input headers consistent so that
|
||||
* we can write out the raw genotypes block without decoding and recoding it?
|
||||
*
|
||||
* If the order of INFO, FILTER, or contig elements in the output header is different than
|
||||
* in the input header we must decode the blocks using the input header and then recode them
|
||||
* based on the new output order.
|
||||
*
|
||||
* If they are consistent, we can simply pass through the raw genotypes block bytes, which is
|
||||
* a *huge* performance win for large blocks.
|
||||
*
|
||||
* Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
|
||||
* don't modify the ordering of the header fields and so can safely pass through the genotypes
|
||||
* undecoded. Some operations -- those that add filters or info fields -- can change the ordering
|
||||
* of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded
|
||||
*/
|
||||
public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
|
||||
// first, we have to have the same samples in the same order
|
||||
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
|
||||
return false;
|
||||
|
||||
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
|
||||
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
|
||||
|
||||
while ( inputLinesIt.hasNext() ) {
|
||||
if ( ! outputLinesIt.hasNext() ) // missing lines in output
|
||||
return false;
|
||||
|
||||
final VCFIDHeaderLine outputLine = outputLinesIt.next();
|
||||
final VCFIDHeaderLine inputLine = inputLinesIt.next();
|
||||
|
||||
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static <T> List<T> nullAsEmpty(List<T> l) {
|
||||
if ( l == null )
|
||||
return Collections.emptyList();
|
||||
else
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,8 +47,8 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
|
|||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
public VCFContigHeaderLine(final String key, final Map<String, String> mapping, int contigIndex) {
|
||||
super(key, mapping, null);
|
||||
public VCFContigHeaderLine(final Map<String, String> mapping, int contigIndex) {
|
||||
super(VCFHeader.CONTIG_KEY, mapping, null);
|
||||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -53,10 +53,10 @@ public class VCFHeader {
|
|||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
|
||||
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
|
||||
|
||||
// the list of auxillary tags
|
||||
|
|
@ -101,6 +101,15 @@ public class VCFHeader {
|
|||
loadMetaDataMaps();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a shallow copy of the meta data in VCF header toCopy
|
||||
*
|
||||
* @param toCopy
|
||||
*/
|
||||
public VCFHeader(final VCFHeader toCopy) {
|
||||
this(toCopy.mMetaData);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
|
|
@ -153,12 +162,39 @@ public class VCFHeader {
|
|||
}
|
||||
|
||||
/**
|
||||
* @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present
|
||||
* @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFContigHeaderLine> getContigLines() {
|
||||
return Collections.unmodifiableList(contigMetaData);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFFilterHeaderLine> getFilterLines() {
|
||||
final List<VCFFilterHeaderLine> filters = new ArrayList<VCFFilterHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFFilterHeaderLine ) {
|
||||
filters.add((VCFFilterHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all of the VCF ID header lines (FILTER, INFO, FORMAT, contig) in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFIDHeaderLine> getIDHeaderLines() {
|
||||
final List<VCFIDHeaderLine> filters = new ArrayList<VCFIDHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
filters.add((VCFIDHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
|
||||
* or the version is not present
|
||||
|
|
@ -299,10 +335,16 @@ public class VCFHeader {
|
|||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the INFO HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
|
||||
return mInfoMetaData.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the FORMAT HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
|
||||
return mFormatMetaData.values();
|
||||
}
|
||||
|
|
@ -390,4 +432,13 @@ public class VCFHeader {
|
|||
public HashMap<String, Integer> getSampleNameToOffset() {
|
||||
return sampleNameToOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
b.append("[VCFHeader:");
|
||||
for ( final VCFHeaderLine line : mMetaData )
|
||||
b.append("\n\t").append(line);
|
||||
return b.append("\n]").toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -301,7 +301,7 @@ public class VCFUtils {
|
|||
map.put("ID", contig.getSequenceName());
|
||||
map.put("length", String.valueOf(contig.getSequenceLength()));
|
||||
if ( assembly != null ) map.put("assembly", assembly);
|
||||
return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex());
|
||||
return new VCFContigHeaderLine(map, contig.getSequenceIndex());
|
||||
}
|
||||
|
||||
private static String getReferenceAssembly(final String refPath) {
|
||||
|
|
|
|||
|
|
@ -1351,7 +1351,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
|
||||
final VCFHeader header,
|
||||
final boolean lenientDecoding) {
|
||||
final Map<String, Object> newAttributes = new HashMap<String, Object>(attributes.size());
|
||||
final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
|
||||
|
||||
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
|
||||
final String field = attr.getKey();
|
||||
|
|
|
|||
|
|
@ -504,7 +504,7 @@ public class VariantContextUtils {
|
|||
Byte referenceBaseForIndel = null;
|
||||
|
||||
final Set<Allele> alleles = new LinkedHashSet<Allele>();
|
||||
final Set<String> filters = new TreeSet<String>();
|
||||
final Set<String> filters = new HashSet<String>();
|
||||
final Map<String, Object> attributes = new TreeMap<String, Object>();
|
||||
final Set<String> inconsistentAttributes = new HashSet<String>();
|
||||
final Set<String> variantSources = new HashSet<String>(); // contains the set of sources we found in our set of VCs that are variant
|
||||
|
|
@ -656,7 +656,8 @@ public class VariantContextUtils {
|
|||
builder.alleles(alleles);
|
||||
builder.genotypes(genotypes);
|
||||
builder.log10PError(log10PError);
|
||||
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||
builder.filters(filters.isEmpty() ? filters : new TreeSet<String>(filters));
|
||||
builder.attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||
|
||||
// Trim the padded bases of all alleles if necessary
|
||||
final VariantContext merged = builder.make();
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ public final class BCF2Encoder {
|
|||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
|
||||
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
|
||||
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>) v) : (String)v.get(0);
|
||||
final String s = BCF2Utils.collapseStringList((List<String>) v);
|
||||
v = stringToBytes(s);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -335,7 +335,6 @@ public abstract class BCF2FieldEncoder {
|
|||
else if (value instanceof List) {
|
||||
final List<String> l = (List<String>)value;
|
||||
if ( l.isEmpty() ) return "";
|
||||
else if ( l.size() == 1 ) return (String)l.get(0);
|
||||
else return BCF2Utils.collapseStringList(l);
|
||||
} else
|
||||
return (String)value;
|
||||
|
|
|
|||
|
|
@ -87,14 +87,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
public static final int MAJOR_VERSION = 2;
|
||||
public static final int MINOR_VERSION = 1;
|
||||
|
||||
/**
|
||||
* If true, we will write out the undecoded raw bytes for a genotypes block, if it
|
||||
* is found in the input VC. This can be very dangerous as the genotype encoding
|
||||
* depends on the exact ordering of the header.
|
||||
*
|
||||
* TODO -- enable when the new smart VCF header code is created by Eric Banks
|
||||
*/
|
||||
private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false;
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
|
||||
final private static boolean ALLOW_MISSING_CONTIG_LINES = false;
|
||||
|
||||
|
|
@ -108,6 +100,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
||||
|
||||
/**
|
||||
* cached results for whether we can write out raw genotypes data.
|
||||
*/
|
||||
private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
|
||||
private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
|
||||
|
||||
|
||||
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
|
||||
this.outputStream = getOutputStream();
|
||||
|
|
@ -247,13 +246,39 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
return encoder.getRecordBytes();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Can we safely write on the raw (undecoded) genotypes of an input VC?
|
||||
*
|
||||
* The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in
|
||||
* which case we return the previous result. If it's not cached, we use the BCF2Util to
|
||||
* compare the VC header with our header (expensive) and cache it.
|
||||
*
|
||||
* @param lazyData
|
||||
* @return
|
||||
*/
|
||||
private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
|
||||
if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
|
||||
// result is not yet cached for this header -- recompute the consistency check and cache it
|
||||
canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
|
||||
lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
|
||||
}
|
||||
|
||||
return canPassOnUnparsedGenotypeDataForLastVCFHeader;
|
||||
}
|
||||
|
||||
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
|
||||
if ( vc.getGenotypes().isLazyWithData() ) {
|
||||
LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
|
||||
if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData )
|
||||
final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
|
||||
|
||||
if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
|
||||
canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
|
||||
//logger.info("Passing on raw BCF2 genotypes data");
|
||||
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
|
||||
else
|
||||
} else {
|
||||
//logger.info("Decoding raw BCF2 genotypes data");
|
||||
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Tests for BCF2Utils
|
||||
*/
|
||||
public final class BCF2UtilsUnitTest extends BaseTest {
|
||||
@DataProvider(name = "CollapseExpandTest")
|
||||
public Object[][] makeCollapseExpandTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{Arrays.asList("A"), "A", false});
|
||||
tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true});
|
||||
tests.add(new Object[]{Arrays.asList("AB"), "AB", false});
|
||||
tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true});
|
||||
tests.add(new Object[]{Arrays.asList(), "", false});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CollapseExpandTest")
|
||||
public void testCollapseExpandTest(final List<String> in, final String expectedCollapsed, final boolean isCollapsed) {
|
||||
final String actualCollapsed = BCF2Utils.collapseStringList(in);
|
||||
Assert.assertEquals(actualCollapsed, expectedCollapsed);
|
||||
Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed);
|
||||
if ( isCollapsed )
|
||||
Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in);
|
||||
}
|
||||
|
||||
@DataProvider(name = "HeaderOrderTestProvider")
|
||||
public Object[][] makeHeaderOrderTestProvider() {
|
||||
final List<VCFHeaderLine> inputLines = new ArrayList<VCFHeaderLine>();
|
||||
final List<VCFHeaderLine> extraLines = new ArrayList<VCFHeaderLine>();
|
||||
|
||||
int counter = 0;
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
final int inputLineCounter = counter;
|
||||
final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(inputLines));
|
||||
|
||||
extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFHeaderLine("x", "misc"));
|
||||
extraLines.add(new VCFHeaderLine("y", "misc"));
|
||||
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) {
|
||||
final List<VCFHeaderLine> empty = Collections.emptyList();
|
||||
final List<List<VCFHeaderLine>> permutations = extrasToTake == 0
|
||||
? Collections.singletonList(empty)
|
||||
: Utils.makePermutations(extraLines, extrasToTake, false);
|
||||
for ( final List<VCFHeaderLine> permutation : permutations ) {
|
||||
for ( int i = -1; i < inputLines.size(); i++ ) {
|
||||
final List<VCFHeaderLine> allLines = new ArrayList<VCFHeaderLine>(inputLines);
|
||||
if ( i >= 0 )
|
||||
allLines.remove(i);
|
||||
allLines.addAll(permutation);
|
||||
final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(allLines));
|
||||
final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter);
|
||||
tests.add(new Object[]{inputHeader, testHeader, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sample name tests
|
||||
final List<List<String>> sampleNameTests = Arrays.asList(
|
||||
new ArrayList<String>(),
|
||||
Arrays.asList("A"),
|
||||
Arrays.asList("A", "B"),
|
||||
Arrays.asList("A", "B", "C"));
|
||||
for ( final List<String> inSamples : sampleNameTests ) {
|
||||
for ( final List<String> testSamples : sampleNameTests ) {
|
||||
final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples);
|
||||
|
||||
final List<List<String>> permutations = testSamples.isEmpty()
|
||||
? Collections.singletonList(testSamples)
|
||||
: Utils.makePermutations(testSamples, testSamples.size(), false);
|
||||
for ( final List<String> testSamplesPermutation : permutations ) {
|
||||
final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation);
|
||||
final boolean expectedConsistent = testSamples.equals(inSamples);
|
||||
tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) {
|
||||
final List<Integer> ids = new ArrayList<Integer>();
|
||||
for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID()));
|
||||
}
|
||||
}
|
||||
|
||||
// as long as the start contains all of the ids up to minCounterForInputLines in order
|
||||
for ( int i = 0; i < minCounterForInputLines; i++ )
|
||||
if ( i >= ids.size() || ids.get(i) != i )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2
|
||||
// even when the header file is slightly different
|
||||
//
|
||||
@Test(dataProvider = "HeaderOrderTestProvider")
|
||||
public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) {
|
||||
final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader);
|
||||
Assert.assertEquals(actualOrderConsistency, expectedConsistent);
|
||||
}
|
||||
}
|
||||
|
|
@ -197,7 +197,7 @@ public class VariantContextTestProvider {
|
|||
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
|
||||
|
||||
// prep the header
|
||||
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
|
||||
metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));
|
||||
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER1"));
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER2"));
|
||||
|
|
@ -889,7 +889,7 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
|
||||
private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) {
|
||||
return Utils.makeCombinations(alleles, highestPloidy);
|
||||
return Utils.makePermutations(alleles, highestPloidy, true);
|
||||
}
|
||||
|
||||
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue