diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
index d6504e841..8dabd49b8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java
@@ -67,6 +67,12 @@ import java.util.*;
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
* in the detailed example on the wiki.
*
+ * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
+ * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
+ * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
+ * efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
+ * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
+ *
*
Input
*
* One or more variant sets to combine.
diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java
index 14d7177a0..476098ae6 100755
--- a/public/java/src/org/broadinstitute/sting/utils/Utils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java
@@ -32,7 +32,6 @@ import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
-import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.net.InetAddress;
@@ -742,19 +741,23 @@ public class Utils {
* @param objects
* @param n
* @param
+ * @param withReplacement if false, the resulting permutations will only contain unique objects from objects
* @return
*/
- public static List> makeCombinations(final List objects, final int n) {
+ public static List> makePermutations(final List objects, final int n, final boolean withReplacement) {
final List> combinations = new ArrayList>();
- if ( n == 1 ) {
+ if ( n <= 0 )
+ ;
+ else if ( n == 1 ) {
for ( final T o : objects )
combinations.add(Collections.singletonList(o));
} else {
- final List> sub = makeCombinations(objects, n - 1);
+ final List> sub = makePermutations(objects, n - 1, withReplacement);
for ( List subI : sub ) {
for ( final T a : objects ) {
- combinations.add(Utils.cons(a, subI));
+ if ( withReplacement || ! subI.contains(a) )
+ combinations.add(Utils.cons(a, subI));
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
index 67e189d11..ac6348f80 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java
@@ -423,9 +423,8 @@ public final class BCF2Codec implements FeatureCodec {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
- LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
- new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
- header.getNGenotypeSamples());
+ final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
+ final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )
@@ -436,11 +435,13 @@ public final class BCF2Codec implements FeatureCodec {
}
public static class LazyData {
+ final public VCFHeader header;
final public int nGenotypeFields;
final public byte[] bytes;
@Requires({"nGenotypeFields > 0", "bytes != null"})
- public LazyData(final int nGenotypeFields, final byte[] bytes) {
+ public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
+ this.header = header;
this.nGenotypeFields = nGenotypeFields;
this.bytes = bytes;
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java
index 513b9fcb5..46b1fa6c1 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java
@@ -39,7 +39,7 @@ import java.util.*;
* @author Mark DePristo
* @since 5/12
*/
-class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
+public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);
// the essential information for us to use to decode the genotypes data
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java
index e6e78d89d..2ac916db1 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java
@@ -131,17 +131,21 @@ public final class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
- @Requires({"strings != null", "strings.size() > 1"})
+ @Requires({"strings != null"})
@Ensures("result != null")
public static String collapseStringList(final List strings) {
- final StringBuilder b = new StringBuilder();
- for ( final String s : strings ) {
- if ( s != null ) {
- assert s.indexOf(",") == -1; // no commas in individual strings
- b.append(",").append(s);
+ if ( strings.isEmpty() ) return "";
+ else if ( strings.size() == 1 ) return strings.get(0);
+ else {
+ final StringBuilder b = new StringBuilder();
+ for ( final String s : strings ) {
+ if ( s != null ) {
+ assert s.indexOf(",") == -1; // no commas in individual strings
+ b.append(",").append(s);
+ }
}
+ return b.toString();
}
- return b.toString();
}
/**
@@ -163,7 +167,7 @@ public final class BCF2Utils {
@Requires("s != null")
public static boolean isCollapsedString(final String s) {
- return s.charAt(0) == ',';
+ return s.length() > 0 && s.charAt(0) == ',';
}
/**
@@ -280,4 +284,49 @@ public final class BCF2Utils {
else if ( o instanceof List ) return (List)o;
else return Collections.singletonList(o);
}
+
+ /**
+ * Are the elements and their order in the output and input headers consistent so that
+ * we can write out the raw genotypes block without decoding and recoding it?
+ *
+ * If the order of INFO, FILTER, or contig elements in the output header is different than
+ * in the input header we must decode the blocks using the input header and then recode them
+ * based on the new output order.
+ *
+ * If they are consistent, we can simply pass through the raw genotypes block bytes, which is
+ * a *huge* performance win for large blocks.
+ *
+ * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
+ * don't modify the ordering of the header fields and so can safely pass through the genotypes
+ * undecoded. Some operations -- those that add filters or info fields -- can change the ordering
+ * of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded.
+ */
+ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
+ // first, we have to have the same samples in the same order
+ if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
+ return false;
+
+ final Iterator extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
+ final Iterator extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
+
+ while ( inputLinesIt.hasNext() ) {
+ if ( ! outputLinesIt.hasNext() ) // missing lines in output
+ return false;
+
+ final VCFIDHeaderLine outputLine = outputLinesIt.next();
+ final VCFIDHeaderLine inputLine = inputLinesIt.next();
+
+ if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
+ return false;
+ }
+
+ return true;
+ }
+
+ private static List nullAsEmpty(List l) {
+ if ( l == null )
+ return Collections.emptyList();
+ else
+ return l;
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java
index d5d76cab7..35cc75af2 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java
@@ -47,8 +47,8 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
this.contigIndex = contigIndex;
}
- public VCFContigHeaderLine(final String key, final Map mapping, int contigIndex) {
- super(key, mapping, null);
+ public VCFContigHeaderLine(final Map mapping, int contigIndex) {
+ super(VCFHeader.CONTIG_KEY, mapping, null);
this.contigIndex = contigIndex;
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
index 7a9329583..2663e848f 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
@@ -53,10 +53,10 @@ public class VCFHeader {
// the associated meta data
private final Set mMetaData = new LinkedHashSet();
- private final Map mInfoMetaData = new HashMap();
- private final Map mFormatMetaData = new HashMap();
- private final Map mFilterMetaData = new HashMap();
- private final Map mOtherMetaData = new HashMap();
+ private final Map mInfoMetaData = new LinkedHashMap();
+ private final Map mFormatMetaData = new LinkedHashMap();
+ private final Map mFilterMetaData = new LinkedHashMap();
+ private final Map mOtherMetaData = new LinkedHashMap();
private final List contigMetaData = new ArrayList();
// the list of auxillary tags
@@ -101,6 +101,15 @@ public class VCFHeader {
loadMetaDataMaps();
}
+ /**
+ * Creates a shallow copy of the meta data in VCF header toCopy
+ *
+ * @param toCopy
+ */
+ public VCFHeader(final VCFHeader toCopy) {
+ this(toCopy.mMetaData);
+ }
+
/**
* create a VCF header, given a list of meta data and auxillary tags
*
@@ -153,12 +162,39 @@ public class VCFHeader {
}
/**
- * @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present
+ * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
*/
public List getContigLines() {
return Collections.unmodifiableList(contigMetaData);
}
+
+ /**
+ * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
+ */
+ public List getFilterLines() {
+ final List filters = new ArrayList();
+ for ( VCFHeaderLine line : mMetaData ) {
+ if ( line instanceof VCFFilterHeaderLine ) {
+ filters.add((VCFFilterHeaderLine)line);
+ }
+ }
+ return filters;
+ }
+
+ /**
+ * @return all of the VCF header lines that are VCFIDHeaderLines, in their original file order, or an empty list if none were present
+ */
+ public List getIDHeaderLines() {
+ final List filters = new ArrayList();
+ for ( VCFHeaderLine line : mMetaData ) {
+ if ( line instanceof VCFIDHeaderLine ) {
+ filters.add((VCFIDHeaderLine)line);
+ }
+ }
+ return filters;
+ }
+
/**
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
* or the version is not present
@@ -299,10 +335,16 @@ public class VCFHeader {
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
+ /**
+ * Returns the INFO HeaderLines in their original ordering
+ */
public Collection getInfoHeaderLines() {
return mInfoMetaData.values();
}
+ /**
+ * Returns the FORMAT HeaderLines in their original ordering
+ */
public Collection getFormatHeaderLines() {
return mFormatMetaData.values();
}
@@ -390,4 +432,13 @@ public class VCFHeader {
public HashMap getSampleNameToOffset() {
return sampleNameToOffset;
}
+
+ @Override
+ public String toString() {
+ final StringBuilder b = new StringBuilder();
+ b.append("[VCFHeader:");
+ for ( final VCFHeaderLine line : mMetaData )
+ b.append("\n\t").append(line);
+ return b.append("\n]").toString();
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
index 561e8e78d..be87e7306 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java
@@ -301,7 +301,7 @@ public class VCFUtils {
map.put("ID", contig.getSequenceName());
map.put("length", String.valueOf(contig.getSequenceLength()));
if ( assembly != null ) map.put("assembly", assembly);
- return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex());
+ return new VCFContigHeaderLine(map, contig.getSequenceIndex());
}
private static String getReferenceAssembly(final String refPath) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
index 2211cfe5e..1fe6b8652 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@@ -1351,7 +1351,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
private final Map fullyDecodeAttributes(final Map attributes,
final VCFHeader header,
final boolean lenientDecoding) {
- final Map newAttributes = new HashMap(attributes.size());
+ final Map newAttributes = new HashMap(10);
for ( final Map.Entry attr : attributes.entrySet() ) {
final String field = attr.getKey();
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
index a8f956413..e571bb4c1 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java
@@ -504,7 +504,7 @@ public class VariantContextUtils {
Byte referenceBaseForIndel = null;
final Set alleles = new LinkedHashSet();
- final Set filters = new TreeSet();
+ final Set filters = new HashSet();
final Map attributes = new TreeMap();
final Set inconsistentAttributes = new HashSet();
final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant
@@ -656,7 +656,8 @@ public class VariantContextUtils {
builder.alleles(alleles);
builder.genotypes(genotypes);
builder.log10PError(log10PError);
- builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
+ builder.filters(filters.isEmpty() ? filters : new TreeSet(filters));
+ builder.attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
// Trim the padded bases of all alleles if necessary
final VariantContext merged = builder.make();
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java
index 01dac7eb6..22acc4787 100644
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java
@@ -124,7 +124,7 @@ public final class BCF2Encoder {
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTyped(List extends Object> v, final BCF2Type type) throws IOException {
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
- final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List) v) : (String)v.get(0);
+ final String s = BCF2Utils.collapseStringList((List) v);
v = stringToBytes(s);
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java
index ddeb4d284..a91eb216d 100644
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java
@@ -335,7 +335,6 @@ public abstract class BCF2FieldEncoder {
else if (value instanceof List) {
final List l = (List)value;
if ( l.isEmpty() ) return "";
- else if ( l.size() == 1 ) return (String)l.get(0);
else return BCF2Utils.collapseStringList(l);
} else
return (String)value;
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
index a080c4e62..e4c64b26b 100644
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java
@@ -87,14 +87,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
public static final int MAJOR_VERSION = 2;
public static final int MINOR_VERSION = 1;
- /**
- * If true, we will write out the undecoded raw bytes for a genotypes block, if it
- * is found in the input VC. This can be very dangerous as the genotype encoding
- * depends on the exact ordering of the header.
- *
- * TODO -- enable when the new smart VCF header code is created by Eric Banks
- */
- private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false;
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
final private static boolean ALLOW_MISSING_CONTIG_LINES = false;
@@ -108,6 +100,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
+ /**
+ * cached results for whether we can write out raw genotypes data.
+ */
+ private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
+ private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
+
+
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
this.outputStream = getOutputStream();
@@ -247,13 +246,39 @@ class BCF2Writer extends IndexingVariantContextWriter {
return encoder.getRecordBytes();
}
+
+ /**
+ * Can we safely write on the raw (undecoded) genotypes of an input VC?
+ *
+ * The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in
+ * which case we return the previous result. If it's not cached, we use BCF2Utils to
+ * compare the VC header with our header (expensive) and cache it.
+ *
+ * @param lazyData
+ * @return
+ */
+ private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
+ if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
+ // result is not cached for this header: compute it and remember the header
+ canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
+ lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
+ }
+
+ return canPassOnUnparsedGenotypeDataForLastVCFHeader;
+ }
+
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
if ( vc.getGenotypes().isLazyWithData() ) {
- LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
- if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData )
+ final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
+
+ if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
+ canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
+ //logger.info("Passing on raw BCF2 genotypes data");
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
- else
+ } else {
+ //logger.info("Decoding raw BCF2 genotypes data");
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
+ }
}
return null;
diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java
new file mode 100644
index 000000000..ae76a374a
--- /dev/null
+++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.codecs.bcf2;
+
+import org.broad.tribble.readers.PositionalBufferedStream;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
+
+import java.io.*;
+import java.util.*;
+import org.testng.Assert;
+import org.testng.annotations.BeforeSuite;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+/**
+ * Tests for BCF2Utils
+ */
+public final class BCF2UtilsUnitTest extends BaseTest {
+ @DataProvider(name = "CollapseExpandTest")
+ public Object[][] makeCollapseExpandTest() {
+ List tests = new ArrayList();
+ tests.add(new Object[]{Arrays.asList("A"), "A", false});
+ tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true});
+ tests.add(new Object[]{Arrays.asList("AB"), "AB", false});
+ tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true});
+ tests.add(new Object[]{Arrays.asList(), "", false});
+ return tests.toArray(new Object[][]{});
+ }
+
+ @Test(dataProvider = "CollapseExpandTest")
+ public void testCollapseExpandTest(final List in, final String expectedCollapsed, final boolean isCollapsed) {
+ final String actualCollapsed = BCF2Utils.collapseStringList(in);
+ Assert.assertEquals(actualCollapsed, expectedCollapsed);
+ Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed);
+ if ( isCollapsed )
+ Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in);
+ }
+
+ @DataProvider(name = "HeaderOrderTestProvider")
+ public Object[][] makeHeaderOrderTestProvider() {
+ final List inputLines = new ArrayList();
+ final List extraLines = new ArrayList();
+
+ int counter = 0;
+ inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
+ inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
+ inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
+ inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
+ inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ final int inputLineCounter = counter;
+ final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines));
+
+ extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
+ extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
+ extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
+ extraLines.add(new VCFHeaderLine("x", "misc"));
+ extraLines.add(new VCFHeaderLine("y", "misc"));
+
+ List tests = new ArrayList();
+ for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) {
+ final List empty = Collections.emptyList();
+ final List> permutations = extrasToTake == 0
+ ? Collections.singletonList(empty)
+ : Utils.makePermutations(extraLines, extrasToTake, false);
+ for ( final List permutation : permutations ) {
+ for ( int i = -1; i < inputLines.size(); i++ ) {
+ final List allLines = new ArrayList(inputLines);
+ if ( i >= 0 )
+ allLines.remove(i);
+ allLines.addAll(permutation);
+ final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines));
+ final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter);
+ tests.add(new Object[]{inputHeader, testHeader, expectedConsistent});
+ }
+ }
+ }
+
+ // sample name tests
+ final List> sampleNameTests = Arrays.asList(
+ new ArrayList(),
+ Arrays.asList("A"),
+ Arrays.asList("A", "B"),
+ Arrays.asList("A", "B", "C"));
+ for ( final List inSamples : sampleNameTests ) {
+ for ( final List testSamples : sampleNameTests ) {
+ final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples);
+
+ final List> permutations = testSamples.isEmpty()
+ ? Collections.singletonList(testSamples)
+ : Utils.makePermutations(testSamples, testSamples.size(), false);
+ for ( final List testSamplesPermutation : permutations ) {
+ final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation);
+ final boolean expectedConsistent = testSamples.equals(inSamples);
+ tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent});
+ }
+ }
+ }
+
+ return tests.toArray(new Object[][]{});
+ }
+
+ private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) {
+ final List ids = new ArrayList();
+ for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) {
+ if ( line instanceof VCFIDHeaderLine ) {
+ ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID()));
+ }
+ }
+
+ // as long as the start contains all of the ids up to minCounterForInputLines in order
+ for ( int i = 0; i < minCounterForInputLines; i++ )
+ if ( i >= ids.size() || ids.get(i) != i )
+ return false;
+
+ return true;
+ }
+
+ //
+ // Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2
+ // even when the header file is slightly different
+ //
+ @Test(dataProvider = "HeaderOrderTestProvider")
+ public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) {
+ final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader);
+ Assert.assertEquals(actualOrderConsistency, expectedConsistent);
+ }
+}
diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java
index dd1985be3..26e2dbfbc 100644
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java
@@ -197,7 +197,7 @@ public class VariantContextTestProvider {
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
// prep the header
- metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
+ metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
@@ -889,7 +889,7 @@ public class VariantContextTestProvider {
}
private static List> makeAllGenotypes(final List alleles, final int highestPloidy) {
- return Utils.makeCombinations(alleles, highestPloidy);
+ return Utils.makePermutations(alleles, highestPloidy, true);
}
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {