diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index d6504e841..8dabd49b8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -67,6 +67,12 @@ import java.util.*; * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out * in the detailed example on the wiki. * + * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful + * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time + * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together + * efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing + * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. + * *

Input

*

* One or more variant sets to combine. diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 14d7177a0..476098ae6 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -32,7 +32,6 @@ import net.sf.samtools.util.StringUtil; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.net.InetAddress; @@ -742,19 +741,23 @@ public class Utils { * @param objects * @param n * @param + * @param withReplacement if false, the resulting permutations will only contain unique objects from objects * @return */ - public static List> makeCombinations(final List objects, final int n) { + public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { final List> combinations = new ArrayList>(); - if ( n == 1 ) { + if ( n <= 0 ) + ; + else if ( n == 1 ) { for ( final T o : objects ) combinations.add(Collections.singletonList(o)); } else { - final List> sub = makeCombinations(objects, n - 1); + final List> sub = makePermutations(objects, n - 1, withReplacement); for ( List subI : sub ) { for ( final T a : objects ) { - combinations.add(Utils.cons(a, subI)); + if ( withReplacement || ! 
subI.contains(a) ) + combinations.add(Utils.cons(a, subI)); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 67e189d11..ac6348f80 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -423,9 +423,8 @@ public final class BCF2Codec implements FeatureCodec { final LazyGenotypesContext.LazyParser lazyParser = new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders); - LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, - new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()), - header.getNGenotypeSamples()); + final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes()); + final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples()); // did we resort the sample names? 
If so, we need to load the genotype data if ( !header.samplesWereAlreadySorted() ) @@ -436,11 +435,13 @@ public final class BCF2Codec implements FeatureCodec { } public static class LazyData { + final public VCFHeader header; final public int nGenotypeFields; final public byte[] bytes; @Requires({"nGenotypeFields > 0", "bytes != null"}) - public LazyData(final int nGenotypeFields, final byte[] bytes) { + public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) { + this.header = header; this.nGenotypeFields = nGenotypeFields; this.bytes = bytes; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 513b9fcb5..46b1fa6c1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -39,7 +39,7 @@ import java.util.*; * @author Mark DePristo * @since 5/12 */ -class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { +public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class); // the essential information for us to use to decode the genotypes data diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index e6e78d89d..2ac916db1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -131,17 +131,21 @@ public final class BCF2Utils { * @param strings size > 1 list of strings * @return */ - @Requires({"strings != null", "strings.size() > 1"}) + @Requires({"strings != null"}) @Ensures("result != null") public static 
String collapseStringList(final List strings) { - final StringBuilder b = new StringBuilder(); - for ( final String s : strings ) { - if ( s != null ) { - assert s.indexOf(",") == -1; // no commas in individual strings - b.append(",").append(s); + if ( strings.isEmpty() ) return ""; + else if ( strings.size() == 1 ) return strings.get(0); + else { + final StringBuilder b = new StringBuilder(); + for ( final String s : strings ) { + if ( s != null ) { + assert s.indexOf(",") == -1; // no commas in individual strings + b.append(",").append(s); + } } + return b.toString(); } - return b.toString(); } /** @@ -163,7 +167,7 @@ public final class BCF2Utils { @Requires("s != null") public static boolean isCollapsedString(final String s) { - return s.charAt(0) == ','; + return s.length() > 0 && s.charAt(0) == ','; } /** @@ -280,4 +284,49 @@ public final class BCF2Utils { else if ( o instanceof List ) return (List)o; else return Collections.singletonList(o); } + + /** + * Are the elements and their order in the output and input headers consistent so that + * we can write out the raw genotypes block without decoding and recoding it? + * + * If the order of INFO, FILTER, or contig elements in the output header is different than + * in the input header we must decode the blocks using the input header and then recode them + * based on the new output order. + * + * If they are consistent, we can simply pass through the raw genotypes block bytes, which is + * a *huge* performance win for large blocks. + * + * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc) + * don't modify the ordering of the header fields and so can safely pass through the genotypes + * undecoded. 
Some operations -- those that add filters or info fields -- can change the ordering + * of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded + */ + public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) { + // first, we have to have the same samples in the same order + if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) + return false; + + final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); + final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + + while ( inputLinesIt.hasNext() ) { + if ( ! outputLinesIt.hasNext() ) // missing lines in output + return false; + + final VCFIDHeaderLine outputLine = outputLinesIt.next(); + final VCFIDHeaderLine inputLine = inputLinesIt.next(); + + if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) ) + return false; + } + + return true; + } + + private static List nullAsEmpty(List l) { + if ( l == null ) + return Collections.emptyList(); + else + return l; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java index d5d76cab7..35cc75af2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFContigHeaderLine.java @@ -47,8 +47,8 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine { this.contigIndex = contigIndex; } - public VCFContigHeaderLine(final String key, final Map mapping, int contigIndex) { - super(key, mapping, null); + public VCFContigHeaderLine(final Map mapping, int contigIndex) { + super(VCFHeader.CONTIG_KEY, mapping, null); this.contigIndex = contigIndex; } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 7a9329583..2663e848f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -53,10 +53,10 @@ public class VCFHeader { // the associated meta data private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new HashMap(); - private final Map mFormatMetaData = new HashMap(); - private final Map mFilterMetaData = new HashMap(); - private final Map mOtherMetaData = new HashMap(); + private final Map mInfoMetaData = new LinkedHashMap(); + private final Map mFormatMetaData = new LinkedHashMap(); + private final Map mFilterMetaData = new LinkedHashMap(); + private final Map mOtherMetaData = new LinkedHashMap(); private final List contigMetaData = new ArrayList(); // the list of auxillary tags @@ -101,6 +101,15 @@ public class VCFHeader { loadMetaDataMaps(); } + /** + * Creates a shallow copy of the meta data in VCF header toCopy + * + * @param toCopy + */ + public VCFHeader(final VCFHeader toCopy) { + this(toCopy.mMetaData); + } + /** * create a VCF header, given a list of meta data and auxillary tags * @@ -153,12 +162,39 @@ public class VCFHeader { } /** - * @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present + * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present */ public List getContigLines() { return Collections.unmodifiableList(contigMetaData); } + + /** + * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present + */ + public List getFilterLines() { + final List filters = new ArrayList(); + for ( VCFHeaderLine line : mMetaData ) { + if ( line instanceof VCFFilterHeaderLine ) { + filters.add((VCFFilterHeaderLine)line); + } + 
} + return filters; + } + + /** + * @return all of the VCF header lines that carry an ID (VCFIDHeaderLine instances) in their original file order, or an empty list if none were present + */ + public List getIDHeaderLines() { + final List filters = new ArrayList(); + for ( VCFHeaderLine line : mMetaData ) { + if ( line instanceof VCFIDHeaderLine ) { + filters.add((VCFIDHeaderLine)line); + } + } + return filters; + } + /** * check our metadata for a VCF version tag, and throw an exception if the version is out of date * or the version is not present @@ -299,10 +335,16 @@ public class VCFHeader { return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0); } + /** + * Returns the INFO HeaderLines in their original ordering + */ public Collection getInfoHeaderLines() { return mInfoMetaData.values(); } + /** + * Returns the FORMAT HeaderLines in their original ordering + */ public Collection getFormatHeaderLines() { return mFormatMetaData.values(); } @@ -390,4 +432,13 @@ public class VCFHeader { public HashMap getSampleNameToOffset() { return sampleNameToOffset; } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder(); + b.append("[VCFHeader:"); + for ( final VCFHeaderLine line : mMetaData ) + b.append("\n\t").append(line); + return b.append("\n]").toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index 561e8e78d..be87e7306 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -301,7 +301,7 @@ public class VCFUtils { map.put("ID", contig.getSequenceName()); map.put("length", String.valueOf(contig.getSequenceLength())); if ( assembly != null ) map.put("assembly", assembly); - return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex()); + return new VCFContigHeaderLine(map, 
contig.getSequenceIndex()); } private static String getReferenceAssembly(final String refPath) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 2211cfe5e..1fe6b8652 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1351,7 +1351,7 @@ public class VariantContext implements Feature { // to enable tribble integratio private final Map fullyDecodeAttributes(final Map attributes, final VCFHeader header, final boolean lenientDecoding) { - final Map newAttributes = new HashMap(attributes.size()); + final Map newAttributes = new HashMap(10); for ( final Map.Entry attr : attributes.entrySet() ) { final String field = attr.getKey(); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index a8f956413..e571bb4c1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -504,7 +504,7 @@ public class VariantContextUtils { Byte referenceBaseForIndel = null; final Set alleles = new LinkedHashSet(); - final Set filters = new TreeSet(); + final Set filters = new HashSet(); final Map attributes = new TreeMap(); final Set inconsistentAttributes = new HashSet(); final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant @@ -656,7 +656,8 @@ public class VariantContextUtils { builder.alleles(alleles); builder.genotypes(genotypes); builder.log10PError(log10PError); - builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); + builder.filters(filters.isEmpty() ? 
filters : new TreeSet(filters)); + builder.attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes); // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java index 01dac7eb6..22acc4787 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Encoder.java @@ -124,7 +124,7 @@ public final class BCF2Encoder { @Ensures("encodeStream.size() > old(encodeStream.size())") public final void encodeTyped(List v, final BCF2Type type) throws IOException { if ( type == BCF2Type.CHAR && v.size() != 0 ) { - final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List) v) : (String)v.get(0); + final String s = BCF2Utils.collapseStringList((List) v); v = stringToBytes(s); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java index ddeb4d284..a91eb216d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldEncoder.java @@ -335,7 +335,6 @@ public abstract class BCF2FieldEncoder { else if (value instanceof List) { final List l = (List)value; if ( l.isEmpty() ) return ""; - else if ( l.size() == 1 ) return (String)l.get(0); else return BCF2Utils.collapseStringList(l); } else return (String)value; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index a080c4e62..e4c64b26b 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -87,14 +87,6 @@ class BCF2Writer extends IndexingVariantContextWriter { public static final int MAJOR_VERSION = 2; public static final int MINOR_VERSION = 1; - /** - * If true, we will write out the undecoded raw bytes for a genotypes block, if it - * is found in the input VC. This can be very dangerous as the genotype encoding - * depends on the exact ordering of the header. - * - * TODO -- enable when the new smart VCF header code is created by Eric Banks - */ - private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false; final protected static Logger logger = Logger.getLogger(BCF2Writer.class); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; @@ -108,6 +100,13 @@ class BCF2Writer extends IndexingVariantContextWriter { private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager(); + /** + * cached results for whether we can write out raw genotypes data. + */ + private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null; + private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false; + + public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) { super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing); this.outputStream = getOutputStream(); @@ -247,13 +246,39 @@ class BCF2Writer extends IndexingVariantContextWriter { return encoder.getRecordBytes(); } + + /** + * Can we safely write on the raw (undecoded) genotypes of an input VC? + * + * The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in + * which case we return the previous result. 
If it's not cached, we use BCF2Utils to + * compare the VC header with our header (expensive) and cache it. + * + * @param lazyData + * @return + */ + private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) { + if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) { + // no cached result for this header -- compute it and cache it + canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header); + lastVCFHeaderOfUnparsedGenotypes = lazyData.header; + } + + return canPassOnUnparsedGenotypeDataForLastVCFHeader; + } + private BCF2Codec.LazyData getLazyData(final VariantContext vc) { if ( vc.getGenotypes().isLazyWithData() ) { - LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); - if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData ) + final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes(); + + if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData && + canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) { + //logger.info("Passing on raw BCF2 genotypes data"); return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData(); - else + } else { + //logger.info("Decoding raw BCF2 genotypes data"); lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long + } } return null; diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java new file mode 100644 index 000000000..ae76a374a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2UtilsUnitTest.java @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, 
including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.codecs.bcf2; + +import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.*; + +import java.io.*; +import java.util.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Tests for BCF2Utils + */ +public final class BCF2UtilsUnitTest extends BaseTest { + @DataProvider(name = "CollapseExpandTest") + public Object[][] makeCollapseExpandTest() { + List tests = new ArrayList(); + tests.add(new Object[]{Arrays.asList("A"), "A", false}); + tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true}); + tests.add(new Object[]{Arrays.asList("AB"), "AB", false}); + tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true}); + tests.add(new Object[]{Arrays.asList(), "", false}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CollapseExpandTest") + public void 
testCollapseExpandTest(final List in, final String expectedCollapsed, final boolean isCollapsed) { + final String actualCollapsed = BCF2Utils.collapseStringList(in); + Assert.assertEquals(actualCollapsed, expectedCollapsed); + Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed); + if ( isCollapsed ) + Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in); + } + + @DataProvider(name = "HeaderOrderTestProvider") + public Object[][] makeHeaderOrderTestProvider() { + final List inputLines = new ArrayList(); + final List extraLines = new ArrayList(); + + int counter = 0; + inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); + inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final int inputLineCounter = counter; + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + + extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); + extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); + extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new 
VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + extraLines.add(new VCFHeaderLine("x", "misc")); + extraLines.add(new VCFHeaderLine("y", "misc")); + + List tests = new ArrayList(); + for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) { + final List empty = Collections.emptyList(); + final List> permutations = extrasToTake == 0 + ? Collections.singletonList(empty) + : Utils.makePermutations(extraLines, extrasToTake, false); + for ( final List permutation : permutations ) { + for ( int i = -1; i < inputLines.size(); i++ ) { + final List allLines = new ArrayList(inputLines); + if ( i >= 0 ) + allLines.remove(i); + allLines.addAll(permutation); + final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); + final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter); + tests.add(new Object[]{inputHeader, testHeader, expectedConsistent}); + } + } + } + + // sample name tests + final List> sampleNameTests = Arrays.asList( + new ArrayList(), + Arrays.asList("A"), + Arrays.asList("A", "B"), + Arrays.asList("A", "B", "C")); + for ( final List inSamples : sampleNameTests ) { + for ( final List testSamples : sampleNameTests ) { + final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples); + + final List> permutations = testSamples.isEmpty() + ? 
Collections.singletonList(testSamples) + : Utils.makePermutations(testSamples, testSamples.size(), false); + for ( final List testSamplesPermutation : permutations ) { + final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation); + final boolean expectedConsistent = testSamples.equals(inSamples); + tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) { + final List ids = new ArrayList(); + for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { + if ( line instanceof VCFIDHeaderLine ) { + ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); + } + } + + // as long as the start contains all of the ids up to minCounterForInputLines in order + for ( int i = 0; i < minCounterForInputLines; i++ ) + if ( i >= ids.size() || ids.get(i) != i ) + return false; + + return true; + } + + // + // Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2 + // even when the header file is slightly different + // + @Test(dataProvider = "HeaderOrderTestProvider") + public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) { + final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader); + Assert.assertEquals(actualOrderConsistency, expectedConsistent); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index dd1985be3..26e2dbfbc 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -197,7 +197,7 @@ public class VariantContextTestProvider { addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String); // prep the header - metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0)); + metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); metaData.add(new VCFFilterHeaderLine("FILTER1")); metaData.add(new VCFFilterHeaderLine("FILTER2")); @@ -889,7 +889,7 @@ public class VariantContextTestProvider { } private static List> makeAllGenotypes(final List alleles, final int highestPloidy) { - return Utils.makeCombinations(alleles, highestPloidy); + return Utils.makePermutations(alleles, highestPloidy, true); } public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {