BCF2 optimizations; parallel CombineVariants

-- BCF2 now determines whether it can safely write out raw genotype blocks, which is true in the case where the VCF header of the input is a complete, ordered subset of the output header.  Added utilities to determine this and extensive unit tests (headerLinesAreOrderedConsistently)
-- Cleanup collapseStringList and exploreStringList for new unit tests of BCF2Utils.  Fixed bug in edge case that never occurred in practice
-- VCFContigHeaderLine now provides its own key (VCFHeader.CONTIG_KEY) directly instead of requiring the user to provide it (and hoping it's right)
-- More ways to access the data in VCFHeader
-- BCF2Writer uses a cache to avoid recomputing unnecessarily whether raw genotype blocks can be emitted directly into the output
-- Optimization of fullyDecodeAttributes -- attributes.size() is expensive and unnecessary.  We just guess that on average we need ~10 elements for the attribute map
-- CombineVariants optimization -- filters are now collected in a HashSet but are sorted at the end by creating a TreeSet
-- makeCombinations is now makePermutations, and you can request to create the permutations with or without replacement
This commit is contained in:
Mark DePristo 2012-08-15 14:36:06 -04:00
parent dafa7e3885
commit 669c43031a
15 changed files with 334 additions and 44 deletions

View File

@ -67,6 +67,12 @@ import java.util.*;
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
* in the detailed example on the wiki.
*
* Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
* when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
* efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
*
* <h2>Input</h2>
* <p>
* One or more variant sets to combine.

View File

@ -32,7 +32,6 @@ import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.net.InetAddress;
@ -742,19 +741,23 @@ public class Utils {
* @param objects
* @param n
* @param <T>
* @param withReplacement if false, the resulting permutations will only contain unique objects from objects
* @return
*/
public static <T> List<List<T>> makeCombinations(final List<T> objects, final int n) {
public static <T> List<List<T>> makePermutations(final List<T> objects, final int n, final boolean withReplacement) {
final List<List<T>> combinations = new ArrayList<List<T>>();
if ( n == 1 ) {
if ( n <= 0 )
;
else if ( n == 1 ) {
for ( final T o : objects )
combinations.add(Collections.singletonList(o));
} else {
final List<List<T>> sub = makeCombinations(objects, n - 1);
final List<List<T>> sub = makePermutations(objects, n - 1, withReplacement);
for ( List<T> subI : sub ) {
for ( final T a : objects ) {
combinations.add(Utils.cons(a, subI));
if ( withReplacement || ! subI.contains(a) )
combinations.add(Utils.cons(a, subI));
}
}
}

View File

@ -423,9 +423,8 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
header.getNGenotypeSamples());
final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )
@ -436,11 +435,13 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
}
public static class LazyData {
final public VCFHeader header;
final public int nGenotypeFields;
final public byte[] bytes;
@Requires({"nGenotypeFields > 0", "bytes != null"})
public LazyData(final int nGenotypeFields, final byte[] bytes) {
public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
this.header = header;
this.nGenotypeFields = nGenotypeFields;
this.bytes = bytes;
}

View File

@ -39,7 +39,7 @@ import java.util.*;
* @author Mark DePristo
* @since 5/12
*/
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);
// the essential information for us to use to decode the genotypes data

View File

@ -131,17 +131,21 @@ public final class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null", "strings.size() > 1"})
@Requires({"strings != null"})
@Ensures("result != null")
public static String collapseStringList(final List<String> strings) {
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
if ( s != null ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
if ( strings.isEmpty() ) return "";
else if ( strings.size() == 1 ) return strings.get(0);
else {
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
if ( s != null ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
}
}
return b.toString();
}
return b.toString();
}
/**
@ -163,7 +167,7 @@ public final class BCF2Utils {
@Requires("s != null")
public static boolean isCollapsedString(final String s) {
return s.charAt(0) == ',';
return s.length() > 0 && s.charAt(0) == ',';
}
/**
@ -280,4 +284,49 @@ public final class BCF2Utils {
else if ( o instanceof List ) return (List<Object>)o;
else return Collections.singletonList(o);
}
/**
 * Are the elements and their order in the output and input headers consistent so that
 * we can write out the raw genotypes block without decoding and recoding it?
 *
 * If the order of INFO, FILTER, or contig elements in the output header is different than
 * in the input header we must decode the blocks using the input header and then recode them
 * based on the new output order.
 *
 * If they are consistent, we can simply pass through the raw genotypes block bytes, which is
 * a *huge* performance win for large blocks.
 *
 * Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
 * don't modify the ordering of the header fields and so can safely pass through the genotypes
 * undecoded.  Some operations -- those that add filters or info fields -- can change the ordering
 * of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded
 *
 * @param outputHeader the header of the file being written
 * @param genotypesBlockHeader the header the raw genotypes block was encoded against
 * @return true when the input header's samples match exactly and its ID lines form an ordered prefix of the output header's ID lines
 */
public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
    // the sample names (and their order) must match exactly, otherwise the per-sample
    // genotype data in the raw block would be attributed to the wrong samples
    final List<String> outputSamples = nullAsEmpty(outputHeader.getSampleNamesInOrder());
    final List<String> blockSamples = nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder());
    if ( ! outputSamples.equals(blockSamples) )
        return false;

    // every ID line of the input header must appear at the same position, with the
    // same concrete type and ID, in the output header; extra trailing output lines are fine
    final List<? extends VCFIDHeaderLine> outputLines = outputHeader.getIDHeaderLines();
    final List<? extends VCFIDHeaderLine> blockLines = genotypesBlockHeader.getIDHeaderLines();
    if ( blockLines.size() > outputLines.size() ) // missing lines in output
        return false;

    for ( int i = 0; i < blockLines.size(); i++ ) {
        final VCFIDHeaderLine blockLine = blockLines.get(i);
        final VCFIDHeaderLine outputLine = outputLines.get(i);
        final boolean sameType = blockLine.getClass().equals(outputLine.getClass());
        if ( ! sameType || ! blockLine.getID().equals(outputLine.getID()) )
            return false;
    }

    return true;
}
/**
 * Null-safe list view: returns the argument itself when non-null, otherwise
 * an immutable empty list, so callers can compare lists without null checks.
 *
 * @param l a possibly-null list
 * @return l when non-null, otherwise Collections.emptyList(); never null
 */
private static <T> List<T> nullAsEmpty(final List<T> l) {
    return l == null ? Collections.<T>emptyList() : l;
}
}

View File

@ -47,8 +47,8 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
this.contigIndex = contigIndex;
}
public VCFContigHeaderLine(final String key, final Map<String, String> mapping, int contigIndex) {
super(key, mapping, null);
public VCFContigHeaderLine(final Map<String, String> mapping, int contigIndex) {
super(VCFHeader.CONTIG_KEY, mapping, null);
this.contigIndex = contigIndex;
}

View File

@ -53,10 +53,10 @@ public class VCFHeader {
// the associated meta data
private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
// the list of auxillary tags
@ -101,6 +101,15 @@ public class VCFHeader {
loadMetaDataMaps();
}
/**
 * Creates a shallow copy of the meta data in VCF header toCopy
 *
 * The header line objects themselves are shared, not cloned.
 * NOTE(review): this delegates to the metadata-only constructor, so sample names
 * do not appear to be carried over -- confirm against that constructor.
 *
 * @param toCopy the VCFHeader whose metadata lines are shared with the new header
 */
public VCFHeader(final VCFHeader toCopy) {
this(toCopy.mMetaData);
}
/**
* create a VCF header, given a list of meta data and auxillary tags
*
@ -153,12 +162,39 @@ public class VCFHeader {
}
/**
 * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
 */
public List<VCFContigHeaderLine> getContigLines() {
    // read-only view so callers cannot mutate the header's contig ordering
    final List<VCFContigHeaderLine> orderedContigs = contigMetaData;
    return Collections.unmodifiableList(orderedContigs);
}
/**
 * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
 */
public List<VCFFilterHeaderLine> getFilterLines() {
    // scan the ordered metadata set, keeping only the FILTER lines
    final List<VCFFilterHeaderLine> filterLines = new ArrayList<VCFFilterHeaderLine>();
    for ( final VCFHeaderLine line : mMetaData )
        if ( line instanceof VCFFilterHeaderLine )
            filterLines.add((VCFFilterHeaderLine) line);
    return filterLines;
}
/**
 * @return all of the header lines that carry an ID (FILTER, INFO, FORMAT, contig, ...)
 *         in their original file order, or an empty list if none were present
 */
public List<VCFIDHeaderLine> getIDHeaderLines() {
    // note: previous javadoc claimed FILTER lines (copy-paste); this returns every ID-bearing line
    final List<VCFIDHeaderLine> idLines = new ArrayList<VCFIDHeaderLine>();
    for ( VCFHeaderLine line : mMetaData ) {
        if ( line instanceof VCFIDHeaderLine ) {
            idLines.add((VCFIDHeaderLine)line);
        }
    }
    return idLines;
}
/**
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
* or the version is not present
@ -299,10 +335,16 @@ public class VCFHeader {
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
/**
 * Returns the INFO HeaderLines in their original ordering
 *
 * Note: this is a live view of the header's internal map (Map.values()), so it
 * reflects later changes to the header and must not be modified by callers.
 */
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
return mInfoMetaData.values();
}
/**
 * Returns the FORMAT HeaderLines in their original ordering
 *
 * Note: this is a live view of the header's internal map (Map.values()), so it
 * reflects later changes to the header and must not be modified by callers.
 */
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
return mFormatMetaData.values();
}
@ -390,4 +432,13 @@ public class VCFHeader {
public HashMap<String, Integer> getSampleNameToOffset() {
return sampleNameToOffset;
}
/**
 * Renders the header as "[VCFHeader:" followed by one tab-indented line per
 * metadata entry, closed by "]" on its own line.  Intended for debugging.
 */
@Override
public String toString() {
    final StringBuilder builder = new StringBuilder("[VCFHeader:");
    for ( final VCFHeaderLine metaDataLine : mMetaData ) {
        builder.append("\n\t");
        builder.append(metaDataLine);
    }
    builder.append("\n]");
    return builder.toString();
}
}

View File

@ -301,7 +301,7 @@ public class VCFUtils {
map.put("ID", contig.getSequenceName());
map.put("length", String.valueOf(contig.getSequenceLength()));
if ( assembly != null ) map.put("assembly", assembly);
return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex());
return new VCFContigHeaderLine(map, contig.getSequenceIndex());
}
private static String getReferenceAssembly(final String refPath) {

View File

@ -1351,7 +1351,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
final VCFHeader header,
final boolean lenientDecoding) {
final Map<String, Object> newAttributes = new HashMap<String, Object>(attributes.size());
final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
final String field = attr.getKey();

View File

@ -504,7 +504,7 @@ public class VariantContextUtils {
Byte referenceBaseForIndel = null;
final Set<Allele> alleles = new LinkedHashSet<Allele>();
final Set<String> filters = new TreeSet<String>();
final Set<String> filters = new HashSet<String>();
final Map<String, Object> attributes = new TreeMap<String, Object>();
final Set<String> inconsistentAttributes = new HashSet<String>();
final Set<String> variantSources = new HashSet<String>(); // contains the set of sources we found in our set of VCs that are variant
@ -656,7 +656,8 @@ public class VariantContextUtils {
builder.alleles(alleles);
builder.genotypes(genotypes);
builder.log10PError(log10PError);
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
builder.filters(filters.isEmpty() ? filters : new TreeSet<String>(filters));
builder.attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
// Trim the padded bases of all alleles if necessary
final VariantContext merged = builder.make();

View File

@ -124,7 +124,7 @@ public final class BCF2Encoder {
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>) v) : (String)v.get(0);
final String s = BCF2Utils.collapseStringList((List<String>) v);
v = stringToBytes(s);
}

View File

@ -335,7 +335,6 @@ public abstract class BCF2FieldEncoder {
else if (value instanceof List) {
final List<String> l = (List<String>)value;
if ( l.isEmpty() ) return "";
else if ( l.size() == 1 ) return (String)l.get(0);
else return BCF2Utils.collapseStringList(l);
} else
return (String)value;

View File

@ -87,14 +87,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
public static final int MAJOR_VERSION = 2;
public static final int MINOR_VERSION = 1;
/**
* If true, we will write out the undecoded raw bytes for a genotypes block, if it
* is found in the input VC. This can be very dangerous as the genotype encoding
* depends on the exact ordering of the header.
*
* TODO -- enable when the new smart VCF header code is created by Eric Banks
*/
private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false;
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
final private static boolean ALLOW_MISSING_CONTIG_LINES = false;
@ -108,6 +100,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
/**
* cached results for whether we can write out raw genotypes data.
*/
private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
this.outputStream = getOutputStream();
@ -247,13 +246,39 @@ class BCF2Writer extends IndexingVariantContextWriter {
return encoder.getRecordBytes();
}
/**
 * Can we safely write on the raw (undecoded) genotypes of an input VC?
 *
 * Caches one result per input header object: when lazyData.header is the same
 * object (reference equality) as lastVCFHeaderOfUnparsedGenotypes we reuse the
 * previous answer.  Otherwise we run the expensive BCF2Utils comparison of the
 * VC's header against our output header and cache the new result.
 *
 * @param lazyData the undecoded genotypes block together with the header it was encoded against
 * @return true if the raw genotype bytes can be emitted without decode/re-encode
 */
private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
// cache miss (different header object): recompute consistency and remember it
canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
}
return canPassOnUnparsedGenotypeDataForLastVCFHeader;
}
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
if ( vc.getGenotypes().isLazyWithData() ) {
LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData )
final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
//logger.info("Passing on raw BCF2 genotypes data");
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
else
} else {
//logger.info("Decoding raw BCF2 genotypes data");
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
}
}
return null;

View File

@ -0,0 +1,155 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import java.io.*;
import java.util.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/**
* Tests for BCF2Utils
*/
public final class BCF2UtilsUnitTest extends BaseTest {
// Each case: the input string list, its expected collapsed form, and whether the
// collapsed form should be recognized as a collapsed (comma-prefixed) string.
@DataProvider(name = "CollapseExpandTest")
public Object[][] makeCollapseExpandTest() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{Arrays.asList("A"), "A", false});
tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true});
tests.add(new Object[]{Arrays.asList("AB"), "AB", false});
tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true});
tests.add(new Object[]{Arrays.asList(), "", false});
return tests.toArray(new Object[][]{});
}
// Round trip: collapse must produce the expected string, isCollapsedString must
// classify it correctly, and exploding a collapsed string must recover the input.
@Test(dataProvider = "CollapseExpandTest")
public void testCollapseExpandTest(final List<String> in, final String expectedCollapsed, final boolean isCollapsed) {
final String actualCollapsed = BCF2Utils.collapseStringList(in);
Assert.assertEquals(actualCollapsed, expectedCollapsed);
Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed);
if ( isCollapsed )
Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in);
}
// Builds pairs of headers whose ID lines carry sequential integer IDs, then perturbs
// the test header (dropping input lines and/or appending extra lines in all
// permutations) so expectedConsistent() can predict the right answer from the IDs.
@DataProvider(name = "HeaderOrderTestProvider")
public Object[][] makeHeaderOrderTestProvider() {
final List<VCFHeaderLine> inputLines = new ArrayList<VCFHeaderLine>();
final List<VCFHeaderLine> extraLines = new ArrayList<VCFHeaderLine>();
// counter doubles as the line's ID, so input order == numeric ID order
int counter = 0;
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
final int inputLineCounter = counter;
final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(inputLines));
extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
extraLines.add(new VCFHeaderLine("x", "misc"));
extraLines.add(new VCFHeaderLine("y", "misc"));
List<Object[]> tests = new ArrayList<Object[]>();
// take 0..3 of the extra lines, in every unique-order permutation
for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) {
final List<VCFHeaderLine> empty = Collections.emptyList();
final List<List<VCFHeaderLine>> permutations = extrasToTake == 0
? Collections.singletonList(empty)
: Utils.makePermutations(extraLines, extrasToTake, false);
for ( final List<VCFHeaderLine> permutation : permutations ) {
// i == -1 means "drop nothing"; i >= 0 drops one input line
for ( int i = -1; i < inputLines.size(); i++ ) {
final List<VCFHeaderLine> allLines = new ArrayList<VCFHeaderLine>(inputLines);
if ( i >= 0 )
allLines.remove(i);
allLines.addAll(permutation);
final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(allLines));
final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter);
tests.add(new Object[]{inputHeader, testHeader, expectedConsistent});
}
}
}
// sample name tests
final List<List<String>> sampleNameTests = Arrays.asList(
new ArrayList<String>(),
Arrays.asList("A"),
Arrays.asList("A", "B"),
Arrays.asList("A", "B", "C"));
for ( final List<String> inSamples : sampleNameTests ) {
for ( final List<String> testSamples : sampleNameTests ) {
final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples);
final List<List<String>> permutations = testSamples.isEmpty()
? Collections.singletonList(testSamples)
: Utils.makePermutations(testSamples, testSamples.size(), false);
for ( final List<String> testSamplesPermutation : permutations ) {
final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation);
// headers are only consistent when the sample lists are identical, order included
final boolean expectedConsistent = testSamples.equals(inSamples);
tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent});
}
}
}
return tests.toArray(new Object[][]{});
}
// Oracle for the header-order cases: consistent iff the combination header's ID lines
// start with the IDs 0..minCounterForInputLines-1 in exactly that order.
private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) {
final List<Integer> ids = new ArrayList<Integer>();
for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) {
if ( line instanceof VCFIDHeaderLine ) {
ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID()));
}
}
// as long as the start contains all of the ids up to minCounterForInputLines in order
for ( int i = 0; i < minCounterForInputLines; i++ )
if ( i >= ids.size() || ids.get(i) != i )
return false;
return true;
}
//
// Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2
// even when the header file is slightly different
//
@Test(dataProvider = "HeaderOrderTestProvider")
public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) {
final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader);
Assert.assertEquals(actualOrderConsistency, expectedConsistent);
}
}

View File

@ -197,7 +197,7 @@ public class VariantContextTestProvider {
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
// prep the header
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
@ -889,7 +889,7 @@ public class VariantContextTestProvider {
}
private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) {
return Utils.makeCombinations(alleles, highestPloidy);
return Utils.makePermutations(alleles, highestPloidy, true);
}
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {