BCF2 optimizations; parallel CombineVariants
-- BCF2 now determines whether it can safely write out raw genotype blocks, which is true in the case where the VCF header of the input is a complete, ordered subset of the output header. Added utilities to determine this and extensive unit tests (headerLinesAreOrderedConsistently) -- Cleanup collapseStringList and exploreStringList for new unit tests of BCF2Utils. Fixed bug in edge case that never occurred in practice -- VCFContigHeaderLine now provides its own key (VCFHeader.CONTIG_KEY) directly instead of requiring the user to provide it (and hoping it's right) -- More ways to access the data in VCFHeader -- BCF2Writer uses a cache to avoid recomputing unnecessarily whether raw genotype blocks can be emitted directly into the output -- Optimization of fullyDecodeAttributes -- attributes.size() is expensive and unnecessary. We just guess that on average we need ~10 elements for the attribute map -- CombineVariants optimization -- filters are now a HashSet but are sorted at the end by creating a TreeSet -- makeCombinations is now makePermutations, and you can request to create the permutations with or without replacement
This commit is contained in:
parent
dafa7e3885
commit
669c43031a
|
|
@ -67,6 +67,12 @@ import java.util.*;
|
|||
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
|
||||
* in the detailed example on the wiki.
|
||||
*
|
||||
* Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
|
||||
* when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
|
||||
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
|
||||
* efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
|
||||
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more variant sets to combine.
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import net.sf.samtools.util.StringUtil;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.net.InetAddress;
|
||||
|
|
@ -742,19 +741,23 @@ public class Utils {
|
|||
* @param objects
|
||||
* @param n
|
||||
* @param <T>
|
||||
* @param withReplacement if false, the resulting permutations will only contain unique objects from objects
|
||||
* @return
|
||||
*/
|
||||
public static <T> List<List<T>> makeCombinations(final List<T> objects, final int n) {
|
||||
public static <T> List<List<T>> makePermutations(final List<T> objects, final int n, final boolean withReplacement) {
|
||||
final List<List<T>> combinations = new ArrayList<List<T>>();
|
||||
|
||||
if ( n == 1 ) {
|
||||
if ( n <= 0 )
|
||||
;
|
||||
else if ( n == 1 ) {
|
||||
for ( final T o : objects )
|
||||
combinations.add(Collections.singletonList(o));
|
||||
} else {
|
||||
final List<List<T>> sub = makeCombinations(objects, n - 1);
|
||||
final List<List<T>> sub = makePermutations(objects, n - 1, withReplacement);
|
||||
for ( List<T> subI : sub ) {
|
||||
for ( final T a : objects ) {
|
||||
combinations.add(Utils.cons(a, subI));
|
||||
if ( withReplacement || ! subI.contains(a) )
|
||||
combinations.add(Utils.cons(a, subI));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -423,9 +423,8 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
final LazyGenotypesContext.LazyParser lazyParser =
|
||||
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields, builders);
|
||||
|
||||
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
|
||||
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
|
||||
header.getNGenotypeSamples());
|
||||
final LazyData lazyData = new LazyData(header, siteInfo.nFormatFields, decoder.getRecordBytes());
|
||||
final LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, lazyData, header.getNGenotypeSamples());
|
||||
|
||||
// did we resort the sample names? If so, we need to load the genotype data
|
||||
if ( !header.samplesWereAlreadySorted() )
|
||||
|
|
@ -436,11 +435,13 @@ public final class BCF2Codec implements FeatureCodec<VariantContext> {
|
|||
}
|
||||
|
||||
public static class LazyData {
|
||||
final public VCFHeader header;
|
||||
final public int nGenotypeFields;
|
||||
final public byte[] bytes;
|
||||
|
||||
@Requires({"nGenotypeFields > 0", "bytes != null"})
|
||||
public LazyData(final int nGenotypeFields, final byte[] bytes) {
|
||||
public LazyData(final VCFHeader header, final int nGenotypeFields, final byte[] bytes) {
|
||||
this.header = header;
|
||||
this.nGenotypeFields = nGenotypeFields;
|
||||
this.bytes = bytes;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ import java.util.*;
|
|||
* @author Mark DePristo
|
||||
* @since 5/12
|
||||
*/
|
||||
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
public class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
|
||||
final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);
|
||||
|
||||
// the essential information for us to use to decode the genotypes data
|
||||
|
|
|
|||
|
|
@ -131,17 +131,21 @@ public final class BCF2Utils {
|
|||
* @param strings size > 1 list of strings
|
||||
* @return
|
||||
*/
|
||||
@Requires({"strings != null", "strings.size() > 1"})
|
||||
@Requires({"strings != null"})
|
||||
@Ensures("result != null")
|
||||
public static String collapseStringList(final List<String> strings) {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
for ( final String s : strings ) {
|
||||
if ( s != null ) {
|
||||
assert s.indexOf(",") == -1; // no commas in individual strings
|
||||
b.append(",").append(s);
|
||||
if ( strings.isEmpty() ) return "";
|
||||
else if ( strings.size() == 1 ) return strings.get(0);
|
||||
else {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
for ( final String s : strings ) {
|
||||
if ( s != null ) {
|
||||
assert s.indexOf(",") == -1; // no commas in individual strings
|
||||
b.append(",").append(s);
|
||||
}
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -163,7 +167,7 @@ public final class BCF2Utils {
|
|||
|
||||
@Requires("s != null")
|
||||
public static boolean isCollapsedString(final String s) {
|
||||
return s.charAt(0) == ',';
|
||||
return s.length() > 0 && s.charAt(0) == ',';
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -280,4 +284,49 @@ public final class BCF2Utils {
|
|||
else if ( o instanceof List ) return (List<Object>)o;
|
||||
else return Collections.singletonList(o);
|
||||
}
|
||||
|
||||
/**
|
||||
* Are the elements and their order in the output and input headers consistent so that
|
||||
* we can write out the raw genotypes block without decoding and recoding it?
|
||||
*
|
||||
* If the order of INFO, FILTER, or contig elements in the output header is different than
|
||||
* in the input header we must decode the blocks using the input header and then recode them
|
||||
* based on the new output order.
|
||||
*
|
||||
* If they are consistent, we can simply pass through the raw genotypes block bytes, which is
|
||||
* a *huge* performance win for large blocks.
|
||||
*
|
||||
* Many common operations on BCF2 files (merging them for -nt, selecting a subset of records, etc)
|
||||
* don't modify the ordering of the header fields and so can safely pass through the genotypes
|
||||
* undecoded. Some operations -- those that add filters or info fields -- can change the ordering
|
||||
* of the header fields and so produce invalid BCF2 files if the genotypes aren't decoded
|
||||
*/
|
||||
public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHeader, final VCFHeader genotypesBlockHeader) {
|
||||
// first, we have to have the same samples in the same order
|
||||
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) )
|
||||
return false;
|
||||
|
||||
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator();
|
||||
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator();
|
||||
|
||||
while ( inputLinesIt.hasNext() ) {
|
||||
if ( ! outputLinesIt.hasNext() ) // missing lines in output
|
||||
return false;
|
||||
|
||||
final VCFIDHeaderLine outputLine = outputLinesIt.next();
|
||||
final VCFIDHeaderLine inputLine = inputLinesIt.next();
|
||||
|
||||
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static <T> List<T> nullAsEmpty(List<T> l) {
|
||||
if ( l == null )
|
||||
return Collections.emptyList();
|
||||
else
|
||||
return l;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,8 +47,8 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
|
|||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
public VCFContigHeaderLine(final String key, final Map<String, String> mapping, int contigIndex) {
|
||||
super(key, mapping, null);
|
||||
public VCFContigHeaderLine(final Map<String, String> mapping, int contigIndex) {
|
||||
super(VCFHeader.CONTIG_KEY, mapping, null);
|
||||
this.contigIndex = contigIndex;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -53,10 +53,10 @@ public class VCFHeader {
|
|||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new HashMap<String, VCFFilterHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
|
||||
private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();
|
||||
|
||||
// the list of auxillary tags
|
||||
|
|
@ -101,6 +101,15 @@ public class VCFHeader {
|
|||
loadMetaDataMaps();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a shallow copy of the meta data in VCF header toCopy
|
||||
*
|
||||
* @param toCopy
|
||||
*/
|
||||
public VCFHeader(final VCFHeader toCopy) {
|
||||
this(toCopy.mMetaData);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
|
|
@ -153,12 +162,39 @@ public class VCFHeader {
|
|||
}
|
||||
|
||||
/**
|
||||
* @return all of the VCF header lines of the ##contig form in order, or an empty set if none were present
|
||||
* @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFContigHeaderLine> getContigLines() {
|
||||
return Collections.unmodifiableList(contigMetaData);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFFilterHeaderLine> getFilterLines() {
|
||||
final List<VCFFilterHeaderLine> filters = new ArrayList<VCFFilterHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFFilterHeaderLine ) {
|
||||
filters.add((VCFFilterHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all of the VCF ID header lines (FILTER, INFO, FORMAT, contig) in their original file order, or an empty list if none were present
|
||||
*/
|
||||
public List<VCFIDHeaderLine> getIDHeaderLines() {
|
||||
final List<VCFIDHeaderLine> filters = new ArrayList<VCFIDHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
filters.add((VCFIDHeaderLine)line);
|
||||
}
|
||||
}
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
|
||||
* or the version is not present
|
||||
|
|
@ -299,10 +335,16 @@ public class VCFHeader {
|
|||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the INFO HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
|
||||
return mInfoMetaData.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the FORMAT HeaderLines in their original ordering
|
||||
*/
|
||||
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
|
||||
return mFormatMetaData.values();
|
||||
}
|
||||
|
|
@ -390,4 +432,13 @@ public class VCFHeader {
|
|||
public HashMap<String, Integer> getSampleNameToOffset() {
|
||||
return sampleNameToOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder();
|
||||
b.append("[VCFHeader:");
|
||||
for ( final VCFHeaderLine line : mMetaData )
|
||||
b.append("\n\t").append(line);
|
||||
return b.append("\n]").toString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -301,7 +301,7 @@ public class VCFUtils {
|
|||
map.put("ID", contig.getSequenceName());
|
||||
map.put("length", String.valueOf(contig.getSequenceLength()));
|
||||
if ( assembly != null ) map.put("assembly", assembly);
|
||||
return new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, map, contig.getSequenceIndex());
|
||||
return new VCFContigHeaderLine(map, contig.getSequenceIndex());
|
||||
}
|
||||
|
||||
private static String getReferenceAssembly(final String refPath) {
|
||||
|
|
|
|||
|
|
@ -1351,7 +1351,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
|
|||
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
|
||||
final VCFHeader header,
|
||||
final boolean lenientDecoding) {
|
||||
final Map<String, Object> newAttributes = new HashMap<String, Object>(attributes.size());
|
||||
final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
|
||||
|
||||
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
|
||||
final String field = attr.getKey();
|
||||
|
|
|
|||
|
|
@ -504,7 +504,7 @@ public class VariantContextUtils {
|
|||
Byte referenceBaseForIndel = null;
|
||||
|
||||
final Set<Allele> alleles = new LinkedHashSet<Allele>();
|
||||
final Set<String> filters = new TreeSet<String>();
|
||||
final Set<String> filters = new HashSet<String>();
|
||||
final Map<String, Object> attributes = new TreeMap<String, Object>();
|
||||
final Set<String> inconsistentAttributes = new HashSet<String>();
|
||||
final Set<String> variantSources = new HashSet<String>(); // contains the set of sources we found in our set of VCs that are variant
|
||||
|
|
@ -656,7 +656,8 @@ public class VariantContextUtils {
|
|||
builder.alleles(alleles);
|
||||
builder.genotypes(genotypes);
|
||||
builder.log10PError(log10PError);
|
||||
builder.filters(filters).attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||
builder.filters(filters.isEmpty() ? filters : new TreeSet<String>(filters));
|
||||
builder.attributes(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes);
|
||||
|
||||
// Trim the padded bases of all alleles if necessary
|
||||
final VariantContext merged = builder.make();
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ public final class BCF2Encoder {
|
|||
@Ensures("encodeStream.size() > old(encodeStream.size())")
|
||||
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
|
||||
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
|
||||
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>) v) : (String)v.get(0);
|
||||
final String s = BCF2Utils.collapseStringList((List<String>) v);
|
||||
v = stringToBytes(s);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -335,7 +335,6 @@ public abstract class BCF2FieldEncoder {
|
|||
else if (value instanceof List) {
|
||||
final List<String> l = (List<String>)value;
|
||||
if ( l.isEmpty() ) return "";
|
||||
else if ( l.size() == 1 ) return (String)l.get(0);
|
||||
else return BCF2Utils.collapseStringList(l);
|
||||
} else
|
||||
return (String)value;
|
||||
|
|
|
|||
|
|
@ -87,14 +87,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
public static final int MAJOR_VERSION = 2;
|
||||
public static final int MINOR_VERSION = 1;
|
||||
|
||||
/**
|
||||
* If true, we will write out the undecoded raw bytes for a genotypes block, if it
|
||||
* is found in the input VC. This can be very dangerous as the genotype encoding
|
||||
* depends on the exact ordering of the header.
|
||||
*
|
||||
* TODO -- enable when the new smart VCF header code is created by Eric Banks
|
||||
*/
|
||||
private final static boolean WRITE_UNDECODED_GENOTYPE_BLOCK = false;
|
||||
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
|
||||
final private static boolean ALLOW_MISSING_CONTIG_LINES = false;
|
||||
|
||||
|
|
@ -108,6 +100,13 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
|
||||
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
|
||||
|
||||
/**
|
||||
* cached results for whether we can write out raw genotypes data.
|
||||
*/
|
||||
private VCFHeader lastVCFHeaderOfUnparsedGenotypes = null;
|
||||
private boolean canPassOnUnparsedGenotypeDataForLastVCFHeader = false;
|
||||
|
||||
|
||||
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
|
||||
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
|
||||
this.outputStream = getOutputStream();
|
||||
|
|
@ -247,13 +246,39 @@ class BCF2Writer extends IndexingVariantContextWriter {
|
|||
return encoder.getRecordBytes();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Can we safely write on the raw (undecoded) genotypes of an input VC?
|
||||
*
|
||||
* The cache depends on the undecoded lazy data header == lastVCFHeaderOfUnparsedGenotypes, in
|
||||
* which case we return the previous result. If it's not cached, we use the BCF2Util to
|
||||
* compare the VC header with our header (expensive) and cache it.
|
||||
*
|
||||
* @param lazyData
|
||||
* @return
|
||||
*/
|
||||
private boolean canSafelyWriteRawGenotypesBytes(final BCF2Codec.LazyData lazyData) {
|
||||
if ( lazyData.header != lastVCFHeaderOfUnparsedGenotypes ) {
|
||||
// result is not yet cached for this header -- recompute the consistency check and cache it
|
||||
canPassOnUnparsedGenotypeDataForLastVCFHeader = BCF2Utils.headerLinesAreOrderedConsistently(this.header,lazyData.header);
|
||||
lastVCFHeaderOfUnparsedGenotypes = lazyData.header;
|
||||
}
|
||||
|
||||
return canPassOnUnparsedGenotypeDataForLastVCFHeader;
|
||||
}
|
||||
|
||||
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
|
||||
if ( vc.getGenotypes().isLazyWithData() ) {
|
||||
LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
|
||||
if ( WRITE_UNDECODED_GENOTYPE_BLOCK && lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData )
|
||||
final LazyGenotypesContext lgc = (LazyGenotypesContext)vc.getGenotypes();
|
||||
|
||||
if ( lgc.getUnparsedGenotypeData() instanceof BCF2Codec.LazyData &&
|
||||
canSafelyWriteRawGenotypesBytes((BCF2Codec.LazyData) lgc.getUnparsedGenotypeData())) {
|
||||
//logger.info("Passing on raw BCF2 genotypes data");
|
||||
return (BCF2Codec.LazyData)lgc.getUnparsedGenotypeData();
|
||||
else
|
||||
} else {
|
||||
//logger.info("Decoding raw BCF2 genotypes data");
|
||||
lgc.decode(); // WARNING -- required to avoid keeping around bad lazy data for too long
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,155 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.codecs.bcf2;
|
||||
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Tests for BCF2Utils
|
||||
*/
|
||||
public final class BCF2UtilsUnitTest extends BaseTest {
|
||||
@DataProvider(name = "CollapseExpandTest")
|
||||
public Object[][] makeCollapseExpandTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
tests.add(new Object[]{Arrays.asList("A"), "A", false});
|
||||
tests.add(new Object[]{Arrays.asList("A", "B"), ",A,B", true});
|
||||
tests.add(new Object[]{Arrays.asList("AB"), "AB", false});
|
||||
tests.add(new Object[]{Arrays.asList("AB", "C"), ",AB,C", true});
|
||||
tests.add(new Object[]{Arrays.asList(), "", false});
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CollapseExpandTest")
|
||||
public void testCollapseExpandTest(final List<String> in, final String expectedCollapsed, final boolean isCollapsed) {
|
||||
final String actualCollapsed = BCF2Utils.collapseStringList(in);
|
||||
Assert.assertEquals(actualCollapsed, expectedCollapsed);
|
||||
Assert.assertEquals(BCF2Utils.isCollapsedString(actualCollapsed), isCollapsed);
|
||||
if ( isCollapsed )
|
||||
Assert.assertEquals(BCF2Utils.explodeStringList(actualCollapsed), in);
|
||||
}
|
||||
|
||||
@DataProvider(name = "HeaderOrderTestProvider")
|
||||
public Object[][] makeHeaderOrderTestProvider() {
|
||||
final List<VCFHeaderLine> inputLines = new ArrayList<VCFHeaderLine>();
|
||||
final List<VCFHeaderLine> extraLines = new ArrayList<VCFHeaderLine>();
|
||||
|
||||
int counter = 0;
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
final int inputLineCounter = counter;
|
||||
final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(inputLines));
|
||||
|
||||
extraLines.add(new VCFFilterHeaderLine(String.valueOf(counter++)));
|
||||
extraLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter));
|
||||
extraLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x"));
|
||||
extraLines.add(new VCFHeaderLine("x", "misc"));
|
||||
extraLines.add(new VCFHeaderLine("y", "misc"));
|
||||
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
for ( final int extrasToTake : Arrays.asList(0, 1, 2, 3) ) {
|
||||
final List<VCFHeaderLine> empty = Collections.emptyList();
|
||||
final List<List<VCFHeaderLine>> permutations = extrasToTake == 0
|
||||
? Collections.singletonList(empty)
|
||||
: Utils.makePermutations(extraLines, extrasToTake, false);
|
||||
for ( final List<VCFHeaderLine> permutation : permutations ) {
|
||||
for ( int i = -1; i < inputLines.size(); i++ ) {
|
||||
final List<VCFHeaderLine> allLines = new ArrayList<VCFHeaderLine>(inputLines);
|
||||
if ( i >= 0 )
|
||||
allLines.remove(i);
|
||||
allLines.addAll(permutation);
|
||||
final VCFHeader testHeader = new VCFHeader(new LinkedHashSet<VCFHeaderLine>(allLines));
|
||||
final boolean expectedConsistent = expectedConsistent(testHeader, inputLineCounter);
|
||||
tests.add(new Object[]{inputHeader, testHeader, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sample name tests
|
||||
final List<List<String>> sampleNameTests = Arrays.asList(
|
||||
new ArrayList<String>(),
|
||||
Arrays.asList("A"),
|
||||
Arrays.asList("A", "B"),
|
||||
Arrays.asList("A", "B", "C"));
|
||||
for ( final List<String> inSamples : sampleNameTests ) {
|
||||
for ( final List<String> testSamples : sampleNameTests ) {
|
||||
final VCFHeader inputHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), inSamples);
|
||||
|
||||
final List<List<String>> permutations = testSamples.isEmpty()
|
||||
? Collections.singletonList(testSamples)
|
||||
: Utils.makePermutations(testSamples, testSamples.size(), false);
|
||||
for ( final List<String> testSamplesPermutation : permutations ) {
|
||||
final VCFHeader testHeaderWithSamples = new VCFHeader(inputHeader.getMetaDataInInputOrder(), testSamplesPermutation);
|
||||
final boolean expectedConsistent = testSamples.equals(inSamples);
|
||||
tests.add(new Object[]{inputHeaderWithSamples, testHeaderWithSamples, expectedConsistent});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private static boolean expectedConsistent(final VCFHeader combinationHeader, final int minCounterForInputLines) {
|
||||
final List<Integer> ids = new ArrayList<Integer>();
|
||||
for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) {
|
||||
if ( line instanceof VCFIDHeaderLine ) {
|
||||
ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID()));
|
||||
}
|
||||
}
|
||||
|
||||
// as long as the start contains all of the ids up to minCounterForInputLines in order
|
||||
for ( int i = 0; i < minCounterForInputLines; i++ )
|
||||
if ( i >= ids.size() || ids.get(i) != i )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// Test to make sure that we detect correctly the case where we can preserve the genotypes data in a BCF2
|
||||
// even when the header file is slightly different
|
||||
//
|
||||
@Test(dataProvider = "HeaderOrderTestProvider")
|
||||
public void testHeaderOrder(final VCFHeader inputHeader, final VCFHeader testHeader, final boolean expectedConsistent) {
|
||||
final boolean actualOrderConsistency = BCF2Utils.headerLinesAreOrderedConsistently(testHeader, inputHeader);
|
||||
Assert.assertEquals(actualOrderConsistency, expectedConsistent);
|
||||
}
|
||||
}
|
||||
|
|
@ -197,7 +197,7 @@ public class VariantContextTestProvider {
|
|||
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
|
||||
|
||||
// prep the header
|
||||
metaData.add(new VCFContigHeaderLine(VCFHeader.CONTIG_KEY, Collections.singletonMap("ID", "1"), 0));
|
||||
metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));
|
||||
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER1"));
|
||||
metaData.add(new VCFFilterHeaderLine("FILTER2"));
|
||||
|
|
@ -889,7 +889,7 @@ public class VariantContextTestProvider {
|
|||
}
|
||||
|
||||
private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) {
|
||||
return Utils.makeCombinations(alleles, highestPloidy);
|
||||
return Utils.makePermutations(alleles, highestPloidy, true);
|
||||
}
|
||||
|
||||
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue