/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.variant.variantcontext;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.variant.bcf2.BCF2Codec;
import org.broadinstitute.variant.vcf.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.variantcontext.writer.Options;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.testng.Assert;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;
/**
 * Routines for generating a wide variety of VariantContexts for testing,
 * both synthetic (hand-constructed) and empirical (read from VCF files).
 */
public class VariantContextTestProvider {
// Shared logger for progress messages while building test fixtures.
final protected static Logger logger = Logger.getLogger(VariantContextTestProvider.class);
// Feature toggles: flip any of these to false to disable a family of generated tests.
final private static boolean ENABLE_GENOTYPE_TESTS = true;
final private static boolean ENABLE_A_AND_G_TESTS = true;
final private static boolean ENABLE_VARARRAY_TESTS = true;
final private static boolean ENABLE_PLOIDY_TESTS = true;
final private static boolean ENABLE_PL_TESTS = true;
final private static boolean ENABLE_SYMBOLIC_ALLELE_TESTS = true;
final private static boolean ENABLE_SOURCE_VCF_TESTS = true;
final private static boolean ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS = true;
// Canned 20-element list for exercising fixed-count (Number=20) header fields.
final private static List<Integer> TWENTY_INTS = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
// Header used by all synthetic tests; built once by createSyntheticHeader().
private static VCFHeader syntheticHeader;
// Accumulator for every generated test case; populated by initializeTests().
final static List<VariantContextTestData> TEST_DATAs = new ArrayList<VariantContextTestData>();
// Template site (1:10 A/C) that builder() copies for each synthetic test.
private static VariantContext ROOT;
// Real VCF files on disk used by the empirical tests.
private final static List<File> testSourceVCFs = new ArrayList<File>();
static {
testSourceVCFs.add(new File(BaseTest.privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf"));
testSourceVCFs.add(new File(BaseTest.privateTestDir + "ex2.vcf"));
testSourceVCFs.add(new File(BaseTest.privateTestDir + "dbsnp_135.b37.1000.vcf"));
if ( ENABLE_SYMBOLIC_ALLELE_TESTS ) {
testSourceVCFs.add(new File(BaseTest.privateTestDir + "diagnosis_targets_testfile.vcf"));
testSourceVCFs.add(new File(BaseTest.privateTestDir + "VQSR.mixedTest.recal"));
}
}
/**
 * Harness describing how to read and write VariantContexts for one concrete
 * file format under test (e.g. VCF or BCF2). Subclasses supply the codec and
 * writer; the pre/post hooks let a format adjust records around the I/O step.
 */
public abstract static class VariantContextIOTest {
    @Override
    public String toString() {
        final StringBuilder description = new StringBuilder("VariantContextIOTest:");
        description.append(getExtension());
        return description.toString();
    }

    /** File extension produced/consumed by this format. */
    public abstract String getExtension();

    /** Codec capable of decoding records of this format. */
    public abstract FeatureCodec<VariantContext> makeCodec();

    /** Writer that emits records of this format to {@code outputFile}. */
    public abstract VariantContextWriter makeWriter(final File outputFile, final EnumSet<Options> baseOptions);

    /** Hook: transform records before writing; identity by default. */
    public List<VariantContext> preprocess(final VCFHeader header, List<VariantContext> vcsBeforeIO) {
        return vcsBeforeIO;
    }

    /** Hook: transform records after reading back; identity by default. */
    public List<VariantContext> postprocess(final VCFHeader header, List<VariantContext> vcsAfterIO) {
        return vcsAfterIO;
    }
}
public static class VariantContextTestData {
public final VCFHeader header;
public List<VariantContext> vcs;
public VariantContextTestData(final VCFHeader header, final VariantContextBuilder builder) {
this(header, Collections.singletonList(builder.fullyDecoded(true).make()));
}
public VariantContextTestData(final VCFHeader header, final List<VariantContext> vcs) {
final Set<String> samples = new HashSet<String>();
for ( final VariantContext vc : vcs )
if ( vc.hasGenotypes() )
samples.addAll(vc.getSampleNames());
this.header = samples.isEmpty() ? header : new VCFHeader(header.getMetaDataInSortedOrder(), samples);
this.vcs = vcs;
}
public boolean hasGenotypes() {
return vcs.get(0).hasGenotypes();
}
public String toString() {
StringBuilder b = new StringBuilder();
b.append("VariantContextTestData: [");
final VariantContext vc = vcs.get(0);
final VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.noGenotypes();
2012-06-06 02:36:10 +08:00
b.append(builder.make().toString());
if ( vc.getNSamples() < 5 ) {
for ( final Genotype g : vc.getGenotypes() )
b.append(g.toString());
} else {
b.append(" nGenotypes = ").append(vc.getNSamples());
}
if ( vcs.size() > 1 ) b.append(" ----- with another ").append(vcs.size() - 1).append(" VariantContext records");
b.append("]");
return b.toString();
}
}
/** Returns a fresh builder seeded from the shared ROOT template site. */
private final static VariantContextBuilder builder() {
    final VariantContextBuilder rootCopy = new VariantContextBuilder(ROOT);
    return rootCopy;
}
/** Registers one synthetic test case built from the given builder. */
private final static void add(VariantContextBuilder vcb) {
    final VariantContextTestData testCase = new VariantContextTestData(syntheticHeader, vcb);
    TEST_DATAs.add(testCase);
}
/**
 * Builds the full test suite: the synthetic header first (the synthetic
 * tests depend on it), then the hand-constructed cases, then cases read
 * from real VCF files on disk.
 *
 * @throws IOException if an empirical source VCF cannot be read
 */
public static void initializeTests() throws IOException {
createSyntheticHeader();
makeSyntheticTests();
makeEmpiricalTests();
}
private static void makeEmpiricalTests() throws IOException {
if ( ENABLE_SOURCE_VCF_TESTS ) {
for ( final File file : testSourceVCFs ) {
VCFCodec codec = new VCFCodec();
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
Pair<VCFHeader, Iterable<VariantContext>> x = readAllVCs( file, codec );
List<VariantContext> fullyDecoded = new ArrayList<VariantContext>();
logger.warn("Reading records from " + file);
for ( final VariantContext raw : x.getSecond() ) {
if ( raw != null )
fullyDecoded.add(raw.fullyDecode(x.getFirst(), false));
}
logger.warn("Done reading " + file);
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
TEST_DATAs.add(new VariantContextTestData(x.getFirst(), fullyDecoded));
}
}
}
/**
 * Declares a synthetic field with a fixed count: always as an INFO line and,
 * unless it is a Flag, also as a FORMAT line (FORMAT fields cannot be flags).
 */
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final int count, final VCFHeaderLineType type) {
    metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
    if ( type == VCFHeaderLineType.Flag )
        return;
    metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
}
/**
 * Declares a synthetic field with a special count (A, G, UNBOUNDED, ...):
 * always as an INFO line and, unless it is a Flag, also as a FORMAT line.
 */
private final static void addHeaderLine(final Set<VCFHeaderLine> metaData, final String id, final VCFHeaderLineCount count, final VCFHeaderLineType type) {
    metaData.add(new VCFInfoHeaderLine(id, count, type, "x"));
    if ( type == VCFHeaderLineType.Flag )
        return;
    metaData.add(new VCFFormatHeaderLine(id, count, type, "x"));
}
/**
 * Builds the header shared by every synthetic test. It declares one field
 * for each VCF type (String/Integer/Float/Flag) at each interesting count
 * (fixed 1/3/20, Number=A, Number=G, unbounded), plus the standard GT/GQ/FT
 * genotype keys, a contig line, and two named filters.
 */
private static void createSyntheticHeader() {
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String);
addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String);
addHeaderLine(metaData, "STRING20", 20, VCFHeaderLineType.String);
addHeaderLine(metaData, "VAR.INFO.STRING", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
// genotype-level keys
addHeaderLine(metaData, "GT", 1, VCFHeaderLineType.String);
addHeaderLine(metaData, "GQ", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "ADA", VCFHeaderLineCount.A, VCFHeaderLineType.Integer); // one value per alt allele
addHeaderLine(metaData, "PL", VCFHeaderLineCount.G, VCFHeaderLineType.Integer);  // one value per genotype
addHeaderLine(metaData, "GS", 2, VCFHeaderLineType.String);
addHeaderLine(metaData, "GV", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String);
addHeaderLine(metaData, "FT", 1, VCFHeaderLineType.String);
// prep the header
metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0));
metaData.add(new VCFFilterHeaderLine("FILTER1"));
metaData.add(new VCFFilterHeaderLine("FILTER2"));
addHeaderLine(metaData, "INT1", 1, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT3", 3, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT20", 20, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "INT.VAR", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer);
addHeaderLine(metaData, "FLOAT1", 1, VCFHeaderLineType.Float);
addHeaderLine(metaData, "FLOAT3", 3, VCFHeaderLineType.Float);
addHeaderLine(metaData, "FLAG", 0, VCFHeaderLineType.Flag);
syntheticHeader = new VCFHeader(metaData);
}
private static void makeSyntheticTests() {
VariantContextBuilder rootBuilder = new VariantContextBuilder();
rootBuilder.source("test");
rootBuilder.loc("1", 10, 10);
rootBuilder.alleles("A", "C");
rootBuilder.unfiltered();
ROOT = rootBuilder.make();
add(builder());
add(builder().alleles("A"));
add(builder().alleles("A", "C", "T"));
add(builder().alleles("A", "AC"));
add(builder().alleles("A", "ACAGT"));
add(builder().loc("1", 10, 11).alleles("AC", "A"));
add(builder().loc("1", 10, 13).alleles("ACGT", "A"));
// make sure filters work
add(builder().unfiltered());
add(builder().passFilters());
add(builder().filters("FILTER1"));
add(builder().filters("FILTER1", "FILTER2"));
add(builder().log10PError(VariantContext.NO_LOG10_PERROR));
add(builder().log10PError(-1));
add(builder().log10PError(-1.234e6));
add(builder().noID());
add(builder().id("rsID12345"));
add(builder().attribute("INT1", 1));
add(builder().attribute("INT1", 100));
add(builder().attribute("INT1", 1000));
add(builder().attribute("INT1", 100000));
add(builder().attribute("INT1", null));
add(builder().attribute("INT3", Arrays.asList(1, 2, 3)));
add(builder().attribute("INT3", Arrays.asList(1000, 2000, 3000)));
add(builder().attribute("INT3", Arrays.asList(100000, 200000, 300000)));
add(builder().attribute("INT3", null));
add(builder().attribute("INT20", TWENTY_INTS));
add(builder().attribute("FLOAT1", 1.0));
add(builder().attribute("FLOAT1", 100.0));
add(builder().attribute("FLOAT1", 1000.0));
add(builder().attribute("FLOAT1", 100000.0));
add(builder().attribute("FLOAT1", null));
add(builder().attribute("FLOAT3", Arrays.asList(1.0, 2.0, 3.0)));
add(builder().attribute("FLOAT3", Arrays.asList(1000.0, 2000.0, 3000.0)));
add(builder().attribute("FLOAT3", Arrays.asList(100000.0, 200000.0, 300000.0)));
add(builder().attribute("FLOAT3", null));
add(builder().attribute("FLAG", true));
//add(builder().attribute("FLAG", false)); // NOTE -- VCF doesn't allow false flags
add(builder().attribute("STRING1", "s1"));
add(builder().attribute("STRING1", null));
add(builder().attribute("STRING3", Arrays.asList("s1", "s2", "s3")));
add(builder().attribute("STRING3", null));
add(builder().attribute("STRING20", Arrays.asList("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20")));
add(builder().attribute("VAR.INFO.STRING", "s1"));
add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2")));
add(builder().attribute("VAR.INFO.STRING", Arrays.asList("s1", "s2", "s3")));
add(builder().attribute("VAR.INFO.STRING", null));
if ( ENABLE_GENOTYPE_TESTS ) {
addGenotypesToTestData();
addComplexGenotypesTest();
}
2012-06-06 02:36:10 +08:00
if ( ENABLE_A_AND_G_TESTS )
addGenotypesAndGTests();
if ( ENABLE_SYMBOLIC_ALLELE_TESTS )
addSymbolicAlleleTests();
}
/**
 * Symbolic-allele cases: the stop coordinate must round-trip correctly both
 * with and without an explicit END INFO attribute.
 */
private static void addSymbolicAlleleTests() {
    final VariantContextBuilder withEndAttribute =
            builder().alleles("N", "<VQSR>").start(10).stop(11).attribute("END", 11);
    add(withEndAttribute);
    final VariantContextBuilder withoutEndAttribute =
            builder().alleles("N", "<VQSR>").start(10).stop(10);
    add(withoutEndAttribute);
}
/**
 * Attaches the full genotype battery to each of the core site shapes
 * (monomorphic, multi-allelic SNP, short insertion, long insertion).
 */
private static void addGenotypesToTestData() {
    final List<VariantContext> siteShapes = Arrays.asList(
            builder().alleles("A").make(),
            builder().alleles("A", "C", "T").make(),
            builder().alleles("A", "AC").make(),
            builder().alleles("A", "ACAGT").make());
    for ( final VariantContext site : siteShapes )
        addGenotypes(site);
}
/**
 * Registers several test cases from one site and a set of genotypes:
 * first the leading genotype alone, then all genotypes together, then all
 * genotypes with the last one replicated 10x/100x/1000x to test scaling.
 */
private static void addGenotypeTests( final VariantContext site, Genotype ... genotypes ) {
    final VariantContextBuilder vcb = new VariantContextBuilder(site);

    // single-genotype context
    add(vcb.genotypes(genotypes[0]));

    if ( genotypes.length < 2 )
        return;

    // all genotypes at once
    add(vcb.genotypes(Arrays.asList(genotypes)));

    // replicate the final genotype under fresh sample names to stress scaling
    final Genotype template = genotypes[genotypes.length - 1];
    for ( final int nCopies : Arrays.asList(10, 100, 1000) ) {
        final GenotypesContext expanded = new GenotypesContext();
        for ( int i = 0; i < genotypes.length - 1; i++ )
            expanded.add(genotypes[i]);
        for ( int copy = 0; copy < nCopies; copy++ )
            expanded.add(new GenotypeBuilder(template).name("copy" + copy).make());
        add(vcb.genotypes(expanded));
    }
}
private static void addGenotypes( final VariantContext site) {
// test ref/ref
final Allele ref = site.getReference();
final Allele alt1 = site.getNAlleles() > 1 ? site.getAlternateAllele(0) : null;
final Genotype homRef = GenotypeBuilder.create("homRef", Arrays.asList(ref, ref));
addGenotypeTests(site, homRef);
if ( alt1 != null ) {
final Genotype het = GenotypeBuilder.create("het", Arrays.asList(ref, alt1));
final Genotype homVar = GenotypeBuilder.create("homVar", Arrays.asList(alt1, alt1));
addGenotypeTests(site, homRef, het);
addGenotypeTests(site, homRef, het, homVar);
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
// test no GT at all
addGenotypeTests(site, new GenotypeBuilder("noGT", new ArrayList<Allele>(0)).attribute("INT1", 10).make());
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
final List<Allele> noCall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
// ploidy
if ( ENABLE_PLOIDY_TESTS ) {
addGenotypeTests(site,
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("hap", Arrays.asList(ref)));
addGenotypeTests(site,
GenotypeBuilder.create("noCall", noCall),
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("hap", Arrays.asList(ref)));
addGenotypeTests(site,
GenotypeBuilder.create("noCall", noCall),
GenotypeBuilder.create("noCall2", noCall),
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("hap", Arrays.asList(ref)));
addGenotypeTests(site,
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));
addGenotypeTests(site,
GenotypeBuilder.create("noCall", noCall),
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));
addGenotypeTests(site,
GenotypeBuilder.create("noCall", noCall),
GenotypeBuilder.create("noCall2", noCall),
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));
addGenotypeTests(site,
GenotypeBuilder.create("nocall", noCall),
GenotypeBuilder.create("dip", Arrays.asList(ref, alt1)),
GenotypeBuilder.create("tet", Arrays.asList(ref, alt1, alt1)));
}
Finalizing BCF2 mark III commit -- Moved GENOTYPE_KEY vcf header line to VCFConstants. This general migration and cleanup is on Eric's plate now -- Updated HC to initialize the annotation engine in an order that allows it to write a proper VCF header. Still doesn't work... -- Updating integration test files. Moved many more files into public/testdata. Updated their headers to all work correctly with new strict VCF header checking. -- Bugfix for TandemRepeatAnnotation that must be unbounded not A count type as it provides info for the REF as well as each alt -- No longer add FALSE values to flag values in VCs in VariantAnnotatorEngine. DB = 0 is never seen in the output VCFs now -- Fixed bug in VCFDiffableReader that didn't differeniate between "." and "PASS" VC filter status -- Unconditionally add lowQual Filter to UG output VCF files as this is in some cases (EMIT_ALL_SITES) used when the previous check said it wouldn't be -- VariantsToVCF now properly writes out the GT FORMAT field -- BCF2 codec explodes when reading symbolic alleles as I literally cannot figure out how to use the allele clipping code. Eric said he and Ami will clean up this whole piece of instructure -- Fixed bug in BCF2Codec that wasn't setting the phase field correctly. UnitTested now -- PASS string now added at the end of the BCF2 dictionary after discussion with Heng -- Fixed bug where I was writing out all field values as BigEndian. Now everything is LittleEndian. -- VCFHeader detects the case where a count field has size < 0 (some of our files have count = -1) and throws a UserException -- Cleaned up unused code -- Fixed bug in BCF2 string encoder that wasn't handling the case of an empty list of strings for encoding -- Fixed bug where all samples are no called in a VC, in which case we (like the VCFwriter) write out no called diploid genotypes for all samples -- We always write the number of genotype samples into the BCF2 nSamples header. 
How we can have a variable number of samples per record isn't clear to me, as we don't have a map from missing samples to header names... -- Removed old filtersWereAppliedToContext code in VCF as properly handle unfiltered, filtered, and PASS records internally -- Fastpath function getDisplayBases() in allele that just gives you the raw bytes[] you'd see for an Allele -- Genotype fields no longer differentiate between unfiltered, filtered, and PASS values. Genotype objects are all PASS implicitly, or explicitly filtered. We only write out the FT values if at least one sample is filtered. Removed interface functions and cleaned up code -- Refactored padAllele code from createVariantContextWithPaddedAlleles into the function padAllele so that it actually works. In general, **** NEVER COPY CODE **** if you need to share funcitonality make a function, that's why there were invented! -- Increased the default number of records to read for DiffObjects to 1M
2012-06-19 21:46:26 +08:00
//
//
// TESTING PHASE
//
//
final Genotype gUnphased = new GenotypeBuilder("gUnphased", Arrays.asList(ref, alt1)).make();
final Genotype gPhased = new GenotypeBuilder("gPhased", Arrays.asList(ref, alt1)).phased(true).make();
final Genotype gPhased2 = new GenotypeBuilder("gPhased2", Arrays.asList(alt1, alt1)).phased(true).make();
final Genotype gPhased3 = new GenotypeBuilder("gPhased3", Arrays.asList(ref, ref)).phased(true).make();
final Genotype haploidNoPhase = new GenotypeBuilder("haploidNoPhase", Arrays.asList(ref)).make();
addGenotypeTests(site, gUnphased, gPhased);
addGenotypeTests(site, gUnphased, gPhased2);
addGenotypeTests(site, gUnphased, gPhased3);
addGenotypeTests(site, gPhased, gPhased2);
addGenotypeTests(site, gPhased, gPhased3);
addGenotypeTests(site, gPhased2, gPhased3);
addGenotypeTests(site, haploidNoPhase, gPhased);
addGenotypeTests(site, haploidNoPhase, gPhased2);
addGenotypeTests(site, haploidNoPhase, gPhased3);
addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2);
addGenotypeTests(site, haploidNoPhase, gPhased, gPhased3);
addGenotypeTests(site, haploidNoPhase, gPhased2, gPhased3);
addGenotypeTests(site, haploidNoPhase, gPhased, gPhased2, gPhased3);
final Genotype gUnphasedTet = new GenotypeBuilder("gUnphasedTet", Arrays.asList(ref, alt1, ref, alt1)).make();
final Genotype gPhasedTet = new GenotypeBuilder("gPhasedTet", Arrays.asList(ref, alt1, alt1, alt1)).phased(true).make();
addGenotypeTests(site, gUnphasedTet, gPhasedTet);
}
if ( ENABLE_PL_TESTS ) {
if ( site.getNAlleles() == 2 ) {
// testing PLs
addGenotypeTests(site,
GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2}),
GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3}));
addGenotypeTests(site,
GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3}));
addGenotypeTests(site,
GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2000, -1000}));
addGenotypeTests(site, // missing PLs
GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{-1, 0, -2}),
GenotypeBuilder.create("g2", Arrays.asList(ref, ref)));
}
else if ( site.getNAlleles() == 3 ) {
// testing PLs
addGenotypeTests(site,
GenotypeBuilder.create("g1", Arrays.asList(ref, ref), new double[]{0, -1, -2, -3, -4, -5}),
GenotypeBuilder.create("g2", Arrays.asList(ref, ref), new double[]{0, -2, -3, -4, -5, -6}));
}
}
// test attributes
addGenotypeTests(site,
attr("g1", ref, "INT1", 1),
attr("g2", ref, "INT1", 2));
addGenotypeTests(site,
attr("g1", ref, "INT1", 1),
attr("g2", ref, "INT1"));
addGenotypeTests(site,
attr("g1", ref, "INT3", 1, 2, 3),
attr("g2", ref, "INT3", 4, 5, 6));
addGenotypeTests(site,
attr("g1", ref, "INT3", 1, 2, 3),
attr("g2", ref, "INT3"));
addGenotypeTests(site,
attr("g1", ref, "INT20", TWENTY_INTS),
attr("g2", ref, "INT20", TWENTY_INTS));
if (ENABLE_VARARRAY_TESTS) {
addGenotypeTests(site,
attr("g1", ref, "INT.VAR", 1, 2, 3),
attr("g2", ref, "INT.VAR", 4, 5),
attr("g3", ref, "INT.VAR", 6));
addGenotypeTests(site,
attr("g1", ref, "INT.VAR", 1, 2, 3),
attr("g2", ref, "INT.VAR"),
attr("g3", ref, "INT.VAR", 5));
}
addGenotypeTests(site,
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "FLOAT1", 2.0));
addGenotypeTests(site,
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "FLOAT1"));
addGenotypeTests(site,
attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
attr("g2", ref, "FLOAT3", 4.0, 5.0, 6.0));
addGenotypeTests(site,
attr("g1", ref, "FLOAT3", 1.0, 2.0, 3.0),
attr("g2", ref, "FLOAT3"));
if (ENABLE_VARIABLE_LENGTH_GENOTYPE_STRING_TESTS) {
//
//
// TESTING MULTIPLE SIZED LISTS IN THE GENOTYPE FIELD
//
//
addGenotypeTests(site,
attr("g1", ref, "GS", Arrays.asList("S1", "S2")),
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
addGenotypeTests(site, // g1 is missing the string, and g2 is missing FLOAT1
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "GS", Arrays.asList("S3", "S4")));
// variable sized lists
addGenotypeTests(site,
attr("g1", ref, "GV", "S1"),
attr("g2", ref, "GV", Arrays.asList("S3", "S4")));
addGenotypeTests(site,
attr("g1", ref, "GV", Arrays.asList("S1", "S2")),
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
addGenotypeTests(site, // missing value in varlist of string
attr("g1", ref, "FLOAT1", 1.0),
attr("g2", ref, "GV", Arrays.asList("S3", "S4", "S5")));
}
//
//
// TESTING GENOTYPE FILTERS
//
//
addGenotypeTests(site,
new GenotypeBuilder("g1-x", Arrays.asList(ref, ref)).filters("X").make(),
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
addGenotypeTests(site,
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make());
addGenotypeTests(site,
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
addGenotypeTests(site,
new GenotypeBuilder("g1-unft", Arrays.asList(ref, ref)).unfiltered().make(),
new GenotypeBuilder("g2-x", Arrays.asList(ref, ref)).filters("X").make(),
new GenotypeBuilder("g3-xy", Arrays.asList(ref, ref)).filters("X", "Y").make());
}
/**
 * Exercises Number=A (ADA) and Number=G (PL) FORMAT fields across ploidies
 * 1-5 and 1-4 alleles, with a homozygous-ref call at each ploidy.
 */
private static void addGenotypesAndGTests() {
    for ( final int ploidy : Arrays.asList(1, 2, 3, 4, 5)) {
        final List<List<String>> alleleCombinations = Arrays.asList(
                Arrays.asList("A"),
                Arrays.asList("A", "C"),
                Arrays.asList("A", "C", "G"),
                Arrays.asList("A", "C", "G", "T"));
        for ( final List<String> alleleStrings : alleleCombinations ) {
            final VariantContextBuilder vcb = builder().alleles(alleleStrings);
            final VariantContext site = vcb.make();
            final int nAlleles = site.getNAlleles();
            final Allele ref = site.getReference();

            // base genotype is ref/.../ref up to ploidy
            final List<Allele> refCall = new ArrayList<Allele>(ploidy);
            for ( int i = 0; i < ploidy; i++ )
                refCall.add(ref);

            // ADA holds 0, 1, ..., nAlleles - 2 (one value per alt allele)
            final List<Integer> adaValues = new ArrayList<Integer>(nAlleles);
            for ( int i = 0; i < nAlleles - 1; i++ )
                adaValues.add(i);

            // PL holds 0, 1, ..., nPLs - 1 (one value per possible genotype,
            // a function of both nAlleles and ploidy)
            final int nPLs = GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy);
            final int[] plValues = new int[nPLs];
            for ( int i = 0; i < plValues.length; i++ )
                plValues[i] = i;

            final GenotypeBuilder gb = new GenotypeBuilder("ADA_PL_SAMPLE");
            gb.alleles(refCall);
            gb.PL(plValues);
            // a one-element Number=A list is written as a scalar
            gb.attribute("ADA", nAlleles == 2 ? adaValues.get(0) : adaValues);
            vcb.genotypes(gb.make());
            add(vcb);
        }
    }
}
/**
 * Builds a hom-ref diploid genotype, optionally carrying one extended attribute.
 * Zero values yields a plain genotype; one value attaches a scalar attribute;
 * several values attach a list attribute.
 */
private static Genotype attr(final String name, final Allele ref, final String key, final Object ... value) {
    final List<Allele> homRef = Arrays.asList(ref, ref);
    if ( value.length == 0 ) {
        return GenotypeBuilder.create(name, homRef);
    }
    final Object attrValue = value.length == 1 ? value[0] : Arrays.asList(value);
    return new GenotypeBuilder(name, homRef).attribute(key, attrValue).make();
}
/**
 * Returns the provider's pre-built list of VariantContext test cases.
 *
 * @return the shared list of test data records built by this class
 */
public static List<VariantContextTestData> generateSiteTests() {
    return TEST_DATAs;
}
/**
 * Round-trip test: writes the records under a header widened with two extra
 * sample names, reads them back, and checks that the extra samples appear as
 * no-calls while the original samples round-trip unchanged.  Only runs for
 * data with more than two samples and no symbolic alleles.
 */
public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
    final int nSamples = data.header.getNGenotypeSamples();
    if ( nSamples <= 2 )
        return;

    for ( final VariantContext vc : data.vcs ) {
        if ( vc.isSymbolic() ) {
            // cannot handle symbolic alleles because they may be weird non-call VCFs
            return;
        }
    }

    final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
    tmpFile.deleteOnExit();

    // write the expected records to disk under the widened header
    final EnumSet<Options> options = EnumSet.of(Options.INDEX_ON_THE_FLY);
    final VariantContextWriter writer = tester.makeWriter(tmpFile, options);
    final Set<String> samplesInVCF = new HashSet<String>(data.header.getGenotypeSamples());
    final List<String> allSamples = new ArrayList<String>(Arrays.asList("MISSING1", "MISSING2"));
    allSamples.addAll(samplesInVCF);
    writeVCsToFile(writer, new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples), data.vcs);

    // read everything back and compare against the originals
    final Pair<VCFHeader, Iterable<VariantContext>> p = readAllVCs(tmpFile, tester.makeCodec());
    int nextExpected = 0;
    for ( final VariantContext readVC : p.getSecond() ) {
        if ( readVC == null ) continue; // sometimes we read null records...
        final VariantContext expected = data.vcs.get(nextExpected++);
        for ( final Genotype g : readVC.getGenotypes() ) {
            Assert.assertTrue(allSamples.contains(g.getSampleName()));
            if ( samplesInVCF.contains(g.getSampleName()) ) {
                assertEquals(g, expected.getGenotype(g.getSampleName()));
            } else {
                // synthetic sample: must come back as a no-call
                Assert.assertTrue(g.isNoCall());
            }
        }
    }
}
/**
 * Round-trips {@code data} through the tester's writer/codec pair and asserts
 * the records read back equal the originals, recursing once on the re-read data.
 */
public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException {
    testReaderWriter(tester, data.header, data.vcs, data.vcs, true);
}
/**
 * Writes {@code vcs} to a temp file with the tester's writer, reads them back
 * with the tester's codec, and asserts the decoded records match {@code expected}.
 * When {@code recurse} is true, the freshly written file is itself round-tripped
 * one more time (non-recursively) using the header read back from disk.
 */
public static void testReaderWriter(final VariantContextIOTest tester,
                                    final VCFHeader header,
                                    final List<VariantContext> expected,
                                    final Iterable<VariantContext> vcs,
                                    final boolean recurse) throws IOException {
    final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension());
    tmpFile.deleteOnExit();

    // write the records to disk
    final EnumSet<Options> writerOptions = EnumSet.of(Options.INDEX_ON_THE_FLY);
    writeVCsToFile(tester.makeWriter(tmpFile, writerOptions), header, vcs);

    // ensure what was written equals what was expected
    final Pair<VCFHeader, Iterable<VariantContext>> readBack = readAllVCs(tmpFile, tester.makeCodec());
    assertEquals(readBack.getSecond(), expected);

    if ( recurse ) {
        // if we are doing a recursive test, grab a fresh iterator over the written values
        final Iterable<VariantContext> reread = readAllVCs(tmpFile, tester.makeCodec()).getSecond();
        testReaderWriter(tester, readBack.getFirst(), expected, reread, false);
    }
}
/**
 * Writes the header and every non-null record in {@code vcs} to {@code writer}.
 *
 * The writer is closed in a finally block so the underlying file handle is not
 * leaked if {@code writeHeader} or {@code add} throws (the original code only
 * closed the writer on the success path).
 */
private static void writeVCsToFile(final VariantContextWriter writer, final VCFHeader header, final Iterable<VariantContext> vcs) {
    try {
        writer.writeHeader(header);
        for ( final VariantContext vc : vcs ) {
            if ( vc != null )
                writer.add(vc);
        }
    } finally {
        writer.close();
    }
}
/**
* Utility class to read all of the VC records from a file
*
* @param source
* @param codec
* @return
* @throws IOException
*/
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
private final static Pair<VCFHeader, Iterable<VariantContext>> readAllVCs( final File source, final FeatureCodec<VariantContext> codec ) throws IOException {
// read in the features
PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source));
FeatureCodecHeader header = codec.readHeader(pbs);
pbs.close();
pbs = new PositionalBufferedStream(new FileInputStream(source));
pbs.skip(header.getHeaderEnd());
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue();
return new Pair<VCFHeader, Iterable<VariantContext>>(vcfHeader, new VCIterable(pbs, codec, vcfHeader));
}
private static class VCIterable implements Iterable<VariantContext>, Iterator<VariantContext> {
final PositionalBufferedStream pbs;
final FeatureCodec<VariantContext> codec;
final VCFHeader header;
private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec<VariantContext> codec, final VCFHeader header) {
this.pbs = pbs;
this.codec = codec;
this.header = header;
}
@Override
public Iterator<VariantContext> iterator() {
return this;
}
@Override
public boolean hasNext() {
try {
return ! pbs.isDone();
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
@Override
public VariantContext next() {
try {
final VariantContext vc = codec.decode(pbs);
return vc == null ? null : vc.fullyDecode(header, false);
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
@Override
public void remove() {
//To change body of implemented methods use File | Settings | File Templates.
}
}
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
final Pair<VCFHeader, Iterable<VariantContext>> vcfData = readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, Iterable<VariantContext>> bcfData = readAllVCs(bcfFile, new BCF2Codec());
assertEquals(bcfData.getFirst(), vcfData.getFirst());
assertEquals(bcfData.getSecond(), vcfData.getSecond());
}
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
public static void assertEquals(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
final Iterator<VariantContext> actualIT = actual.iterator();
final Iterator<VariantContext> expectedIT = expected.iterator();
while ( expectedIT.hasNext() ) {
final VariantContext expectedVC = expectedIT.next();
if ( expectedVC == null )
continue;
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
VariantContext actualVC;
do {
Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
actualVC = actualIT.next();
} while ( actualIT.hasNext() && actualVC == null );
if ( actualVC == null )
Assert.fail("Too few records in actual");
assertEquals(actualVC, expectedVC);
}
Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
/**
 * Asserts that two variant contexts are actually equal: location, ID, alleles,
 * attributes, filters, quality, and (when genotypes are present) every
 * genotype, matched by sample name.
 *
 * @param actual   the context produced by the code under test; must not be null
 * @param expected the reference context to compare against
 */
public static void assertEquals( final VariantContext actual, final VariantContext expected ) {
    Assert.assertNotNull(actual, "VariantContext expected not null");
    // site-level fields
    Assert.assertEquals(actual.getChr(), expected.getChr(), "chr");
    Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
    Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
    Assert.assertEquals(actual.getID(), expected.getID(), "id");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
    // INFO-field attributes, compared with VCF-aware slack (see assertAttributesEquals)
    assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
    // filters and quality
    Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered");
    BaseTest.assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
    BaseTest.assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());
    // genotypes, compared sample by sample
    Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
    if ( expected.hasGenotypes() ) {
        BaseTest.assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
        Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
        final Set<String> samples = expected.getSampleNames();
        for ( final String sample : samples ) {
            assertEquals(actual.getGenotype(sample), expected.getGenotype(sample));
        }
    }
}
/**
 * Asserts that two Genotypes are equivalent across every field this framework
 * cares about: name, alleles, type, filters, the inline attributes
 * (DP/AD/GQ/PL), likelihoods, extended attributes, phasing, and ploidy.
 *
 * Fix: the filters assertion previously carried the copy-pasted message
 * "Genotype fields", which mislabeled the failing comparison.
 */
public static void assertEquals(final Genotype actual, final Genotype expected) {
    Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles");
    Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string");
    Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type");

    // filters are the same
    Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype filters");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered");

    // inline attributes
    Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp");
    Assert.assertEquals(actual.getAD(), expected.getAD(), "Genotype ad");
    Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq");
    Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL");
    Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD");
    Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ");
    Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP");

    Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods");
    Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString");
    Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods");
    Assert.assertEquals(actual.getPL(), expected.getPL(), "Genotype getPL");

    Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual");
    assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
    Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased");
    Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy");
}
/**
 * Asserts that two attribute maps are equivalent, with VCF-aware slack:
 * a key bound to null in one map may be absent (or "missing", see isMissing)
 * in the other, lists are compared element-wise, and doubles are compared
 * loosely because VCF text rounds them (see assertAttributeEquals).
 */
private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) {
    // tracks expected keys not yet matched by a key seen in actual
    final Set<String> expectedKeys = new HashSet<String>(expected.keySet());

    for ( final Map.Entry<String, Object> act : actual.entrySet() ) {
        final Object actualValue = act.getValue();
        if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) {
            final Object expectedValue = expected.get(act.getKey());
            if ( expectedValue instanceof List ) {
                final List<Object> expectedList = (List<Object>)expectedValue;
                Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't");
                final List<Object> actualList = (List<Object>)actualValue;
                Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size");
                // element-wise comparison with double-tolerant equality
                for ( int i = 0; i < expectedList.size(); i++ )
                    assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i));
            } else
                assertAttributeEquals(act.getKey(), actualValue, expectedValue);
        } else {
            // it's ok to have a binding in x -> null that's absent in y
            Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other");
        }
        expectedKeys.remove(act.getKey());
    }

    // now expectedKeys contains only the keys found in expected but not in actual,
    // and they must all be null
    for ( final String missingExpected : expectedKeys ) {
        final Object value = expected.get(missingExpected);
        Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" );
    }
}
/**
 * Returns true if the attribute value counts as "missing" for VCF purposes:
 * null, the VCF '.' missing marker, or a list that is empty or contains only
 * null elements.
 */
private static final boolean isMissing(final Object value) {
    if ( value == null )
        return true;
    if ( value.equals(VCFConstants.MISSING_VALUE_v4) )
        return true;
    if ( value instanceof List ) {
        // handles the case where all elements are null or the list is empty
        for ( final Object element : (List) value ) {
            if ( element != null )
                return false;
        }
        return true;
    }
    return false;
}
/**
 * Asserts equality of a single attribute value.  Doubles are compared with a
 * loose tolerance since VCF text rounds them to a couple of significant digits.
 */
private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
    if ( !(expected instanceof Double) ) {
        Assert.assertEquals(actual, expected, "Attribute " + key);
    } else {
        // must be very tolerant because doubles are being rounded to 2 sig figs
        BaseTest.assertEqualsDoubleSmart(actual, (Double) expected, 1e-2);
    }
}
2012-06-06 02:36:10 +08:00
public static void addComplexGenotypesTest() {
final List<Allele> allAlleles = Arrays.asList(
Allele.create("A", true),
Allele.create("C", false),
Allele.create("G", false));
for ( int nAlleles : Arrays.asList(2, 3) ) {
for ( int highestPloidy : Arrays.asList(1, 2, 3) ) {
// site alleles
final List<Allele> siteAlleles = allAlleles.subList(0, nAlleles);
// possible alleles for genotypes
final List<Allele> possibleGenotypeAlleles = new ArrayList<Allele>(siteAlleles);
possibleGenotypeAlleles.add(Allele.NO_CALL);
// there are n^ploidy possible genotypes
final List<List<Allele>> possibleGenotypes = makeAllGenotypes(possibleGenotypeAlleles, highestPloidy);
final int nPossibleGenotypes = possibleGenotypes.size();
VariantContextBuilder vb = new VariantContextBuilder("unittest", "1", 1, 1, siteAlleles);
// first test -- create n copies of each genotype
for ( int i = 0; i < nPossibleGenotypes; i++ ) {
final List<Genotype> samples = new ArrayList<Genotype>(3);
samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i)));
add(vb.genotypes(samples));
}
// second test -- create one sample with each genotype
{
final List<Genotype> samples = new ArrayList<Genotype>(nPossibleGenotypes);
for ( int i = 0; i < nPossibleGenotypes; i++ ) {
samples.add(GenotypeBuilder.create("sample" + i, possibleGenotypes.get(i)));
}
add(vb.genotypes(samples));
}
// test mixed ploidy
for ( int i = 0; i < nPossibleGenotypes; i++ ) {
for ( int ploidy = 1; ploidy < highestPloidy; ploidy++ ) {
final List<Genotype> samples = new ArrayList<Genotype>(highestPloidy);
final List<Allele> genotype = possibleGenotypes.get(i).subList(0, ploidy);
samples.add(GenotypeBuilder.create("sample" + i, genotype));
add(vb.genotypes(samples));
}
}
}
}
}
private static List<List<Allele>> makeAllGenotypes(final List<Allele> alleles, final int highestPloidy) {
return Utils.makePermutations(alleles, highestPloidy, true);
2012-06-06 02:36:10 +08:00
}
/**
 * Asserts that two VCF headers carry equal metadata lines, compared pairwise
 * in sorted order.  Whole-collection equality is deliberately avoided; see the
 * inline note below.
 */
public static void assertEquals(final VCFHeader actual, final VCFHeader expected) {
    Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");

    // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
    //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
    final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
    final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
    for ( int i = 0; i < actualLines.size(); i++ )
        Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
}
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
public static void main( String argv[] ) {
final File variants1 = new File(argv[0]);
final File variants2 = new File(argv[1]);
try {
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(variants1, variants2);
} catch ( IOException e ) {
throw new RuntimeException(e);
}
}
}