2013-01-11 06:04:08 +08:00
|
|
|
/*
|
|
|
|
|
* Copyright (c) 2012 The Broad Institute
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person
|
|
|
|
|
* obtaining a copy of this software and associated documentation
|
|
|
|
|
* files (the "Software"), to deal in the Software without
|
|
|
|
|
* restriction, including without limitation the rights to use,
|
|
|
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
|
* copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following
|
|
|
|
|
* conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice shall be
|
|
|
|
|
* included in all copies or substantial portions of the Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
|
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
|
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
|
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
|
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
|
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
|
|
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2009-04-15 01:18:16 +08:00
|
|
|
package org.broadinstitute.sting;
|
|
|
|
|
|
2012-01-18 07:56:50 +08:00
|
|
|
import org.apache.log4j.AppenderSkeleton;
|
|
|
|
|
import org.apache.log4j.Level;
|
|
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
|
import org.apache.log4j.PatternLayout;
|
2010-06-03 01:45:51 +08:00
|
|
|
import org.apache.log4j.spi.LoggingEvent;
|
2010-11-19 04:22:01 +08:00
|
|
|
import org.broadinstitute.sting.commandline.CommandLineUtils;
|
2013-02-06 06:20:23 +08:00
|
|
|
import org.broadinstitute.sting.utils.collections.Pair;
|
2012-02-23 05:45:20 +08:00
|
|
|
import org.broadinstitute.sting.utils.crypt.CryptUtils;
|
2010-09-12 23:07:38 +08:00
|
|
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
2011-12-17 07:07:26 +08:00
|
|
|
import org.broadinstitute.sting.utils.io.IOUtils;
|
2013-02-06 06:20:23 +08:00
|
|
|
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
|
|
|
|
import org.broadinstitute.variant.bcf2.BCF2Codec;
|
|
|
|
|
import org.broadinstitute.variant.variantcontext.Genotype;
|
|
|
|
|
import org.broadinstitute.variant.variantcontext.VariantContext;
|
|
|
|
|
import org.broadinstitute.variant.vcf.VCFCodec;
|
|
|
|
|
import org.broadinstitute.variant.vcf.VCFConstants;
|
|
|
|
|
import org.broadinstitute.variant.vcf.VCFHeader;
|
|
|
|
|
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
2012-05-27 23:13:43 +08:00
|
|
|
import org.testng.Assert;
|
2012-05-24 22:50:33 +08:00
|
|
|
import org.testng.Reporter;
|
2012-05-29 23:18:22 +08:00
|
|
|
import org.testng.SkipException;
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2012-01-18 07:56:50 +08:00
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.IOException;
|
2012-06-29 10:36:26 +08:00
|
|
|
import java.util.*;
|
2009-04-15 01:18:16 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
*
|
|
|
|
|
* User: aaron
|
|
|
|
|
* Date: Apr 14, 2009
|
|
|
|
|
* Time: 10:24:30 AM
|
|
|
|
|
*
|
|
|
|
|
* The Broad Institute
|
|
|
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
|
|
|
* This software and its documentation are copyright 2009 by the
|
|
|
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
|
|
|
*
|
|
|
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
|
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author aaron
|
|
|
|
|
* @version 1.0
|
|
|
|
|
* @date Apr 14, 2009
|
|
|
|
|
* <p/>
|
|
|
|
|
* Class BaseTest
|
|
|
|
|
* <p/>
|
2009-04-15 01:41:38 +08:00
|
|
|
* This is the base test class for all of our test cases. All test cases should extend from this
|
2009-12-29 15:40:48 +08:00
|
|
|
* class; it sets up the logger, and resolves the location of directories that we rely on.
|
2009-04-15 01:18:16 +08:00
|
|
|
*/
|
2010-11-19 04:22:01 +08:00
|
|
|
@SuppressWarnings("unchecked")
|
2009-04-15 01:18:16 +08:00
|
|
|
public abstract class BaseTest {
|
|
|
|
|
/** our log, which we want to capture anything from org.broadinstitute.sting */
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final Logger logger = CommandLineUtils.getStingLogger();
|
2009-12-29 15:40:48 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
2011-01-20 04:30:25 +08:00
|
|
|
public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta";
|
2011-10-26 04:08:39 +08:00
|
|
|
//public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta";
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
|
|
|
|
|
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
|
|
|
|
|
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
|
|
|
|
|
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";
|
|
|
|
|
public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/";
|
2010-11-23 06:59:42 +08:00
|
|
|
public static final String annotationDataLocation = GATKDataLocation + "Annotations/";
|
|
|
|
|
|
2012-01-23 22:52:07 +08:00
|
|
|
public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
2012-03-08 21:57:29 +08:00
|
|
|
public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
|
|
|
|
|
public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf";
|
2012-01-23 22:52:07 +08:00
|
|
|
|
2010-11-23 06:59:42 +08:00
|
|
|
public static final String dbsnpDataLocation = GATKDataLocation;
|
2011-08-07 22:33:20 +08:00
|
|
|
public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf";
|
2011-08-07 23:26:07 +08:00
|
|
|
public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf";
|
2011-01-20 04:30:25 +08:00
|
|
|
public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf";
|
2011-08-12 05:04:09 +08:00
|
|
|
public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf";
|
2009-04-15 01:18:16 +08:00
|
|
|
|
Many updates to SelectVariants :
1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work)
1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1)
2) Discordance and Concordance now work in combination with all other parameters.
3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation.
4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information.
5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample.
6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree.
---
Integration tests:
1) Discordance and concordance test added
2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples.
---
Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-24 04:18:45 +08:00
|
|
|
public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/";
|
|
|
|
|
public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf";
|
|
|
|
|
public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf";
|
|
|
|
|
|
2011-02-16 02:26:14 +08:00
|
|
|
public static final String intervalsLocation = GATKDataLocation;
|
|
|
|
|
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
|
|
|
|
|
public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";
|
|
|
|
|
|
2012-05-16 02:39:12 +08:00
|
|
|
public static final boolean REQUIRE_NETWORK_CONNECTION = false;
|
2012-05-29 23:18:22 +08:00
|
|
|
private static final String networkTempDirRoot = "/broad/hptmp/";
|
|
|
|
|
private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists();
|
|
|
|
|
private static final String networkTempDir;
|
|
|
|
|
private static final File networkTempDirFile;
|
2011-02-01 07:13:09 +08:00
|
|
|
|
2012-06-21 03:35:36 +08:00
|
|
|
private static final String privateTestDirRelative = "private/testdata/";
|
|
|
|
|
public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/";
|
|
|
|
|
protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, "");
|
|
|
|
|
|
|
|
|
|
private static final String publicTestDirRelative = "public/testdata/";
|
|
|
|
|
public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/";
|
|
|
|
|
protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, "");
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2012-02-23 05:45:20 +08:00
|
|
|
public static final String keysDataLocation = validationDataLocation + "keys/";
|
|
|
|
|
public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key";
|
|
|
|
|
|
2012-07-26 11:13:12 +08:00
|
|
|
public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta";
|
|
|
|
|
|
2013-06-29 02:57:16 +08:00
|
|
|
/**
 * True when the system property {@code pipeline.run} is set to exactly {@code "run"}.
 *
 * The literal is the receiver of equals() so that an unset property (null) yields false
 * instead of throwing a NullPointerException while this class is being initialized.
 */
public static final boolean pipelineTestRunModeIsSet = "run".equals(System.getProperty("pipeline.run"));
|
|
|
|
|
|
2009-04-15 01:18:16 +08:00
|
|
|
/** before the class starts up */
|
2010-11-19 04:22:01 +08:00
|
|
|
static {
|
|
|
|
|
// setup a basic log configuration
|
|
|
|
|
CommandLineUtils.configureConsoleLogging();
|
2009-05-20 13:15:27 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// setup our log layout
|
|
|
|
|
PatternLayout layout = new PatternLayout();
|
|
|
|
|
layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n");
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// now set the layout of all the loggers to our layout
|
|
|
|
|
CommandLineUtils.setLayout(logger, layout);
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// Set the Root logger to only output warnings.
|
|
|
|
|
logger.setLevel(Level.WARN);
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2012-05-29 23:18:22 +08:00
|
|
|
if (networkTempDirRootExists) {
|
|
|
|
|
networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name")));
|
2012-03-30 20:34:35 +08:00
|
|
|
networkTempDirFile.deleteOnExit();
|
|
|
|
|
networkTempDir = networkTempDirFile.getAbsolutePath() + "/";
|
2012-05-29 23:18:22 +08:00
|
|
|
} else {
|
|
|
|
|
networkTempDir = null;
|
|
|
|
|
networkTempDirFile = null;
|
|
|
|
|
}
|
2012-03-30 20:34:35 +08:00
|
|
|
|
2012-05-29 23:18:22 +08:00
|
|
|
|
|
|
|
|
if ( REQUIRE_NETWORK_CONNECTION ) {
|
2012-03-30 20:34:35 +08:00
|
|
|
// find our file sources
|
|
|
|
|
if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) {
|
|
|
|
|
logger.fatal("We can't locate the reference directories. Aborting!");
|
|
|
|
|
throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories");
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-15 01:18:16 +08:00
|
|
|
}
|
|
|
|
|
|
2011-07-07 09:57:22 +08:00
|
|
|
/**
|
|
|
|
|
* Simple generic utility class to creating TestNG data providers:
|
|
|
|
|
*
|
|
|
|
|
* 1: inherit this class, as in
|
|
|
|
|
*
|
|
|
|
|
* private class SummarizeDifferenceTest extends TestDataProvider {
|
|
|
|
|
* public SummarizeDifferenceTest() {
|
|
|
|
|
* super(SummarizeDifferenceTest.class);
|
|
|
|
|
* }
|
|
|
|
|
* ...
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* Provide a reference to your class to the TestDataProvider constructor.
|
|
|
|
|
*
|
|
|
|
|
* 2: Create instances of your subclass. Return from it the call to getTests, providing
|
|
|
|
|
* the class type of your test
|
|
|
|
|
*
|
2011-07-18 22:46:01 +08:00
|
|
|
* @DataProvider(name = "summaries"
|
2011-07-07 09:57:22 +08:00
|
|
|
* public Object[][] createSummaries() {
|
|
|
|
|
* new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2");
|
|
|
|
|
* new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1");
|
|
|
|
|
* return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class);
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* This class magically tracks created objects of this
|
|
|
|
|
*/
|
|
|
|
|
public static class TestDataProvider {
|
|
|
|
|
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
|
2012-03-02 04:01:11 +08:00
|
|
|
protected String name;
|
2011-07-07 09:57:22 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Create a new TestDataProvider instance bound to the class variable C
|
|
|
|
|
* @param c
|
|
|
|
|
*/
|
2011-09-23 05:04:32 +08:00
|
|
|
public TestDataProvider(Class c, String name) {
|
2011-07-07 09:57:22 +08:00
|
|
|
if ( ! tests.containsKey(c) )
|
|
|
|
|
tests.put(c, new ArrayList<Object>());
|
|
|
|
|
tests.get(c).add(this);
|
2011-09-23 05:04:32 +08:00
|
|
|
this.name = name;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public TestDataProvider(Class c) {
|
|
|
|
|
this(c, "");
|
2011-07-07 09:57:22 +08:00
|
|
|
}
|
2011-12-01 06:05:16 +08:00
|
|
|
|
|
|
|
|
public void setName(final String name) {
|
|
|
|
|
this.name = name;
|
|
|
|
|
}
|
2011-07-07 09:57:22 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return all of the data providers in the form expected by TestNG of type class C
|
|
|
|
|
* @param c
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public static Object[][] getTests(Class c) {
|
|
|
|
|
List<Object[]> params2 = new ArrayList<Object[]>();
|
|
|
|
|
for ( Object x : tests.get(c) ) params2.add(new Object[]{x});
|
|
|
|
|
return params2.toArray(new Object[][]{});
|
|
|
|
|
}
|
2011-09-23 05:04:32 +08:00
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return "TestDataProvider("+name+")";
|
|
|
|
|
}
|
2011-07-07 09:57:22 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-15 01:18:16 +08:00
|
|
|
/**
|
|
|
|
|
* test if the file exists
|
|
|
|
|
*
|
|
|
|
|
* @param file name as a string
|
|
|
|
|
* @return true if it exists
|
|
|
|
|
*/
|
|
|
|
|
public static boolean fileExist(String file) {
|
|
|
|
|
File temp = new File(file);
|
|
|
|
|
return temp.exists();
|
|
|
|
|
}
|
2010-06-03 01:45:51 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* this appender looks for a specific message in the log4j stream.
|
|
|
|
|
* It can be used to verify that a specific message was generated to the logging system.
|
|
|
|
|
*/
|
|
|
|
|
public static class ValidationAppender extends AppenderSkeleton {
|
|
|
|
|
|
|
|
|
|
private boolean foundString = false;
|
|
|
|
|
private String targetString = "";
|
|
|
|
|
|
|
|
|
|
public ValidationAppender(String target) {
|
|
|
|
|
targetString = target;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
protected void append(LoggingEvent loggingEvent) {
|
|
|
|
|
if (loggingEvent.getMessage().equals(targetString))
|
|
|
|
|
foundString = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void close() {
|
|
|
|
|
// do nothing
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean requiresLayout() {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean foundString() {
|
|
|
|
|
return foundString;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-01-25 12:11:49 +08:00
|
|
|
/**
|
|
|
|
|
* Creates a temp file that will be deleted on exit after tests are complete.
|
|
|
|
|
* @param name Prefix of the file.
|
|
|
|
|
* @param extension Extension to concat to the end of the file.
|
|
|
|
|
* @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits.
|
|
|
|
|
*/
|
|
|
|
|
public static File createTempFile(String name, String extension) {
|
|
|
|
|
try {
|
|
|
|
|
File file = File.createTempFile(name, extension);
|
|
|
|
|
file.deleteOnExit();
|
|
|
|
|
return file;
|
|
|
|
|
} catch (IOException ex) {
|
|
|
|
|
throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex);
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-02-01 07:13:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Creates a temp file that will be deleted on exit after tests are complete.
|
2011-12-17 07:07:26 +08:00
|
|
|
* @param name Name of the file.
|
|
|
|
|
* @return A file in the network temporary directory with name, which will be deleted after the program exits.
|
2012-05-29 23:18:22 +08:00
|
|
|
* @throws SkipException when the network is not available.
|
2011-02-01 07:13:09 +08:00
|
|
|
*/
|
2012-05-29 23:18:22 +08:00
|
|
|
public static File tryCreateNetworkTempFile(String name) {
|
|
|
|
|
if (!networkTempDirRootExists)
|
|
|
|
|
throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot);
|
2011-12-17 07:07:26 +08:00
|
|
|
File file = new File(networkTempDirFile, name);
|
|
|
|
|
file.deleteOnExit();
|
|
|
|
|
return file;
|
2011-02-01 07:13:09 +08:00
|
|
|
}
|
2012-05-24 22:50:33 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Log this message so that it shows up inline during output as well as in html reports
|
|
|
|
|
*
|
|
|
|
|
* @param message
|
|
|
|
|
*/
|
|
|
|
|
public static void log(final String message) {
|
|
|
|
|
Reporter.log(message, true);
|
|
|
|
|
}
|
2012-05-27 23:13:43 +08:00
|
|
|
|
Phase I commit to get shadowBCFs passing tests
-- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disabled in the engine with the --allowMissingVCFHeaders argument
-- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA
-- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample
-- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original
-- Performance optimizations in BCF2 codec and writer
-- using arrays not lists for intermediate data structures
-- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over
-- VCFHeader (which needs a complete rewrite, FYI Eric)
-- Warn and fix on the way flag values with counts > 0
-- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation.
-- Explicitly track FILTER fields for efficient lookup in their own hashmap
-- Automatically add PL field when we see a GL field and no PL field
-- Added get and has methods for INFO, FILTER, and FORMAT fields
-- No longer add AC and AF values to the INFO field when there's no ALT allele
-- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare
-- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
|
|
|
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1;
|
2012-05-27 23:13:43 +08:00
|
|
|
|
|
|
|
|
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) {
|
2012-07-28 02:34:13 +08:00
|
|
|
Assert.assertTrue(actual instanceof Double, "Not a double");
|
2012-05-27 23:13:43 +08:00
|
|
|
assertEqualsDoubleSmart((double)(Double)actual, (double)expected);
|
|
|
|
|
}
|
|
|
|
|
|
2012-05-29 08:20:05 +08:00
|
|
|
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) {
|
2012-07-28 02:34:13 +08:00
|
|
|
Assert.assertTrue(actual instanceof Double, "Not a double");
|
2012-05-29 08:20:05 +08:00
|
|
|
assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance);
|
|
|
|
|
}
|
|
|
|
|
|
2012-05-27 23:13:43 +08:00
|
|
|
public static final void assertEqualsDoubleSmart(final double actual, final double expected) {
|
|
|
|
|
assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE);
|
|
|
|
|
}
|
|
|
|
|
|
2012-06-29 10:36:26 +08:00
|
|
|
public static final <T> void assertEqualsSet(final Set<T> actual, final Set<T> expected, final String info) {
|
|
|
|
|
final Set<T> actualSet = new HashSet<T>(actual);
|
|
|
|
|
final Set<T> expectedSet = new HashSet<T>(expected);
|
|
|
|
|
Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps
|
|
|
|
|
}
|
|
|
|
|
|
2013-01-02 21:46:11 +08:00
|
|
|
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) {
|
|
|
|
|
assertEqualsDoubleSmart(actual, expected, tolerance, null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) {
|
2012-05-27 23:13:43 +08:00
|
|
|
if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately
|
2012-07-28 02:34:13 +08:00
|
|
|
Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not");
|
2012-05-27 23:13:43 +08:00
|
|
|
else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately
|
2012-07-28 02:34:13 +08:00
|
|
|
Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not");
|
2012-05-27 23:13:43 +08:00
|
|
|
else {
|
|
|
|
|
final double delta = Math.abs(actual - expected);
|
|
|
|
|
final double ratio = Math.abs(actual / expected - 1.0);
|
2013-01-02 21:46:11 +08:00
|
|
|
Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual
|
|
|
|
|
+ " not within tolerance " + tolerance
|
|
|
|
|
+ (message == null ? "" : "message: " + message));
|
2012-05-27 23:13:43 +08:00
|
|
|
}
|
|
|
|
|
}
|
2013-02-06 06:20:23 +08:00
|
|
|
|
|
|
|
|
public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) {
|
|
|
|
|
Assert.assertNotNull(actual, "VariantContext expected not null");
|
|
|
|
|
Assert.assertEquals(actual.getChr(), expected.getChr(), "chr");
|
|
|
|
|
Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
|
|
|
|
|
Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
|
|
|
|
|
Assert.assertEquals(actual.getID(), expected.getID(), "id");
|
|
|
|
|
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
|
|
|
|
|
|
|
|
|
|
assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
|
|
|
|
|
Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
|
|
|
|
|
Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered");
|
|
|
|
|
assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
|
|
|
|
|
assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());
|
|
|
|
|
|
|
|
|
|
Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
|
|
|
|
|
if ( expected.hasGenotypes() ) {
|
|
|
|
|
assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
|
|
|
|
|
Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
|
|
|
|
|
final Set<String> samples = expected.getSampleNames();
|
|
|
|
|
for ( final String sample : samples ) {
|
|
|
|
|
assertGenotypesAreEqual(actual.getGenotype(sample), expected.getGenotype(sample));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void assertVariantContextStreamsAreEqual(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
|
|
|
|
|
final Iterator<VariantContext> actualIT = actual.iterator();
|
|
|
|
|
final Iterator<VariantContext> expectedIT = expected.iterator();
|
|
|
|
|
|
|
|
|
|
while ( expectedIT.hasNext() ) {
|
|
|
|
|
final VariantContext expectedVC = expectedIT.next();
|
|
|
|
|
if ( expectedVC == null )
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
VariantContext actualVC;
|
|
|
|
|
do {
|
|
|
|
|
Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
|
|
|
|
|
actualVC = actualIT.next();
|
|
|
|
|
} while ( actualIT.hasNext() && actualVC == null );
|
|
|
|
|
|
|
|
|
|
if ( actualVC == null )
|
|
|
|
|
Assert.fail("Too few records in actual");
|
|
|
|
|
|
|
|
|
|
assertVariantContextsAreEqual(actualVC, expectedVC);
|
|
|
|
|
}
|
|
|
|
|
Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) {
|
|
|
|
|
Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names");
|
|
|
|
|
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles");
|
|
|
|
|
Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string");
|
|
|
|
|
Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type");
|
|
|
|
|
|
|
|
|
|
// filters are the same
|
|
|
|
|
Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields");
|
|
|
|
|
Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered");
|
|
|
|
|
|
|
|
|
|
// inline attributes
|
|
|
|
|
Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp");
|
|
|
|
|
Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD()));
|
|
|
|
|
Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq");
|
|
|
|
|
Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL");
|
|
|
|
|
Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD");
|
|
|
|
|
Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ");
|
|
|
|
|
Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP");
|
|
|
|
|
|
|
|
|
|
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods");
|
|
|
|
|
Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString");
|
|
|
|
|
Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods");
|
|
|
|
|
Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL()));
|
|
|
|
|
|
|
|
|
|
Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual");
|
|
|
|
|
assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
|
|
|
|
|
Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased");
|
|
|
|
|
Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) {
|
|
|
|
|
Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");
|
|
|
|
|
|
|
|
|
|
// for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
|
|
|
|
|
//Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
|
|
|
|
|
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
|
|
|
|
|
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
|
|
|
|
|
for ( int i = 0; i < actualLines.size(); i++ ) {
|
|
|
|
|
Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
|
|
|
|
|
final Pair<VCFHeader, GATKVCFUtils.VCIterable> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec());
|
|
|
|
|
final Pair<VCFHeader, GATKVCFUtils.VCIterable> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec());
|
|
|
|
|
assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst());
|
|
|
|
|
assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
|
|
|
|
|
if ( expected instanceof Double ) {
|
|
|
|
|
// must be very tolerant because doubles are being rounded to 2 sig figs
|
|
|
|
|
assertEqualsDoubleSmart(actual, (Double) expected, 1e-2);
|
|
|
|
|
} else
|
|
|
|
|
Assert.assertEquals(actual, expected, "Attribute " + key);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) {
|
|
|
|
|
final Set<String> expectedKeys = new HashSet<String>(expected.keySet());
|
|
|
|
|
|
|
|
|
|
for ( final Map.Entry<String, Object> act : actual.entrySet() ) {
|
|
|
|
|
final Object actualValue = act.getValue();
|
|
|
|
|
if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) {
|
|
|
|
|
final Object expectedValue = expected.get(act.getKey());
|
|
|
|
|
if ( expectedValue instanceof List ) {
|
|
|
|
|
final List<Object> expectedList = (List<Object>)expectedValue;
|
|
|
|
|
Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't");
|
|
|
|
|
final List<Object> actualList = (List<Object>)actualValue;
|
|
|
|
|
Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size");
|
|
|
|
|
for ( int i = 0; i < expectedList.size(); i++ )
|
|
|
|
|
assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i));
|
|
|
|
|
} else
|
|
|
|
|
assertAttributeEquals(act.getKey(), actualValue, expectedValue);
|
|
|
|
|
} else {
|
|
|
|
|
// it's ok to have a binding in x -> null that's absent in y
|
|
|
|
|
Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other");
|
|
|
|
|
}
|
|
|
|
|
expectedKeys.remove(act.getKey());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// now expectedKeys contains only the keys found in expected but not in actual,
|
|
|
|
|
// and they must all be null
|
|
|
|
|
for ( final String missingExpected : expectedKeys ) {
|
|
|
|
|
final Object value = expected.get(missingExpected);
|
|
|
|
|
Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static final boolean isMissing(final Object value) {
|
|
|
|
|
if ( value == null ) return true;
|
|
|
|
|
else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true;
|
|
|
|
|
else if ( value instanceof List ) {
|
|
|
|
|
// handles the case where all elements are null or the list is empty
|
|
|
|
|
for ( final Object elt : (List)value)
|
|
|
|
|
if ( elt != null )
|
|
|
|
|
return false;
|
|
|
|
|
return true;
|
|
|
|
|
} else
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2009-04-15 01:18:16 +08:00
|
|
|
}
|