gatk-3.8/public/java/test/org/broadinstitute/sting/BaseTest.java

528 lines
25 KiB
Java
Raw Normal View History

/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting;
Refactor the Pileup Element with regards to indels Eric reported this bug due to the reduced reads failing with an index out of bounds on what we thought was a deletion, but turned out to be a read starting with insertion. * Refactored PileupElement to distinguish clearly between deletions and read starting with insertion * Modified ExtendedEventPileup to correctly distinguish elements with deletion when creating new pileups * Refactored most of the lazyLoadNextAlignment() function of the LocusIteratorByState for clarity and to create clear separation between what is a pileup with a deletion and what's not one. Got rid of many useless if statements. * Changed the way LocusIteratorByState creates extended event pileups to differentiate between insertions in the beginning of the read and deletions. * Every deletion now has an offset (start of the event) * Fixed bug when LocusITeratorByState found a read starting with insertion that happened to be a reduced read. * Separated the definitions of deletion/insertion (in the beginning of the read) in all UG annotations (and the annotator engine). * Pileup depth of coverage for a deleted base will now return the average coverage around the deletion. * Indel ReadPositionRankSum test now uses the deletion true offset from the read, changed all appropriate md5's * The extra pileup elements now properly read by the Indel mode of the UG made any subsequent call have a different random number and therefore all RankSum tests have slightly different values (in the 10^-3 range). Updated all appropriate md5s after extremely careful inspection -- Thanks Ryan! phew!
2012-01-18 07:56:50 +08:00
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.spi.LoggingEvent;
import org.broad.tribble.readers.LineIterator;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.CommandLineUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.crypt.CryptUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.io.IOUtils;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.variant.bcf2.BCF2Codec;
import org.broadinstitute.variant.variantcontext.Genotype;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.vcf.VCFCodec;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.broadinstitute.variant.vcf.VCFHeader;
import org.broadinstitute.variant.vcf.VCFHeaderLine;
import org.testng.Assert;
import org.testng.Reporter;
import org.testng.SkipException;
Refactor the Pileup Element with regards to indels Eric reported this bug due to the reduced reads failing with an index out of bounds on what we thought was a deletion, but turned out to be a read starting with insertion. * Refactored PileupElement to distinguish clearly between deletions and read starting with insertion * Modified ExtendedEventPileup to correctly distinguish elements with deletion when creating new pileups * Refactored most of the lazyLoadNextAlignment() function of the LocusIteratorByState for clarity and to create clear separation between what is a pileup with a deletion and what's not one. Got rid of many useless if statements. * Changed the way LocusIteratorByState creates extended event pileups to differentiate between insertions in the beginning of the read and deletions. * Every deletion now has an offset (start of the event) * Fixed bug when LocusITeratorByState found a read starting with insertion that happened to be a reduced read. * Separated the definitions of deletion/insertion (in the beginning of the read) in all UG annotations (and the annotator engine). * Pileup depth of coverage for a deleted base will now return the average coverage around the deletion. * Indel ReadPositionRankSum test now uses the deletion true offset from the read, changed all appropriate md5's * The extra pileup elements now properly read by the Indel mode of the UG made any subsequent call have a different random number and therefore all RankSum tests have slightly different values (in the 10^-3 range). Updated all appropriate md5s after extremely careful inspection -- Thanks Ryan! phew!
2012-01-18 07:56:50 +08:00
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
*
* User: aaron
* Date: Apr 14, 2009
* Time: 10:24:30 AM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
/**
* @author aaron
* @version 1.0
* @date Apr 14, 2009
* <p/>
* Class BaseTest
* <p/>
* This is the base test class for all of our test cases. All test cases should extend from this
* class; it sets up the logger, and resolves the location of directories that we rely on.
*/
@SuppressWarnings("unchecked")
public abstract class BaseTest {
/** our log, which we want to capture anything from org.broadinstitute.sting */
public static final Logger logger = CommandLineUtils.getStingLogger();

// Absolute paths to shared reference FASTAs on the Broad filesystem.
// NOTE(review): these are only validated at class-init time when
// REQUIRE_NETWORK_CONNECTION is true (see the static initializer below).
public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta";
2011-10-26 04:08:39 +08:00
//public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta";
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
Adding Graph-based likelihood ratio calculation to HC To active this feature add '--likelihoodCalculationEngine GraphBased' to the HC command line. New HC Options (both Advanced and Hidden): ========================================== --likelihoodCalculationEngine PairHMM/GraphBased/Random (default PairHMM) Specifies what engine should be used to generate read vs haplotype likelihoods. PairHMM : standard full-PairHMM approach. GraphBased : using the assembly graph to accelarate the process. Random : generate random likelihoods - used for benchmarking purposes only. --heterogeneousKmerSizeResolution COMBO_MIN/COMBO_MAX/MAX_ONLY/MIN_ONLY (default COMBO_MIN) It idicates how to merge haplotypes produced using different kmerSizes. Only has effect when used in combination with (--likelihooCalculationEngine GraphBased) COMBO_MIN : use the smallest kmerSize with all haplotypes. COMBO_MAX : use the larger kmerSize with all haplotypes. MIN_ONLY : use the smallest kmerSize with haplotypes assembled using it. MAX_ONLY : use the larger kmerSize with haplotypes asembled using it. Major code changes: =================== * Introduce multiple likelihood calculation engines (before there was just one). * Assembly results from different kmerSies are now packed together using the AssemblyResultSet class. * Added yet another PairHMM implementation with a different API in order to spport local PairHMM calculations, (e.g. a segment of the read vs a segment of the haplotype). Major components: ================ * FastLoglessPairHMM: New pair-hmm implemtation using some heuristic to speed up partial PairHMM calculations * GraphBasedLikelihoodCalculationEngine: delegates onto GraphBasedLikelihoodCalculationEngineInstance the exectution of the graph-based likelihood approach. * GraphBasedLikelihoodCalculationEngineInstance: one instance per active-region, implements the graph traversals to calcualte the likelihoods using the graph as an scafold. 
* HaplotypeGraph: haplotype threading graph where build from the assembly haplotypes. This structure is the one used by GraphBasedLikelihoodCalculationEngineInstance to do its work. * ReadAnchoring and KmerSequenceGraphMap: contain information as how a read map on the HaplotypeGraph that is used by GraphBasedLikelihoodCalcuationEngineInstance to do its work. Remove mergeCommonChains from HaplotypeGraph creation Fixed bamboo issues with HaplotypeGraphUnitTest Fixed probrems with HaplotypeCallerIntegrationTest Fixed issue with GraphLikelihoodVsLoglessAccuracyIntegrationTest Fixed ReadThreadingLikelihoodCalculationEngine issues Moved event-block iteration outside GraphBased*EngineInstance Removed unecessary parameter from ReadAnchoring constructor. Fixed test problem Added a bit more documentation to EventBlockSearchEngine Fixing some private - protected dependency issues Further refactoring making GraphBased*Instance and HaplotypeGraph slimmer. Addressed last pull request commit comments Fixed FastLoglessPairHMM public -> protected dependency Fixed probrem with HaplotypeGraph unit test Adding Graph-based likelihood ratio calculation to HC To active this feature add '--likelihoodCalculationEngine GraphBased' to the HC command line. New HC Options (both Advanced and Hidden): ========================================== --likelihoodCalculationEngine PairHMM/GraphBased/Random (default PairHMM) Specifies what engine should be used to generate read vs haplotype likelihoods. PairHMM : standard full-PairHMM approach. GraphBased : using the assembly graph to accelarate the process. Random : generate random likelihoods - used for benchmarking purposes only. --heterogeneousKmerSizeResolution COMBO_MIN/COMBO_MAX/MAX_ONLY/MIN_ONLY (default COMBO_MIN) It idicates how to merge haplotypes produced using different kmerSizes. Only has effect when used in combination with (--likelihooCalculationEngine GraphBased) COMBO_MIN : use the smallest kmerSize with all haplotypes. 
COMBO_MAX : use the larger kmerSize with all haplotypes. MIN_ONLY : use the smallest kmerSize with haplotypes assembled using it. MAX_ONLY : use the larger kmerSize with haplotypes asembled using it. Major code changes: =================== * Introduce multiple likelihood calculation engines (before there was just one). * Assembly results from different kmerSies are now packed together using the AssemblyResultSet class. * Added yet another PairHMM implementation with a different API in order to spport local PairHMM calculations, (e.g. a segment of the read vs a segment of the haplotype). Major components: ================ * FastLoglessPairHMM: New pair-hmm implemtation using some heuristic to speed up partial PairHMM calculations * GraphBasedLikelihoodCalculationEngine: delegates onto GraphBasedLikelihoodCalculationEngineInstance the exectution of the graph-based likelihood approach. * GraphBasedLikelihoodCalculationEngineInstance: one instance per active-region, implements the graph traversals to calcualte the likelihoods using the graph as an scafold. * HaplotypeGraph: haplotype threading graph where build from the assembly haplotypes. This structure is the one used by GraphBasedLikelihoodCalculationEngineInstance to do its work. * ReadAnchoring and KmerSequenceGraphMap: contain information as how a read map on the HaplotypeGraph that is used by GraphBasedLikelihoodCalcuationEngineInstance to do its work. Remove mergeCommonChains from HaplotypeGraph creation Fixed bamboo issues with HaplotypeGraphUnitTest Fixed probrems with HaplotypeCallerIntegrationTest Fixed issue with GraphLikelihoodVsLoglessAccuracyIntegrationTest Fixed ReadThreadingLikelihoodCalculationEngine issues Moved event-block iteration outside GraphBased*EngineInstance Removed unecessary parameter from ReadAnchoring constructor. 
Fixed test problem Added a bit more documentation to EventBlockSearchEngine Fixing some private - protected dependency issues Further refactoring making GraphBased*Instance and HaplotypeGraph slimmer. Addressed last pull request commit comments Fixed FastLoglessPairHMM public -> protected dependency Fixed probrem with HaplotypeGraph unit test
2013-11-19 01:07:59 +08:00
public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta";

// Root of the shared GATK test-data tree; the locations below are subdirectories of it.
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";
public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/";
public static final String annotationDataLocation = GATKDataLocation + "Annotations/";

// Well-known BAM/VCF inputs used across many integration tests.
// NOTE(review): validationDataLocation already ends in "/", so these paths contain
// a doubled slash ("...Validation_Data//CEUTrio..."); harmless on POSIX but worth confirming.
public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf";
public static final String dbsnpDataLocation = GATKDataLocation;
2011-08-07 22:33:20 +08:00
public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf";
2011-08-07 23:26:07 +08:00
public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf";
public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf";
public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf";
Many updates to SelectVariants : 1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work) 1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1) 2) Discordance and Concordance now work in combination with all other parameters. 3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation. 4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information. 5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample. 6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree. --- Integration tests: 1) Discordance and concordance test added 2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples. --- Methods for handling sample expressions and files with list of samples were added to SampleUtils. 
I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-24 04:18:45 +08:00
// HapMap 3.3 comparison data and exome interval lists.
public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/";
public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf";
public static final String intervalsLocation = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/";
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
public static final String hg19Chr20Intervals = GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";

// When true, the static initializer aborts startup if the shared reference files are unreachable.
public static final boolean REQUIRE_NETWORK_CONNECTION = false;

// Scratch space on the networked filesystem; networkTempDirFile is assigned in the
// static initializer (null when the root does not exist on this machine).
private static final String networkTempDirRoot = "/broad/hptmp/";
private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists();
private static final File networkTempDirFile;

// Repo-relative test-data directories, resolved to absolute paths at class-load time.
// The *Root variants strip the relative suffix, yielding the checkout root.
private static final String privateTestDirRelative = "private/testdata/";
public static final String privateTestDir = new File(privateTestDirRelative).getAbsolutePath() + "/";
protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, "");
private static final String publicTestDirRelative = "public/testdata/";
public static final String publicTestDir = new File(publicTestDirRelative).getAbsolutePath() + "/";
protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, "");

// GATK key material used by tests that exercise the licensing/crypto machinery.
public static final String keysDataLocation = validationDataLocation + "keys/";
public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key";
public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta";
Working version of HaplotypeCaller ReferenceConfidenceModel that accounts for indels as well as SNP confidences -- Assembly graph building now returns an object that describes whether the graph was successfully built and has variation, was succesfully built but didn't have variation, or truly failed in construction. Fixing an annoying bug where you'd prefectly assembly the sequence into the reference graph, but then return a null graph because of this, and you'd increase your kmer because it null was also used to indicate assembly failure -- -- Output format looks like: 20 10026072 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,120 20 10026073 . A <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,119 20 10026074 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,121 20 10026075 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,119 20 10026076 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,120 20 10026077 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:3,0:3:9:0,9,120 20 10026078 . C <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:5,0:5:15:0,15,217 20 10026079 . A <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:6,0:6:18:0,18,240 20 10026080 . G <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:6,0:6:18:0,18,268 20 10026081 . T <NON_REF> . . . GT:AD:DP:GQ:PL 0/0:7,0:7:21:0,21,267 We use a symbolic allele to indicate that the site is hom-ref, and because we have an ALT allele we can provide AD and PL field values. Currently these are calculated as ref vs. any non-ref value (mismatch or insertion) but doesn't yet account properly for alignment uncertainty. -- Can we enabled for single samples with --emitRefConfidence (-ERC). -- This is accomplished by realigning the each read to its most likley haplotype, and then evaluting the resulting pileups over the active region interval. 
The realignment is done by the HaplotypeBAMWriter, which now has a generalized interface that lets us provide a ReadDestination object so we can capture the realigned reads -- Provide access to the more raw LocusIteratorByState constructor so we can more easily make them programmatically without constructing lots of misc. GATK data structures. Moved the NO_DOWNSAMPLING constant from LIBSDownsamplingInfo to LocusIteratorByState so clients can use it without making LIBSDownsamplingInfo a public class. -- Includes GVCF writer -- Add 1 mb of WEx data to private/testdata -- Integration tests for reference model output for WGS and WEx data -- Emit GQ block information into VCF header for GVCF mode -- OutputMode from StandardCallerArgumentCollection moved to UnifiedArgumentCollection as its no longer relevant for HC -- Control max indel size for the reference confidence model from the command line. Increase default to 10 -- Don't use out_mode in HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest -- Unittests for ReferenceConfidenceModel -- Unittests for new MathUtils functions
2013-06-20 03:07:35 +08:00
// Real sequencing inputs used by HaplotypeCaller reference-confidence tests.
public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam";
public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam";

// Constant-first equals() so an unset "pipeline.run" system property yields false
// instead of throwing an NPE (and thus ExceptionInInitializerError) during class init.
public static final boolean pipelineTestRunModeIsSet = "run".equals(System.getProperty("pipeline.run"));
/** Before any test runs: configure logging and locate network/reference resources. */
static {
    // Route log output to the console with a test-friendly message layout.
    CommandLineUtils.configureConsoleLogging();
    final PatternLayout testLayout = new PatternLayout();
    testLayout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n");
    CommandLineUtils.setLayout(logger, testLayout);

    // Keep test output quiet: warnings and above only.
    logger.setLevel(Level.WARN);

    // Create a per-user scratch directory on the network filesystem, if available.
    if (networkTempDirRootExists) {
        final File userScratchRoot = new File(networkTempDirRoot + System.getProperty("user.name"));
        networkTempDirFile = IOUtils.tempDir("temp.", ".dir", userScratchRoot);
        networkTempDirFile.deleteOnExit();
    } else {
        networkTempDirFile = null;
    }

    // Optionally fail fast when the shared reference files cannot be reached.
    if ( REQUIRE_NETWORK_CONNECTION
            && (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) ) {
        logger.fatal("We can't locate the reference directories. Aborting!");
        throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories");
    }
}
/**
* Simple generic utility class to creating TestNG data providers:
*
* 1: inherit this class, as in
*
* private class SummarizeDifferenceTest extends TestDataProvider {
* public SummarizeDifferenceTest() {
* super(SummarizeDifferenceTest.class);
* }
* ...
* }
*
* Provide a reference to your class to the TestDataProvider constructor.
*
* 2: Create instances of your subclass. Return from it the call to getTests, providing
* the class type of your test
*
* <code>
* {@literal @}DataProvider(name = "summaries")
* public Object[][] createSummaries() {
* new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2");
* new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1");
* return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class);
* }
* </code>
*
* This class magically tracks created objects of this
*/
public static class TestDataProvider {
private static final Map<Class, List<Object>> tests = new HashMap<>();
protected String name;
/**
* Create a new TestDataProvider instance bound to the class variable C
*/
2011-09-23 05:04:32 +08:00
public TestDataProvider(Class c, String name) {
if ( ! tests.containsKey(c) )
tests.put(c, new ArrayList<>());
tests.get(c).add(this);
2011-09-23 05:04:32 +08:00
this.name = name;
}
public TestDataProvider(Class c) {
this(c, "");
}
public void setName(final String name) {
this.name = name;
}
/**
* Return all of the data providers in the form expected by TestNG of type class C
* @param c
* @return
*/
public static Object[][] getTests(Class c) {
List<Object[]> params2 = new ArrayList<Object[]>();
for ( Object x : tests.get(c) ) params2.add(new Object[]{x});
return params2.toArray(new Object[][]{});
}
2011-09-23 05:04:32 +08:00
@Override
public String toString() {
return "TestDataProvider("+name+")";
}
}
/**
 * test if the file exists
 *
 * @param file name as a string
 * @return true if it exists
 */
public static boolean fileExist(String file) {
    // A plain existence probe; no need to keep the File object around.
    return new File(file).exists();
}
/**
 * this appender looks for a specific message in the log4j stream.
 * It can be used to verify that a specific message was generated to the logging system.
 */
public static class ValidationAppender extends AppenderSkeleton {
    /** becomes true once the target message has been observed */
    private boolean foundString = false;
    /** the exact message being watched for */
    private final String targetString;

    public ValidationAppender(String target) {
        targetString = target;
    }

    @Override
    protected void append(LoggingEvent loggingEvent) {
        // Objects.equals is null-safe: a null log message no longer throws NPE here.
        if (Objects.equals(loggingEvent.getMessage(), targetString))
            foundString = true;
    }

    @Override
    public void close() {
        // do nothing
    }

    @Override
    public boolean requiresLayout() {
        return false;
    }

    /** @return true if the target message was seen at least once */
    public boolean foundString() {
        return foundString;
    }
}
/**
 * Creates a temp file that will be deleted on exit after tests are complete.
 * @param name Prefix of the file.
 * @param extension Extension to concat to the end of the file.
 * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits.
 */
public static File createTempFile(String name, String extension) {
    final File tmp;
    try {
        tmp = File.createTempFile(name, extension);
    } catch (final IOException ex) {
        throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex);
    }
    tmp.deleteOnExit();
    return tmp;
}
/**
 * Creates a temp file that will be deleted on exit after tests are complete.
 * @param name Name of the file.
 * @return A file in the network temporary directory with name, which will be deleted after the program exits.
 * @throws SkipException when the network is not available.
 */
public static File tryCreateNetworkTempFile(String name) {
    // Skip (rather than fail) tests that need the network scratch area when it is absent.
    if (!networkTempDirRootExists) {
        throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot);
    }
    final File result = new File(networkTempDirFile, name);
    result.deleteOnExit();
    return result;
}
/**
 * Log this message so that it shows up inline during output as well as in html reports
 *
 * @param message the text to record; passing true to Reporter.log also echoes it to stdout
 */
public static void log(final String message) {
    Reporter.log(message, true);
}
Phase I commit to get shadowBCFs passing tests -- The GATK VCFWriter now enforces by default that all INFO, FILTER, and FORMAT fields be properly defined in the header. This helps avoid some of the low-level errors I saw in SelectVariants. This behavior can be disable in the engine with the --allowMissingVCFHeaders argument -- Fixed broken annotations in TandemRepeat, which were overwriting AD instead of defining RPA -- Optimizations to VariantEval, removing some obvious low-hanging fruit all in the subsetting of variants by sample -- SelectVariants header fixes -- Was defining DP for the info field as a FORMAT field, as for AC, AF, and AN original -- Performance optimizations in BCF2 codec and writer -- using arrays not lists for intermediate data structures -- Create once and reuse an array of GenotypeBuilders for the codec, avoiding reallocating this data structure over and over -- VCFHeader (which needs a complete rewrite, FYI Eric) -- Warn and fix on the way flag values with counts > 0 -- GenotypeSampleNames are now stored as a List as they are ordered, and the set iteration was slow. Duplicates are detected once at header creation. -- Explicitly track FILTER fields for efficient lookup in their own hashmap -- Automatically add PL field when we see a GL field and no PL field -- Added get and has methods for INFO, FILTER, and FORMAT fields -- No longer add AC and AF values to the INFO field when there's no ALT allele -- Memory efficient comparison of VCF and BCF files for shadow BCF testing. Now there's no (memory) constraint on the size of the files we can compare -- Because of VCF's limited floating point resolution we can only use 1 sig digit for comparing doubles between BCF and VCF
2012-06-16 02:25:00 +08:00
/** Default absolute/relative tolerance used by the assertEqualsDoubleSmart family. */
private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1;

/** Asserts that actual is a Double approximately equal to expected, using the default tolerance. */
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) {
    Assert.assertTrue(actual instanceof Double, "Not a double");
    assertEqualsDoubleSmart(((Double) actual).doubleValue(), expected.doubleValue());
}

/** Asserts that actual is a Double approximately equal to expected, within the given tolerance. */
public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) {
    Assert.assertTrue(actual instanceof Double, "Not a double");
    assertEqualsDoubleSmart(((Double) actual).doubleValue(), expected.doubleValue(), tolerance);
}

/** Asserts approximate equality of two doubles using the default tolerance. */
public static final void assertEqualsDoubleSmart(final double actual, final double expected) {
    assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE);
}
/** Asserts that two sets contain the same elements, reporting info on failure. */
public static final <T> void assertEqualsSet(final Set<T> actual, final Set<T> expected, final String info) {
    // Copy into fresh HashSets before comparing: works around a TestNG bug with direct set comparisons.
    final Set<T> lhs = new HashSet<T>(actual);
    final Set<T> rhs = new HashSet<T>(expected);
    Assert.assertTrue(lhs.equals(rhs), info);
}
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) {
    assertEqualsDoubleSmart(actual, expected, tolerance, null);
}

/**
 * Asserts approximate equality of two doubles: passes when either the absolute
 * difference or the relative difference is within tolerance. NaN matches only NaN,
 * and an infinite expected value must be matched exactly, including sign.
 *
 * @param actual    observed value
 * @param expected  expected value
 * @param tolerance allowed absolute or relative difference for finite values
 * @param message   optional extra context appended to the failure message (may be null)
 */
public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) {
    if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately
        Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not");
    else if ( Double.isInfinite(expected) )
        // == compares infinities exactly; the old isInfinite(actual) check let
        // -Infinity pass when +Infinity was expected (and vice versa).
        Assert.assertTrue(actual == expected, "expected is " + expected + ", actual is " + actual);
    else {
        final double delta = Math.abs(actual - expected);
        final double ratio = Math.abs(actual / expected - 1.0);
        Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual
                + " not within tolerance " + tolerance
                + (message == null ? "" : " message: " + message)); // added separator before message
    }
}
/**
 * Asserts that two VariantContexts are equal field by field: position, id, alleles,
 * attributes, filter status, phred-scaled qual, and (when genotypes are present)
 * every per-sample genotype.
 *
 * @param actual   the VariantContext produced by the code under test (must not be null)
 * @param expected the VariantContext holding the expected values
 */
public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) {
    Assert.assertNotNull(actual, "VariantContext expected not null");
    // Positional identity.
    Assert.assertEquals(actual.getChr(), expected.getChr(), "chr");
    Assert.assertEquals(actual.getStart(), expected.getStart(), "start");
    Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end");
    Assert.assertEquals(actual.getID(), expected.getID(), "id");
    Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual);
    assertAttributesEquals(actual.getAttributes(), expected.getAttributes());
    // Filter state: applied-ness, overall pass/fail, and the exact filter set.
    Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied");
    Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered");
    assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters");
    // Qual is a double: compare approximately, not exactly.
    assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual());
    Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes");
    if ( expected.hasGenotypes() ) {
        assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set");
        Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names");
        final Set<String> samples = expected.getSampleNames();
        // Compare each sample's genotype individually for precise failure messages.
        for ( final String sample : samples ) {
            assertGenotypesAreEqual(actual.getGenotype(sample), expected.getGenotype(sample));
        }
    }
}
public static void assertVariantContextStreamsAreEqual(final Iterable<VariantContext> actual, final Iterable<VariantContext> expected) {
final Iterator<VariantContext> actualIT = actual.iterator();
final Iterator<VariantContext> expectedIT = expected.iterator();
while ( expectedIT.hasNext() ) {
final VariantContext expectedVC = expectedIT.next();
if ( expectedVC == null )
continue;
VariantContext actualVC;
do {
Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual");
actualVC = actualIT.next();
} while ( actualIT.hasNext() && actualVC == null );
if ( actualVC == null )
Assert.fail("Too few records in actual");
assertVariantContextsAreEqual(actualVC, expectedVC);
}
Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual");
}
public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) {
Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names");
Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles");
Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string");
Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type");
// filters are the same
Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields");
Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered");
// inline attributes
Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp");
Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD()));
Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq");
Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL");
Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD");
Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ");
Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP");
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods");
Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString");
Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods");
Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL()));
Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual");
assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes());
Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased");
Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy");
}
public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) {
Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines");
// for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted?
//Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder());
final List<VCFHeaderLine> actualLines = new ArrayList<VCFHeaderLine>(actual.getMetaDataInSortedOrder());
final List<VCFHeaderLine> expectedLines = new ArrayList<VCFHeaderLine>(expected.getMetaDataInSortedOrder());
for ( int i = 0; i < actualLines.size(); i++ ) {
Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines");
}
}
public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException {
final Pair<VCFHeader, GATKVCFUtils.VCIterable<LineIterator>> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec());
final Pair<VCFHeader, GATKVCFUtils.VCIterable<PositionalBufferedStream>> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec());
assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst());
assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond());
}
private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
if ( expected instanceof Double ) {
// must be very tolerant because doubles are being rounded to 2 sig figs
assertEqualsDoubleSmart(actual, (Double) expected, 1e-2);
} else
Assert.assertEquals(actual, expected, "Attribute " + key);
}
    /**
     * Asserts that two attribute maps are equivalent.  List-valued attributes are
     * compared element-by-element and Double values with a loose tolerance (both
     * via assertAttributeEquals).  A key bound to null (or an otherwise "missing"
     * value, see isMissing) in one map is allowed to be absent from the other.
     *
     * @param actual   attribute map produced by the code under test
     * @param expected attribute map we expect
     */
    private static void assertAttributesEquals(final Map<String, Object> actual, Map<String, Object> expected) {
        // tracks the expected keys not yet matched against an actual entry
        final Set<String> expectedKeys = new HashSet<String>(expected.keySet());
        for ( final Map.Entry<String, Object> act : actual.entrySet() ) {
            final Object actualValue = act.getValue();
            if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) {
                final Object expectedValue = expected.get(act.getKey());
                if ( expectedValue instanceof List ) {
                    // list-valued attribute: compare sizes, then pairwise elements
                    final List<Object> expectedList = (List<Object>)expectedValue;
                    Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't");
                    final List<Object> actualList = (List<Object>)actualValue;
                    Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size");
                    for ( int i = 0; i < expectedList.size(); i++ )
                        assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i));
                } else
                    assertAttributeEquals(act.getKey(), actualValue, expectedValue);
            } else {
                // it's ok to have a binding in x -> null that's absent in y
                Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other");
            }
            // mark this expected key as matched (no-op for keys only in actual)
            expectedKeys.remove(act.getKey());
        }

        // now expectedKeys contains only the keys found in expected but not in actual,
        // and they must all be null
        for ( final String missingExpected : expectedKeys ) {
            final Object value = expected.get(missingExpected);
            Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" );
        }
    }
private static final boolean isMissing(final Object value) {
if ( value == null ) return true;
else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true;
else if ( value instanceof List ) {
// handles the case where all elements are null or the list is empty
for ( final Object elt : (List)value)
if ( elt != null )
return false;
return true;
} else
return false;
}
/**
* Checks whether two double array contain the same values or not.
* @param actual actual produced array.
* @param expected expected array.
* @param tolerance maximum difference between double value to be consider equivalent.
*/
protected static void assertEqualsDoubleArray(final double[] actual, final double[] expected, final double tolerance) {
if (expected == null)
Assert.assertNull(actual);
else {
Assert.assertNotNull(actual);
Assert.assertEquals(actual.length,expected.length,"array length");
}
for (int i = 0; i < actual.length; i++)
Assert.assertEquals(actual[i],expected[i],tolerance,"array position " + i);
}
}