2009-04-15 01:18:16 +08:00
|
|
|
package org.broadinstitute.sting;
|
|
|
|
|
|
2011-01-25 12:11:49 +08:00
|
|
|
import org.apache.commons.io.FileUtils;
|
2009-04-15 01:18:16 +08:00
|
|
|
import org.apache.log4j.*;
|
2010-06-03 01:45:51 +08:00
|
|
|
import org.apache.log4j.spi.LoggingEvent;
|
2010-11-19 04:22:01 +08:00
|
|
|
import org.broadinstitute.sting.commandline.CommandLineUtils;
|
2011-07-12 11:10:27 +08:00
|
|
|
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine;
|
2010-09-12 23:07:38 +08:00
|
|
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
2011-01-25 12:11:49 +08:00
|
|
|
import org.testng.Assert;
|
2009-04-15 01:18:16 +08:00
|
|
|
|
Many updates to SelectVariants :
1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work)
1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1)
2) Discordance and Concordance now work in combination with all other parameters.
3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation.
4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information.
5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample.
6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree.
---
Integration tests:
1) Discordance and concordance test added
2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples.
---
Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-24 04:18:45 +08:00
|
|
|
import javax.swing.*;
|
2010-06-22 14:31:05 +08:00
|
|
|
import java.io.*;
|
|
|
|
|
import java.math.BigInteger;
|
|
|
|
|
import java.security.MessageDigest;
|
|
|
|
|
import java.security.NoSuchAlgorithmException;
|
2011-07-18 22:46:01 +08:00
|
|
|
import java.util.*;
|
2009-04-15 01:18:16 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
*
|
|
|
|
|
* User: aaron
|
|
|
|
|
* Date: Apr 14, 2009
|
|
|
|
|
* Time: 10:24:30 AM
|
|
|
|
|
*
|
|
|
|
|
* The Broad Institute
|
|
|
|
|
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
|
|
|
|
* This software and its documentation are copyright 2009 by the
|
|
|
|
|
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
|
|
|
|
*
|
|
|
|
|
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
|
|
|
|
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @author aaron
|
|
|
|
|
* @version 1.0
|
|
|
|
|
* @date Apr 14, 2009
|
|
|
|
|
* <p/>
|
|
|
|
|
* Class BaseTest
|
|
|
|
|
* <p/>
|
2009-04-15 01:41:38 +08:00
|
|
|
* This is the base test class for all of our test cases. All test cases should extend from this
|
2009-12-29 15:40:48 +08:00
|
|
|
* class; it sets up the logger, and resolves the location of directories that we rely on.
|
2009-04-15 01:18:16 +08:00
|
|
|
*/
|
2010-11-19 04:22:01 +08:00
|
|
|
@SuppressWarnings("unchecked")
|
2009-04-15 01:18:16 +08:00
|
|
|
public abstract class BaseTest {
|
|
|
|
|
/** Our logger, obtained from CommandLineUtils; it is meant to capture everything logged under org.broadinstitute.sting. */
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final Logger logger = CommandLineUtils.getStingLogger();
|
2009-12-29 15:40:48 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
2011-01-20 04:30:25 +08:00
|
|
|
public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
|
2010-11-19 04:22:01 +08:00
|
|
|
public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta";
|
|
|
|
|
public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta";
|
|
|
|
|
public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/";
|
|
|
|
|
public static final String validationDataLocation = GATKDataLocation + "Validation_Data/";
|
|
|
|
|
public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/";
|
|
|
|
|
public static final String comparisonDataLocation = GATKDataLocation + "Comparisons/";
|
2010-11-23 06:59:42 +08:00
|
|
|
public static final String annotationDataLocation = GATKDataLocation + "Annotations/";
|
|
|
|
|
|
|
|
|
|
public static final String refseqAnnotationLocation = annotationDataLocation + "refseq/";
|
|
|
|
|
public static final String hg18Refseq = refseqAnnotationLocation + "refGene-big-table-hg18.txt";
|
|
|
|
|
public static final String hg19Refseq = refseqAnnotationLocation + "refGene-big-table-hg19.txt";
|
|
|
|
|
public static final String b36Refseq = refseqAnnotationLocation + "refGene-big-table-b36.txt";
|
|
|
|
|
public static final String b37Refseq = refseqAnnotationLocation + "refGene-big-table-b37.txt";
|
|
|
|
|
|
|
|
|
|
public static final String dbsnpDataLocation = GATKDataLocation;
|
2011-08-07 22:33:20 +08:00
|
|
|
public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf";
|
2011-08-07 23:26:07 +08:00
|
|
|
public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf";
|
2011-01-20 04:30:25 +08:00
|
|
|
public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf";
|
2011-08-12 05:04:09 +08:00
|
|
|
public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf";
|
2009-04-15 01:18:16 +08:00
|
|
|
|
Many updates to SelectVariants :
1) There is now a different parameter for sample name (-sn), sample file (-sf) or sample expression (-se). The unexpected behavior of the previous implementation was way too tricky to leave unchecked. (if you had a file or directory named after a sample name, SV wouldn't work)
1b) Fixed a TODO added by Eric -- now the output vcf always has the samples sorted alphabetically regardless of input (this came as a byproduct of the implementation of 1)
2) Discordance and Concordance now work in combination with all other parameters.
3) Discordance now follows Guillermo's suggestion where the discordance track is your VCF and the variant track is the one you are comparing to. I have updated the example in the wiki to reflect this change in interpretation.
4) If you DON'T provide any samples (-sn, -se or -sf), SelectVariants works with all samples from the VCF and ignores sample/genotype information when doing concordance or discordance. That is, it will report every "missing line" or "concordant line" in the two vcfs, regardless of sample or genotype information.
5) When samples are provided (-sn, -se or -sf) discordance and concordance will go down to the genotypes to determine whether or not you have a discordance/concordance event. In this case, a concordance happens only when the two VCFs display the same sample/genotype information for that locus, and discordance happens when the disc track is missing the line or has a different genotype information for that sample.
6) When dealing with multiple samples, concordance only happens if ALL your samples agree, and discordance happens if AT LEAST ONE of your samples disagree.
---
Integration tests:
1) Discordance and concordance test added
2) All other tests updated to comply with the new 'sorted output' format and different inputs for samples.
---
Methods for handling sample expressions and files with list of samples were added to SampleUtils. I recommend *NOT USING* the old getSamplesFromCommandLineInput as this mixing of sample names with expressions and files creates a rogue error that can be challenging to catch.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@6072 348d0f76-0448-11de-a6fe-93d51630548a
2011-06-24 04:18:45 +08:00
|
|
|
public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/";
|
|
|
|
|
public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf";
|
|
|
|
|
public static final String b37hapmapSites = hapmapDataLocation + "sites_r27_nr.b37_fwd.vcf";
|
|
|
|
|
|
2011-02-16 02:26:14 +08:00
|
|
|
public static final String intervalsLocation = GATKDataLocation;
|
|
|
|
|
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
|
|
|
|
|
public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";
|
|
|
|
|
|
2011-02-01 07:13:09 +08:00
|
|
|
public static final String networkTempDir = "/broad/shptmp/";
|
|
|
|
|
public static final File networkTempDirFile = new File(networkTempDir);
|
|
|
|
|
|
2011-06-30 05:36:47 +08:00
|
|
|
public static final String testDir = "public/testdata/";
|
2009-04-15 01:18:16 +08:00
|
|
|
|
|
|
|
|
/** before the class starts up */
|
2010-11-19 04:22:01 +08:00
|
|
|
static {
|
|
|
|
|
// setup a basic log configuration
|
|
|
|
|
CommandLineUtils.configureConsoleLogging();
|
2009-05-20 13:15:27 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// setup our log layout
|
|
|
|
|
PatternLayout layout = new PatternLayout();
|
|
|
|
|
layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n");
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// now set the layout of all the loggers to our layout
|
|
|
|
|
CommandLineUtils.setLayout(logger, layout);
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// Set the Root logger to only output warnings.
|
|
|
|
|
logger.setLevel(Level.WARN);
|
2009-04-15 01:18:16 +08:00
|
|
|
|
2010-11-19 04:22:01 +08:00
|
|
|
// find our file sources
|
|
|
|
|
if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) {
|
|
|
|
|
logger.fatal("We can't locate the reference directories. Aborting!");
|
|
|
|
|
throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories");
|
2009-04-15 01:18:16 +08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-07 09:57:22 +08:00
|
|
|
/**
|
|
|
|
|
* Simple generic utility class to creating TestNG data providers:
|
|
|
|
|
*
|
|
|
|
|
* 1: inherit this class, as in
|
|
|
|
|
*
|
|
|
|
|
* private class SummarizeDifferenceTest extends TestDataProvider {
|
|
|
|
|
* public SummarizeDifferenceTest() {
|
|
|
|
|
* super(SummarizeDifferenceTest.class);
|
|
|
|
|
* }
|
|
|
|
|
* ...
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* Provide a reference to your class to the TestDataProvider constructor.
|
|
|
|
|
*
|
|
|
|
|
* 2: Create instances of your subclass. Return from it the call to getTests, providing
|
|
|
|
|
* the class type of your test
|
|
|
|
|
*
|
2011-07-18 22:46:01 +08:00
|
|
|
* @DataProvider(name = "summaries"
|
2011-07-07 09:57:22 +08:00
|
|
|
* public Object[][] createSummaries() {
|
|
|
|
|
* new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2");
|
|
|
|
|
* new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1");
|
|
|
|
|
* return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class);
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* This class magically tracks created objects of this
|
|
|
|
|
*/
|
|
|
|
|
public static class TestDataProvider {
|
|
|
|
|
private static final Map<Class, List<Object>> tests = new HashMap<Class, List<Object>>();
|
2011-09-23 05:04:32 +08:00
|
|
|
private final String name;
|
2011-07-07 09:57:22 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Create a new TestDataProvider instance bound to the class variable C
|
|
|
|
|
* @param c
|
|
|
|
|
*/
|
2011-09-23 05:04:32 +08:00
|
|
|
public TestDataProvider(Class c, String name) {
|
2011-07-07 09:57:22 +08:00
|
|
|
if ( ! tests.containsKey(c) )
|
|
|
|
|
tests.put(c, new ArrayList<Object>());
|
|
|
|
|
tests.get(c).add(this);
|
2011-09-23 05:04:32 +08:00
|
|
|
this.name = name;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public TestDataProvider(Class c) {
|
|
|
|
|
this(c, "");
|
2011-07-07 09:57:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return all of the data providers in the form expected by TestNG of type class C
|
|
|
|
|
* @param c
|
|
|
|
|
* @return
|
|
|
|
|
*/
|
|
|
|
|
public static Object[][] getTests(Class c) {
|
|
|
|
|
List<Object[]> params2 = new ArrayList<Object[]>();
|
|
|
|
|
for ( Object x : tests.get(c) ) params2.add(new Object[]{x});
|
|
|
|
|
return params2.toArray(new Object[][]{});
|
|
|
|
|
}
|
2011-09-23 05:04:32 +08:00
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public String toString() {
|
|
|
|
|
return "TestDataProvider("+name+")";
|
|
|
|
|
}
|
2011-07-07 09:57:22 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-15 01:18:16 +08:00
|
|
|
/**
|
|
|
|
|
* test if the file exists
|
|
|
|
|
*
|
|
|
|
|
* @param file name as a string
|
|
|
|
|
* @return true if it exists
|
|
|
|
|
*/
|
|
|
|
|
public static boolean fileExist(String file) {
|
|
|
|
|
File temp = new File(file);
|
|
|
|
|
return temp.exists();
|
|
|
|
|
}
|
2010-06-03 01:45:51 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* this appender looks for a specific message in the log4j stream.
|
|
|
|
|
* It can be used to verify that a specific message was generated to the logging system.
|
|
|
|
|
*/
|
|
|
|
|
public static class ValidationAppender extends AppenderSkeleton {
|
|
|
|
|
|
|
|
|
|
private boolean foundString = false;
|
|
|
|
|
private String targetString = "";
|
|
|
|
|
|
|
|
|
|
public ValidationAppender(String target) {
|
|
|
|
|
targetString = target;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
protected void append(LoggingEvent loggingEvent) {
|
|
|
|
|
if (loggingEvent.getMessage().equals(targetString))
|
|
|
|
|
foundString = true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void close() {
|
|
|
|
|
// do nothing
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean requiresLayout() {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean foundString() {
|
|
|
|
|
return foundString;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-01-25 12:11:49 +08:00
|
|
|
/**
|
|
|
|
|
* Creates a temp file that will be deleted on exit after tests are complete.
|
|
|
|
|
* @param name Prefix of the file.
|
|
|
|
|
* @param extension Extension to concat to the end of the file.
|
|
|
|
|
* @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits.
|
|
|
|
|
*/
|
|
|
|
|
public static File createTempFile(String name, String extension) {
|
|
|
|
|
try {
|
|
|
|
|
File file = File.createTempFile(name, extension);
|
|
|
|
|
file.deleteOnExit();
|
|
|
|
|
return file;
|
|
|
|
|
} catch (IOException ex) {
|
|
|
|
|
throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex);
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-02-01 07:13:09 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Creates a temp file that will be deleted on exit after tests are complete.
|
|
|
|
|
* @param name Prefix of the file.
|
|
|
|
|
* @param extension Extension to concat to the end of the file.
|
|
|
|
|
* @return A file in the network temporary directory starting with name, ending with extension, which will be deleted after the program exits.
|
|
|
|
|
*/
|
|
|
|
|
public static File createNetworkTempFile(String name, String extension) {
|
|
|
|
|
try {
|
|
|
|
|
File file = File.createTempFile(name, extension, networkTempDirFile);
|
|
|
|
|
file.deleteOnExit();
|
|
|
|
|
return file;
|
|
|
|
|
} catch (IOException ex) {
|
|
|
|
|
throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex);
|
|
|
|
|
}
|
|
|
|
|
}
|
2009-04-15 01:18:16 +08:00
|
|
|
}
|