General WalkerTest framework. Includes some minor changes to GATK core to enable creation of true command-line like GATK modules in the code. Extensive first-pass tests for SSG

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1538 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-09-04 19:13:37 +00:00
parent 471ca8201e
commit 2b0d1c52b2
9 changed files with 307 additions and 131 deletions

View File

@ -45,6 +45,8 @@ import java.util.Arrays;
* @author aaron
*/
public abstract class CommandLineExecutable extends CommandLineProgram {
public Object GATKResult = null;
/**
* The actual engine which performs the analysis.
@ -70,6 +72,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
* @return the return code to exit the program with
*/
protected int execute() {
GATKResult = executeGATK();
return 0;
}
protected Object executeGATK() {
GATKArgumentCollection arguments = getArgumentCollection();
Walker<?,?> mWalker = GATKEngine.getWalkerByName(getAnalysisName());
@ -83,9 +90,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
// set the analysis name in the argument collection
arguments.analysisName = getAnalysisName();
GATKEngine.execute(arguments, mWalker);
return 0;
return GATKEngine.execute(arguments, mWalker);
}
/**

View File

@ -66,6 +66,7 @@ public class CommandLineGATK extends CommandLineExecutable {
try {
CommandLineGATK instance = new CommandLineGATK();
start(instance, argv);
System.exit(CommandLineProgram.result); // todo -- this is a painful hack
} catch (Exception e) {
exitSystemWithError(e);
}

View File

@ -56,7 +56,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
if (Types.containsKey(boundName)) {
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
}
logger.info(String.format("* Adding rod class %s%n", name));
logger.info(String.format("* Adding rod class %s", name));
Types.put(boundName, new RODBinding(name, rodType));
}

View File

@ -41,6 +41,8 @@ public abstract class GenotypeLikelihoods implements Cloneable {
private final static int FIXED_PLOIDY = 2;
private final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
protected boolean enableCacheFlag = true;
//
// The three fundamental data arrays associated with a Genotype Likelhoods object
//
@ -175,8 +177,12 @@ public abstract class GenotypeLikelihoods implements Cloneable {
*
* @return true if caching should be enabled, false otherwise
*/
protected boolean enableCache() {
return true;
public boolean cacheIsEnabled() {
return enableCacheFlag;
}
public void setEnableCacheFlag(boolean enable) {
enableCacheFlag = enable;
}
/**
@ -247,7 +253,7 @@ public abstract class GenotypeLikelihoods implements Cloneable {
if ( qualityScore > getMinQScoreToInclude() ) {
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
boolean enableCache = enableCacheArg && enableCache();
boolean enableCache = enableCacheArg && cacheIsEnabled();
GenotypeLikelihoods cached = null;
if ( enableCache ) {
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) {

View File

@ -72,6 +72,8 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
@Override
public boolean equals(Object other) {
lazyEval();
if (other == null)
return false;
if (other instanceof SSGenotypeCall) {
@ -89,11 +91,29 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
}
public String toString() {
return String.format("%s ref=%s depth=%d rmsMAPQ=%.2f likelihoods=%s",
getLocation(), mRefBase, mPileup.getReads().size(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods()));
lazyEval();
return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError = %.2f, likelihoods=%s",
getLocation(), mGenotype, mCompareTo, mRefBase, mPileup.getReads().size(),
getNegLog10PError(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods()));
}
private void lazyEval() {
// us
if (mGenotype == null) {
Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors());
mGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]];
}
// our comparison
if (mCompareTo == null) {
if (this.mBestVrsRef) {
mCompareTo = DiploidGenotype.valueOf(Utils.dupString(this.getReference(),2));
} else {
Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors());
mCompareTo = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]];
}
}
}
/**
@ -112,10 +132,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
* get the best genotype
*/
public DiploidGenotype getBestGenotype() {
if (mGenotype == null) {
Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors());
mGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]];
}
lazyEval();
return mGenotype;
}
@ -123,14 +140,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
* get the alternate genotype
*/
public DiploidGenotype getAltGenotype() {
if (mCompareTo == null) {
if (this.mBestVrsRef) {
mCompareTo = DiploidGenotype.valueOf(Utils.dupString(this.getReference(),2));
} else {
Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors());
mCompareTo = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]];
}
}
lazyEval();
return mCompareTo;
}

View File

@ -44,6 +44,10 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, SingleSam
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false)
public boolean disableCache = false;
public class CallResult {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
@ -94,6 +98,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, SingleSam
GenotypeLikelihoods gl = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
gl.setVerbose(VERBOSE);
gl.setEnableCacheFlag(! disableCache);
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
gl.add(pileup, true);
gl.validate();

View File

@ -80,7 +80,7 @@ public abstract class CommandLineProgram {
* this is used to indicate if they've asked for help
*/
@Argument(fullName="help",shortName="h",doc="Generate this help message",required=false)
public Boolean help = false;
public Boolean help = false;
/**
* our logging output patterns
@ -139,6 +139,8 @@ public abstract class CommandLineProgram {
BasicConfigurator.configure();
}
public static int result = 0;
/**
* This function is called to start processing the command line, and kick
* off the execute message of the program.
@ -233,10 +235,10 @@ public abstract class CommandLineProgram {
clp.setupLoggerLevel();
// call the execute
int result = clp.execute();
CommandLineProgram.result = clp.execute();
// return the result
System.exit(result);
//System.exit(result); // todo -- is this safe -- why exit here? I want to run the GATK like normal
}
catch (ArgumentException e) {
clp.parser.printHelp( clp.getApplicationDetails() );

View File

@ -0,0 +1,123 @@
package org.broadinstitute.sting;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.Utils;
import org.junit.Test;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import java.io.*;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import java.security.MessageDigest;
import java.math.BigInteger;
import junit.framework.Assert;
public class WalkerTest extends BaseTest {
public String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5 ) {
try {
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
BigInteger bigInt = new BigInteger(1, thedigest);
String filemd5sum = bigInt.toString(16);
if ( parameterize() || expectedMD5.equals("") ) {
logger.warn(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b",
name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5)));
} else {
logger.warn(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5));
Assert.assertEquals(name + "Mismatching MD5s", expectedMD5, filemd5sum);
logger.warn(String.format(" => %s PASSED", name));
}
return filemd5sum;
} catch ( Exception e ) {
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile);
}
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s ) {
List<String> md5s = new ArrayList<String>();
for ( int i = 0; i < resultFiles.size(); i++ ) {
String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i));
md5s.add(i, md5);
}
return md5s;
}
public static byte[] getBytesFromFile(File file) throws IOException {
InputStream is = new FileInputStream(file);
// Get the size of the file
long length = file.length();
if (length > Integer.MAX_VALUE) {
// File is too large
}
// Create the byte array to hold the data
byte[] bytes = new byte[(int)length];
// Read in the bytes
int offset = 0;
int numRead = 0;
while (offset < bytes.length
&& (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) {
offset += numRead;
}
// Ensure all the bytes have been read in
if (offset < bytes.length) {
throw new IOException("Could not completely read file "+file.getName());
}
// Close the input stream and return bytes
is.close();
return bytes;
}
public class WalkerTestSpec {
String args = "";
int nOutputFiles = -1;
List<String> md5s = null;
public WalkerTestSpec(String args, int nOutputFiles, List<String> md5s) {
this.args = args;
this.nOutputFiles = nOutputFiles;
this.md5s = md5s;
}
}
protected boolean parameterize() {
return false;
}
protected List<String> executeTest(final String name, WalkerTestSpec spec) {
List<File> tmpFiles = new ArrayList<File>();
for ( int i = 0; i < spec.nOutputFiles; i++ ) {
try {
tmpFiles.add( File.createTempFile(String.format("walktest.tmp_param.%d", i), ".tmp" ) );
} catch (IOException ex) {
System.err.println("Cannot create temp file: " + ex.getMessage());
}
}
final String args = String.format(spec.args, tmpFiles.toArray());
logger.warn(Utils.dupString('-', 80));
logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args));
CommandLineGATK instance = new CommandLineGATK();
CommandLineExecutable.start(instance, args.split(" "));
return assertMatchingMD5s(name, tmpFiles, spec.md5s);
}
@Test
public void testWalkerTest() {
logger.warn("WalkerTest is just a framework");
}
}

View File

@ -1,140 +1,164 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.genotype.Variant;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.junit.Test;
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.executive.Accumulator;
import java.io.*;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.*;
import java.security.MessageDigest;
import java.math.BigInteger;
import junit.framework.Assert;
public class SingleSampleGenotyperTest extends BaseTest {
private final static double DELTA = 1e-8;
private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam";
private static String gatkargs(final String bam) {
final String GATKArgs =
"<GATK-argument-collection>\n" +
" <sam-files class=\"java.util.ArrayList\">\n" +
" <file>%s</file>\n" +
" </sam-files>\n" +
" <reference-file>/broad/1KG/reference/human_b36_both.fasta</reference-file>\n" +
" <analysis-name>SingleSampleGenotyper</analysis-name>\n" +
"</GATK-argument-collection>";
return String.format(GATKArgs, bam);
public class SingleSampleGenotyperTest extends WalkerTest {
public static String baseTestString() {
return "-T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s";
}
private static class ExpectedResult {
long nCalls;
String md5sum;
public ExpectedResult(long calls, String md5) {
this.nCalls = calls;
this.md5sum = md5;
}
public static String testGeliLod5() {
return baseTestString() + " --variant_output_format GELI -lod 5";
}
public static EnumMap<BaseMismatchModel, ExpectedResult> expectedOutput = new EnumMap<BaseMismatchModel, ExpectedResult>(BaseMismatchModel.class);
private static String OneMb1StateMD5 = "d5404668e76f206055f03d97162ea6d9";
private static String OneMb3StateMD5 = "46fb7b66da3dac341e9c342f751d74cd";
private static String OneMbEmpiricalMD5 = "ea0be2fd074a6c824a0670ad5b3e0aca";
static {
expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9"));
expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd"));
expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca"));
}
//private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected
public static long nExpectedCalls(BaseMismatchModel model) {
return expectedOutput.get(model).nCalls;
}
// public static double nExpectedCallsTolerance(long nBasesCalled) {
// return nExpectedCalls(nBasesCalled) * callingTolerance;
// private static String oneMbMD5(BaseMismatchModel m) {
// switch (m) {
// case ONE_STATE: return OneMb1StateMD5;
// case THREE_STATE: return OneMb3StateMD5;
// case EMPIRICAL: return OneMbEmpiricalMD5;
// default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
// }
// }
public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) {
Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls);
}
// Uncomment to not check outputs against expectations
//protected boolean parameterize() {
// return true;
//}
public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) {
try {
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
BigInteger bigInt = new BigInteger(1, thedigest);
String filemd5sum = bigInt.toString(16);
Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum);
} catch ( Exception e ) {
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile);
// --------------------------------------------------------------------------------------------------------------
//
// testing the cache
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testCache() {
for ( BaseMismatchModel model : BaseMismatchModel.values() ) {
// calculated the expected value without the cache enabled
WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1,
Arrays.asList(""));
List<String> withoutCache = executeTest("empirical1MbTest", withoutCacheSpec );
WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1,
withoutCache);
executeTest(String.format("testCache[%s]", model), withCacheSpec );
}
}
public static byte[] getBytesFromFile(File file) throws IOException {
InputStream is = new FileInputStream(file);
// Get the size of the file
long length = file.length();
if (length > Integer.MAX_VALUE) {
// File is too large
}
// Create the byte array to hold the data
byte[] bytes = new byte[(int)length];
// Read in the bytes
int offset = 0;
int numRead = 0;
while (offset < bytes.length
&& (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) {
offset += numRead;
}
// Ensure all the bytes have been read in
if (offset < bytes.length) {
throw new IOException("Could not completely read file "+file.getName());
}
// Close the input stream and return bytes
is.close();
return bytes;
// --------------------------------------------------------------------------------------------------------------
//
// testing genotype mode
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void genotypeTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-10,100,000 -m empirical --genotype", 1,
Arrays.asList("7e5dec6481bbbc890493925da9a8f691"));
executeTest("genotypeTest", spec);
}
private void oneMBCallTest(BaseMismatchModel model) {
logger.warn("Executing oneMBCallTest for " + model);
// --------------------------------------------------------------------------------------------------------------
//
// basic base calling models
//
// --------------------------------------------------------------------------------------------------------------
InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes());
GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream);
mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000");
SingleSampleGenotyper SSG = new SingleSampleGenotyper();
SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
SSG.LOD_THRESHOLD = 5.0;
try {
SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" );
} catch (IOException ex) {
System.err.println("Cannot create temp file: " + ex.getMessage());
}
SSG.baseModel = model;
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel );
Accumulator obj = (Accumulator)engine.execute(mycoll, SSG);
SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal();
logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases));
assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE);
assertGoodNumberOfCalls("testBasic", model, calls);
@Test
public void oneState100bpTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( testGeliLod5() + " -L 1:10,000,000-10,000,100 -m one_state", 1, Arrays.asList("3cd402d889c015be4a318123468f4262"));
executeTest("oneState100bpTest", spec);
}
@Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); }
@Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); }
@Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); }
@Test
public void oneState1MbTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-11,000,000 -m one_state",
1, Arrays.asList(OneMb1StateMD5));
executeTest("oneState1MbTest", spec);
}
@Test
public void threeState1MbTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-11,000,000 -m three_state", 1,
Arrays.asList(OneMb3StateMD5));
executeTest("threeState1MbTest", spec);
}
@Test
public void empirical1MbTest() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-11,000,000 -m empirical", 1,
Arrays.asList(OneMbEmpiricalMD5));
executeTest("empirical1MbTest", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing output formats
//
// --------------------------------------------------------------------------------------------------------------
//@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false)
//public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
// --------------------------------------------------------------------------------------------------------------
//
// testing LOD thresholding
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testLOD() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 10.0, "e4c51dca6f1fa999f4399b7412829534" );
e.put( 3.0, "d804c24d49669235e3660e92e664ba1a" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod " + entry.getKey(), 1,
Arrays.asList(entry.getValue()));
executeTest("testLOD", spec);
}
}
// --------------------------------------------------------------------------------------------------------------
//
// testing hetero setting
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity() {
HashMap<Double, String> e = new HashMap<Double, String>();
e.put( 0.01, "ca9986b32aac0d6ad6058f4bf10e7df2" );
e.put( 0.0001, "55d4e3e73215b70b22a8e689a4e16d37" );
e.put( 1.0 / 1850, "1ae2126f1a6490d6edd15d95bce726c4" );
for ( Map.Entry<Double, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
testGeliLod5() + " -L 1:10,000,000-11,000,000 -m EMPIRICAL --heterozygosity " + entry.getKey(), 1,
Arrays.asList(entry.getValue()));
executeTest(String.format("testHeterozyosity[%s]", entry.getKey()), spec);
}
}
}