From 2b0d1c52b232ffc09ea9a7ede555b3c356334695 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 4 Sep 2009 19:13:37 +0000 Subject: [PATCH] General WalkerTest framework. Includes some minor changes to GATK core to enable creation of true command-line like GATK modules in the code. Extensive first-pass tests for SSG git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1538 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/CommandLineExecutable.java | 11 +- .../sting/gatk/CommandLineGATK.java | 1 + .../gatk/refdata/ReferenceOrderedData.java | 2 +- .../genotyper/GenotypeLikelihoods.java | 12 +- .../walkers/genotyper/SSGenotypeCall.java | 38 +-- .../genotyper/SingleSampleGenotyper.java | 5 + .../utils/cmdLine/CommandLineProgram.java | 8 +- .../org/broadinstitute/sting/WalkerTest.java | 123 +++++++++ .../genotyper/SingleSampleGenotyperTest.java | 238 ++++++++++-------- 9 files changed, 307 insertions(+), 131 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/WalkerTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index d5cb9c0a9..5b44248cb 100644 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -45,6 +45,8 @@ import java.util.Arrays; * @author aaron */ public abstract class CommandLineExecutable extends CommandLineProgram { + public Object GATKResult = null; + /** * The actual engine which performs the analysis. @@ -70,6 +72,11 @@ public abstract class CommandLineExecutable extends CommandLineProgram { * @return the return code to exit the program with */ protected int execute() { + GATKResult = executeGATK(); + return 0; + } + + protected Object executeGATK() { GATKArgumentCollection arguments = getArgumentCollection(); Walker mWalker = GATKEngine.getWalkerByName(getAnalysisName()); @@ -83,9 +90,7 @@ public abstract class CommandLineExecutable extends CommandLineProgram { // set the analysis name in the argument collection arguments.analysisName = getAnalysisName(); - GATKEngine.execute(arguments, mWalker); - - return 0; + return GATKEngine.execute(arguments, mWalker); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 2a945a5c9..db8b1529c 100755 --- a/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -66,6 +66,7 @@ public class CommandLineGATK extends CommandLineExecutable { try { CommandLineGATK instance = new CommandLineGATK(); start(instance, argv); + System.exit(CommandLineProgram.result); // todo -- this is a painful hack } catch (Exception e) { exitSystemWithError(e); } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 44650604a..2371fd02e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -56,7 +56,7 @@ public class ReferenceOrderedData implements if (Types.containsKey(boundName)) { throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName)); } - logger.info(String.format("* Adding rod class %s%n", name)); + logger.info(String.format("* Adding rod class %s", name)); Types.put(boundName, new RODBinding(name, rodType)); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index ee6418726..077a5d7ba 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -41,6 +41,8 @@ public abstract class GenotypeLikelihoods implements Cloneable { private final static int FIXED_PLOIDY = 2; private final static int MAX_PLOIDY = FIXED_PLOIDY + 1; + protected boolean enableCacheFlag = true; + // // The three fundamental data arrays associated with a Genotype Likelhoods object // @@ -175,8 +177,12 @@ public abstract class GenotypeLikelihoods implements Cloneable { * * @return true if caching should be enabled, false otherwise */ - protected boolean enableCache() { - return true; + public boolean cacheIsEnabled() { + return enableCacheFlag; + } + + public void setEnableCacheFlag(boolean enable) { + enableCacheFlag = enable; } /** @@ -247,7 +253,7 @@ public abstract class GenotypeLikelihoods implements Cloneable { if ( qualityScore > getMinQScoreToInclude() ) { // Handle caching if requested. Just look up the cached result if its available, or compute and store it - boolean enableCache = enableCacheArg && enableCache(); + boolean enableCache = enableCacheArg && cacheIsEnabled(); GenotypeLikelihoods cached = null; if ( enableCache ) { if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java index 4ba7c8d9e..60fea0d3e 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java @@ -72,6 +72,8 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li @Override public boolean equals(Object other) { + lazyEval(); + if (other == null) return false; if (other instanceof SSGenotypeCall) { @@ -89,11 +91,29 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li } public String toString() { - return String.format("%s ref=%s depth=%d rmsMAPQ=%.2f likelihoods=%s", - getLocation(), mRefBase, mPileup.getReads().size(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods())); + lazyEval(); + return String.format("%s best=%s cmp=%s ref=%s depth=%d negLog10PError = %.2f, likelihoods=%s", + getLocation(), mGenotype, mCompareTo, mRefBase, mPileup.getReads().size(), + getNegLog10PError(), Arrays.toString(mGenotypeLikelihoods.getLikelihoods())); } + private void lazyEval() { + // us + if (mGenotype == null) { + Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors()); + mGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; + } + // our comparison + if (mCompareTo == null) { + if (this.mBestVrsRef) { + mCompareTo = DiploidGenotype.valueOf(Utils.dupString(this.getReference(),2)); + } else { + Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors()); + mCompareTo = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]]; + } + } + } /** @@ -112,10 +132,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li * get the best genotype */ public DiploidGenotype getBestGenotype() { - if (mGenotype == null) { - Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors()); - mGenotype = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 1]]; - } + lazyEval(); return mGenotype; } @@ -123,14 +140,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li * get the alternate genotype */ public DiploidGenotype getAltGenotype() { - if (mCompareTo == null) { - if (this.mBestVrsRef) { - mCompareTo = DiploidGenotype.valueOf(Utils.dupString(this.getReference(),2)); - } else { - Integer sorted[] = Utils.SortPermutation(mGenotypeLikelihoods.getPosteriors()); - mCompareTo = DiploidGenotype.values()[sorted[DiploidGenotype.values().length - 2]]; - } - } + lazyEval(); return mCompareTo; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index de8dc4c49..224b397ae 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -44,6 +44,10 @@ public class SingleSampleGenotyper extends LocusWalker %s PASSED", name)); + } + + return filemd5sum; + } catch ( Exception e ) { + throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile); + } + } + + public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s ) { + List md5s = new ArrayList(); + for ( int i = 0; i < resultFiles.size(); i++ ) { + String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); + md5s.add(i, md5); + } + + return md5s; + } + + + public static byte[] getBytesFromFile(File file) throws IOException { + InputStream is = new FileInputStream(file); + + // Get the size of the file + long length = file.length(); + + if (length > Integer.MAX_VALUE) { + // File is too large + } + + // Create the byte array to hold the data + byte[] bytes = new byte[(int)length]; + + // Read in the bytes + int offset = 0; + int numRead = 0; + while (offset < bytes.length + && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { + offset += numRead; + } + + // Ensure all the bytes have been read in + if (offset < bytes.length) { + throw new IOException("Could not completely read file "+file.getName()); + } + + // Close the input stream and return bytes + is.close(); + return bytes; + } + + public class WalkerTestSpec { + String args = ""; + int nOutputFiles = -1; + List md5s = null; + + public WalkerTestSpec(String args, int nOutputFiles, List md5s) { + this.args = args; + this.nOutputFiles = nOutputFiles; + this.md5s = md5s; + } + } + + protected boolean parameterize() { + return false; + } + + protected List executeTest(final String name, WalkerTestSpec spec) { + List tmpFiles = new ArrayList(); + for ( int i = 0; i < spec.nOutputFiles; i++ ) { + try { + tmpFiles.add( File.createTempFile(String.format("walktest.tmp_param.%d", i), ".tmp" ) ); + } catch (IOException ex) { + System.err.println("Cannot create temp file: " + ex.getMessage()); + } + } + + final String args = String.format(spec.args, tmpFiles.toArray()); + logger.warn(Utils.dupString('-', 80)); + logger.warn(String.format("Executing test %s with GATK arguments: %s", name, args)); + CommandLineGATK instance = new CommandLineGATK(); + CommandLineExecutable.start(instance, args.split(" ")); + + return assertMatchingMD5s(name, tmpFiles, spec.md5s); + } + + @Test + public void testWalkerTest() { + logger.warn("WalkerTest is just a framework"); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java index 4ddcd3f44..06c912f32 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java @@ -1,140 +1,164 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.genotype.Variant; import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; +import org.broadinstitute.sting.utils.cmdLine.Argument; import org.junit.Test; import org.broadinstitute.sting.gatk.GATKArgumentCollection; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.executive.Accumulator; import java.io.*; -import java.util.Arrays; -import java.util.EnumMap; +import java.util.*; import java.security.MessageDigest; import java.math.BigInteger; import junit.framework.Assert; -public class SingleSampleGenotyperTest extends BaseTest { - private final static double DELTA = 1e-8; - - private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam"; - private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam"; - private static String gatkargs(final String bam) { - final String GATKArgs = - "\n" + - " \n" + - " %s\n" + - " \n" + - " /broad/1KG/reference/human_b36_both.fasta\n" + - " SingleSampleGenotyper\n" + - ""; - return String.format(GATKArgs, bam); +public class SingleSampleGenotyperTest extends WalkerTest { + public static String baseTestString() { + return "-T SingleSampleGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s"; } - private static class ExpectedResult { - long nCalls; - String md5sum; - public ExpectedResult(long calls, String md5) { - this.nCalls = calls; - this.md5sum = md5; - } + public static String testGeliLod5() { + return baseTestString() + " --variant_output_format GELI -lod 5"; } - public static EnumMap expectedOutput = new EnumMap(BaseMismatchModel.class); + private static String OneMb1StateMD5 = "d5404668e76f206055f03d97162ea6d9"; + private static String OneMb3StateMD5 = "46fb7b66da3dac341e9c342f751d74cd"; + private static String OneMbEmpiricalMD5 = "ea0be2fd074a6c824a0670ad5b3e0aca"; - static { - expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9")); - expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd")); - expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca")); - } - - //private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected - - public static long nExpectedCalls(BaseMismatchModel model) { - return expectedOutput.get(model).nCalls; - } - -// public static double nExpectedCallsTolerance(long nBasesCalled) { -// return nExpectedCalls(nBasesCalled) * callingTolerance; +// private static String oneMbMD5(BaseMismatchModel m) { +// switch (m) { +// case ONE_STATE: return OneMb1StateMD5; +// case THREE_STATE: return OneMb3StateMD5; +// case EMPIRICAL: return OneMbEmpiricalMD5; +// default: throw new RuntimeException("Unexpected BaseMismatchModel " + m); +// } // } - public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) { - Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls); - } + // Uncomment to not check outputs against expectations + //protected boolean parameterize() { + // return true; + //} - public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) { - try { - byte[] bytesOfMessage = getBytesFromFile(resultsFile); - byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage); - BigInteger bigInt = new BigInteger(1, thedigest); - String filemd5sum = bigInt.toString(16); - Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum); - } catch ( Exception e ) { - throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile); + // -------------------------------------------------------------------------------------------------------------- + // + // testing the cache + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testCache() { + for ( BaseMismatchModel model : BaseMismatchModel.values() ) { + // calculated the expected value without the cache enabled + WalkerTest.WalkerTestSpec withoutCacheSpec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-10,100,000 --disableCache -m " + model.toString(), 1, + Arrays.asList("")); + List withoutCache = executeTest("empirical1MbTest", withoutCacheSpec ); + + WalkerTest.WalkerTestSpec withCacheSpec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-10,100,000 -m " + model.toString(), 1, + withoutCache); + executeTest(String.format("testCache[%s]", model), withCacheSpec ); } } - public static byte[] getBytesFromFile(File file) throws IOException { - InputStream is = new FileInputStream(file); - - // Get the size of the file - long length = file.length(); - - if (length > Integer.MAX_VALUE) { - // File is too large - } - - // Create the byte array to hold the data - byte[] bytes = new byte[(int)length]; - - // Read in the bytes - int offset = 0; - int numRead = 0; - while (offset < bytes.length - && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { - offset += numRead; - } - - // Ensure all the bytes have been read in - if (offset < bytes.length) { - throw new IOException("Could not completely read file "+file.getName()); - } - - // Close the input stream and return bytes - is.close(); - return bytes; + // -------------------------------------------------------------------------------------------------------------- + // + // testing genotype mode + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void genotypeTest() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-10,100,000 -m empirical --genotype", 1, + Arrays.asList("7e5dec6481bbbc890493925da9a8f691")); + executeTest("genotypeTest", spec); } - private void oneMBCallTest(BaseMismatchModel model) { - logger.warn("Executing oneMBCallTest for " + model); + // -------------------------------------------------------------------------------------------------------------- + // + // basic base calling models + // + // -------------------------------------------------------------------------------------------------------------- - InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes()); - GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream); - mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000"); - - SingleSampleGenotyper SSG = new SingleSampleGenotyper(); - SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; - SSG.LOD_THRESHOLD = 5.0; - - try { - SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" ); - } catch (IOException ex) { - System.err.println("Cannot create temp file: " + ex.getMessage()); - } - SSG.baseModel = model; - - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel ); - Accumulator obj = (Accumulator)engine.execute(mycoll, SSG); - SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal(); - logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases)); - assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE); - assertGoodNumberOfCalls("testBasic", model, calls); + @Test + public void oneState100bpTest() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( testGeliLod5() + " -L 1:10,000,000-10,000,100 -m one_state", 1, Arrays.asList("3cd402d889c015be4a318123468f4262")); + executeTest("oneState100bpTest", spec); } - @Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); } - @Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); } - @Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); } + @Test + public void oneState1MbTest() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-11,000,000 -m one_state", + 1, Arrays.asList(OneMb1StateMD5)); + executeTest("oneState1MbTest", spec); + } + + @Test + public void threeState1MbTest() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-11,000,000 -m three_state", 1, + Arrays.asList(OneMb3StateMD5)); + executeTest("threeState1MbTest", spec); + } + + @Test + public void empirical1MbTest() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-11,000,000 -m empirical", 1, + Arrays.asList(OneMbEmpiricalMD5)); + executeTest("empirical1MbTest", spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing output formats + // + // -------------------------------------------------------------------------------------------------------------- + + //@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File format to be used", required = false) + //public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing LOD thresholding + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testLOD() { + HashMap e = new HashMap(); + e.put( 10.0, "e4c51dca6f1fa999f4399b7412829534" ); + e.put( 3.0, "d804c24d49669235e3660e92e664ba1a" ); + + for ( Map.Entry entry : e.entrySet() ) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -m EMPIRICAL -lod " + entry.getKey(), 1, + Arrays.asList(entry.getValue())); + executeTest("testLOD", spec); + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing hetero setting + // + // -------------------------------------------------------------------------------------------------------------- + @Test + public void testHeterozyosity() { + HashMap e = new HashMap(); + e.put( 0.01, "ca9986b32aac0d6ad6058f4bf10e7df2" ); + e.put( 0.0001, "55d4e3e73215b70b22a8e689a4e16d37" ); + e.put( 1.0 / 1850, "1ae2126f1a6490d6edd15d95bce726c4" ); + + for ( Map.Entry entry : e.entrySet() ) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + testGeliLod5() + " -L 1:10,000,000-11,000,000 -m EMPIRICAL --heterozygosity " + entry.getKey(), 1, + Arrays.asList(entry.getValue())); + executeTest(String.format("testHeterozyosity[%s]", entry.getKey()), spec); + } + } } \ No newline at end of file