From a08c68362ebbded0b6b639728e5e78a2fa7b73b0 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 4 Sep 2009 12:39:06 +0000 Subject: [PATCH] Renaming error to getNegLog10PError(); added Cached clearing method to GL; SSG now has a CallResult that counts calls; No more Adding class to System.out, now to logger.info; First major testing piece (and general approach too) to unit testing of a walker -- SingleSampleGenotyper now knows how many calls to make on a particular 1mb region on NA12878 for each call type and counts the number of calls *AND* the compares the geli MD5 sum to the expected one! git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1530 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/ReferenceOrderedData.java | 2 +- .../genotyper/GenotypeLikelihoods.java | 7 +- .../walkers/genotyper/SSGenotypeCall.java | 2 +- .../genotyper/SingleSampleGenotyper.java | 43 ++++-- .../utils/ArtificialPoolContext.java | 2 +- .../sting/utils/genotype/Genotype.java | 2 +- .../genotyper/SingleSampleGenotyperTest.java | 140 ++++++++++++++++++ shell/calcCallerMD5s.csh | 9 ++ 8 files changed, 192 insertions(+), 15 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java create mode 100755 shell/calcCallerMD5s.csh diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 7bb4ea379..496d5a0db 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -56,7 +56,7 @@ public class ReferenceOrderedData implements if (Types.containsKey(boundName)) { throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName)); } - System.out.printf("* Adding rod class %s%n", name); + logger.info(String.format("* Adding rod class %s%n", name)); Types.put(boundName, new RODBinding(name, rodType)); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 3155412eb..ee6418726 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -297,10 +297,15 @@ public abstract class GenotypeLikelihoods implements Cloneable { // // // ----------------------------------------------------------------------------------------------------------------- - final static GenotypeLikelihoods[][][][][] CACHE = + static GenotypeLikelihoods[][][][][] CACHE = new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2]; static int cacheSize = 0; + public static void clearCache() { + CACHE = new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2]; + cacheSize = 0; + } + private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, GenotypeLikelihoods val ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java index 8aa33f7d4..802fd8895 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGenotypeCall.java @@ -103,7 +103,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li * @return get the one minus error value */ @Override - public double getLog10PError() { + public double getNegLog10PError() { getBestGenotype(); getAltGenotype(); return Math.abs(mGenotypeLikelihoods.getPosterior(mGenotype) - mGenotypeLikelihoods.getPosterior(mCompareTo)); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index dd95c92a4..de8dc4c49 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -16,7 +16,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; import java.io.File; @ReadFilters(ZeroMappingQualityReadFilter.class) -public class SingleSampleGenotyper extends LocusWalker { +public class SingleSampleGenotyper extends LocusWalker { // Control output settings @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) public File VARIANTS_FILE = null; @@ -26,7 +26,7 @@ public class SingleSampleGenotyper extends LocusWalker= LOD_THRESHOLD) { - sum.addGenotypeCall(call); + if (call.getNegLog10PError() >= LOD_THRESHOLD) { + sum.nConfidentCalls++; + //System.out.printf("Call %s%n", call); + sum.writer.addGenotypeCall(call); + } else { + sum.nNonConfidentCalls++; } } return sum; } /** Close the variant file. */ - public void onTraversalDone(GenotypeWriter sum) { - sum.close(); + public void onTraversalDone(CallResult sum) { + sum.writer.close(); } } diff --git a/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java index 13b8854d7..90fe0e3e4 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java +++ b/java/src/org/broadinstitute/sting/playground/utils/ArtificialPoolContext.java @@ -251,7 +251,7 @@ public class ArtificialPoolContext { public String genotypeAndConfidenceToString(int group, String spacer) { Genotype call = this.getGenotype(group); - return (call.getBases() + spacer + call.getLog10PError()); // TODO: fix me + return (call.getBases() + spacer + call.getNegLog10PError()); // TODO: fix me } public Genotype getGenotype(int group) { diff --git a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java index 0d4e64aa3..5c62f2d75 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/Genotype.java @@ -15,7 +15,7 @@ public interface Genotype { * * @return the log based error estimate */ - public double getLog10PError(); + public double getNegLog10PError(); /** * get the bases that represent this diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java new file mode 100755 index 000000000..4ddcd3f44 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyperTest.java @@ -0,0 +1,140 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.genotype.Variant; +import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; +import org.junit.Test; +import org.broadinstitute.sting.gatk.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.executive.Accumulator; + +import java.io.*; +import java.util.Arrays; +import java.util.EnumMap; +import java.security.MessageDigest; +import java.math.BigInteger; + +import junit.framework.Assert; + +public class SingleSampleGenotyperTest extends BaseTest { + private final static double DELTA = 1e-8; + + private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam"; + private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam"; + private static String gatkargs(final String bam) { + final String GATKArgs = + "\n" + + " \n" + + " %s\n" + + " \n" + + " /broad/1KG/reference/human_b36_both.fasta\n" + + " SingleSampleGenotyper\n" + + ""; + return String.format(GATKArgs, bam); + } + + private static class ExpectedResult { + long nCalls; + String md5sum; + public ExpectedResult(long calls, String md5) { + this.nCalls = calls; + this.md5sum = md5; + } + } + + public static EnumMap expectedOutput = new EnumMap(BaseMismatchModel.class); + + static { + expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9")); + expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd")); + expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca")); + } + + //private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected + + public static long nExpectedCalls(BaseMismatchModel model) { + return expectedOutput.get(model).nCalls; + } + +// public static double nExpectedCallsTolerance(long nBasesCalled) { +// return nExpectedCalls(nBasesCalled) * callingTolerance; +// } + + public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) { + Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls); + } + + public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) { + try { + byte[] bytesOfMessage = getBytesFromFile(resultsFile); + byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage); + BigInteger bigInt = new BigInteger(1, thedigest); + String filemd5sum = bigInt.toString(16); + Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum); + } catch ( Exception e ) { + throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile); + } + } + + public static byte[] getBytesFromFile(File file) throws IOException { + InputStream is = new FileInputStream(file); + + // Get the size of the file + long length = file.length(); + + if (length > Integer.MAX_VALUE) { + // File is too large + } + + // Create the byte array to hold the data + byte[] bytes = new byte[(int)length]; + + // Read in the bytes + int offset = 0; + int numRead = 0; + while (offset < bytes.length + && (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) { + offset += numRead; + } + + // Ensure all the bytes have been read in + if (offset < bytes.length) { + throw new IOException("Could not completely read file "+file.getName()); + } + + // Close the input stream and return bytes + is.close(); + return bytes; + } + + private void oneMBCallTest(BaseMismatchModel model) { + logger.warn("Executing oneMBCallTest for " + model); + + InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes()); + GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream); + mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000"); + + SingleSampleGenotyper SSG = new SingleSampleGenotyper(); + SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; + SSG.LOD_THRESHOLD = 5.0; + + try { + SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" ); + } catch (IOException ex) { + System.err.println("Cannot create temp file: " + ex.getMessage()); + } + SSG.baseModel = model; + + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel ); + Accumulator obj = (Accumulator)engine.execute(mycoll, SSG); + SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal(); + logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases)); + assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE); + assertGoodNumberOfCalls("testBasic", model, calls); + } + + @Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); } + @Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); } + @Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); } +} \ No newline at end of file diff --git a/shell/calcCallerMD5s.csh b/shell/calcCallerMD5s.csh new file mode 100755 index 000000000..c1b2eed21 --- /dev/null +++ b/shell/calcCallerMD5s.csh @@ -0,0 +1,9 @@ +#!/bin/tcsh + +foreach model (ONE_STATE THREE_STATE EMPIRICAL) +java -Xmx4096m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:10,000,000-11,000,000 -R /broad/1KG/reference/human_b36_both.fasta -T SingleSampleGenotyper -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout ./$model.geli.calls -lod 5 -m $model +end +wc -l *.geli.calls +md5sum *.geli.calls + +