Renaming error to getNegLog10PError(); added Cached clearing method to GL; SSG now has a CallResult that counts calls; No more Adding class to System.out, now to logger.info; First major testing piece (and general approach too) to unit testing of a walker -- SingleSampleGenotyper now knows how many calls to make on a particular 1mb region on NA12878 for each call type and counts the number of calls *AND* the compares the geli MD5 sum to the expected one!

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1530 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-09-04 12:39:06 +00:00
parent 3c2ae55859
commit a08c68362e
8 changed files with 192 additions and 15 deletions

View File

@ -56,7 +56,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
if (Types.containsKey(boundName)) {
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
}
System.out.printf("* Adding rod class %s%n", name);
logger.info(String.format("* Adding rod class %s%n", name));
Types.put(boundName, new RODBinding(name, rodType));
}

View File

@ -297,10 +297,15 @@ public abstract class GenotypeLikelihoods implements Cloneable {
//
//
// -----------------------------------------------------------------------------------------------------------------
final static GenotypeLikelihoods[][][][][] CACHE =
static GenotypeLikelihoods[][][][][] CACHE =
new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
static int cacheSize = 0;
public static void clearCache() {
CACHE = new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
cacheSize = 0;
}
private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
SAMRecord read, int offset, GenotypeLikelihoods val ) {

View File

@ -103,7 +103,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
* @return get the one minus error value
*/
@Override
public double getLog10PError() {
public double getNegLog10PError() {
getBestGenotype();
getAltGenotype();
return Math.abs(mGenotypeLikelihoods.getPosterior(mGenotype) - mGenotypeLikelihoods.getPosterior(mCompareTo));

View File

@ -16,7 +16,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import java.io.File;
@ReadFilters(ZeroMappingQualityReadFilter.class)
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeWriter> {
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, SingleSampleGenotyper.CallResult> {
// Control output settings
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
public File VARIANTS_FILE = null;
@ -26,7 +26,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
// Control what goes into the variants file and what format that file should have
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
public Double LOD_THRESHOLD = Double.MIN_VALUE;
public double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
public boolean GENOTYPE = false;
@ -44,6 +44,22 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
public class CallResult {
long nConfidentCalls = 0;
long nNonConfidentCalls = 0;
long nCalledBases = 0;
GenotypeWriter writer;
CallResult(GenotypeWriter writer) {
this.writer = writer;
}
public String toString() {
return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
nConfidentCalls, nNonConfidentCalls, nCalledBases);
}
}
/**
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
*
@ -59,6 +75,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
/** Initialize the walker with some sensible defaults */
public void initialize() {
GenotypeLikelihoods.clearCache();
// nothing to do
}
@ -111,11 +128,11 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
*
* @return an empty string
*/
public GenotypeWriter reduceInit() {
public CallResult reduceInit() {
if ( VARIANTS_FILE != null )
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE);
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE));
else
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out);
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out));
}
/**
@ -126,18 +143,24 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
*
* @return an empty string
*/
public GenotypeWriter reduce(SSGenotypeCall call, GenotypeWriter sum) {
public CallResult reduce(SSGenotypeCall call, CallResult sum) {
sum.nCalledBases++;
if (call != null && (GENOTYPE || call.isVariant(call.getReference()))) {
if (call.getLog10PError() >= LOD_THRESHOLD) {
sum.addGenotypeCall(call);
if (call.getNegLog10PError() >= LOD_THRESHOLD) {
sum.nConfidentCalls++;
//System.out.printf("Call %s%n", call);
sum.writer.addGenotypeCall(call);
} else {
sum.nNonConfidentCalls++;
}
}
return sum;
}
/** Close the variant file. */
public void onTraversalDone(GenotypeWriter sum) {
sum.close();
public void onTraversalDone(CallResult sum) {
sum.writer.close();
}
}

View File

@ -251,7 +251,7 @@ public class ArtificialPoolContext {
public String genotypeAndConfidenceToString(int group, String spacer) {
Genotype call = this.getGenotype(group);
return (call.getBases() + spacer + call.getLog10PError()); // TODO: fix me
return (call.getBases() + spacer + call.getNegLog10PError()); // TODO: fix me
}
public Genotype getGenotype(int group) {

View File

@ -15,7 +15,7 @@ public interface Genotype {
*
* @return the log based error estimate
*/
public double getLog10PError();
public double getNegLog10PError();
/**
* get the bases that represent this

View File

@ -0,0 +1,140 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.genotype.Variant;
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
import org.junit.Test;
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.executive.Accumulator;
import java.io.*;
import java.util.Arrays;
import java.util.EnumMap;
import java.security.MessageDigest;
import java.math.BigInteger;
import junit.framework.Assert;
public class SingleSampleGenotyperTest extends BaseTest {
private final static double DELTA = 1e-8;
private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam";
private static String gatkargs(final String bam) {
final String GATKArgs =
"<GATK-argument-collection>\n" +
" <sam-files class=\"java.util.ArrayList\">\n" +
" <file>%s</file>\n" +
" </sam-files>\n" +
" <reference-file>/broad/1KG/reference/human_b36_both.fasta</reference-file>\n" +
" <analysis-name>SingleSampleGenotyper</analysis-name>\n" +
"</GATK-argument-collection>";
return String.format(GATKArgs, bam);
}
private static class ExpectedResult {
long nCalls;
String md5sum;
public ExpectedResult(long calls, String md5) {
this.nCalls = calls;
this.md5sum = md5;
}
}
public static EnumMap<BaseMismatchModel, ExpectedResult> expectedOutput = new EnumMap<BaseMismatchModel, ExpectedResult>(BaseMismatchModel.class);
static {
expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9"));
expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd"));
expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca"));
}
//private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected
public static long nExpectedCalls(BaseMismatchModel model) {
return expectedOutput.get(model).nCalls;
}
// public static double nExpectedCallsTolerance(long nBasesCalled) {
// return nExpectedCalls(nBasesCalled) * callingTolerance;
// }
public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) {
Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls);
}
public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) {
try {
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
BigInteger bigInt = new BigInteger(1, thedigest);
String filemd5sum = bigInt.toString(16);
Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum);
} catch ( Exception e ) {
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile);
}
}
public static byte[] getBytesFromFile(File file) throws IOException {
InputStream is = new FileInputStream(file);
// Get the size of the file
long length = file.length();
if (length > Integer.MAX_VALUE) {
// File is too large
}
// Create the byte array to hold the data
byte[] bytes = new byte[(int)length];
// Read in the bytes
int offset = 0;
int numRead = 0;
while (offset < bytes.length
&& (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) {
offset += numRead;
}
// Ensure all the bytes have been read in
if (offset < bytes.length) {
throw new IOException("Could not completely read file "+file.getName());
}
// Close the input stream and return bytes
is.close();
return bytes;
}
private void oneMBCallTest(BaseMismatchModel model) {
logger.warn("Executing oneMBCallTest for " + model);
InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes());
GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream);
mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000");
SingleSampleGenotyper SSG = new SingleSampleGenotyper();
SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
SSG.LOD_THRESHOLD = 5.0;
try {
SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" );
} catch (IOException ex) {
System.err.println("Cannot create temp file: " + ex.getMessage());
}
SSG.baseModel = model;
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel );
Accumulator obj = (Accumulator)engine.execute(mycoll, SSG);
SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal();
logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases));
assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE);
assertGoodNumberOfCalls("testBasic", model, calls);
}
@Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); }
@Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); }
@Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); }
}

View File

@ -0,0 +1,9 @@
#!/bin/tcsh
foreach model (ONE_STATE THREE_STATE EMPIRICAL)
java -Xmx4096m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:10,000,000-11,000,000 -R /broad/1KG/reference/human_b36_both.fasta -T SingleSampleGenotyper -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout ./$model.geli.calls -lod 5 -m $model
end
wc -l *.geli.calls
md5sum *.geli.calls