Renaming error to getNegLog10PError(); added Cached clearing method to GL; SSG now has a CallResult that counts calls; No more Adding class to System.out, now to logger.info; First major testing piece (and general approach too) to unit testing of a walker -- SingleSampleGenotyper now knows how many calls to make on a particular 1mb region on NA12878 for each call type and counts the number of calls *AND* the compares the geli MD5 sum to the expected one!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1530 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3c2ae55859
commit
a08c68362e
|
|
@ -56,7 +56,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
if (Types.containsKey(boundName)) {
|
||||
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
|
||||
}
|
||||
System.out.printf("* Adding rod class %s%n", name);
|
||||
logger.info(String.format("* Adding rod class %s%n", name));
|
||||
Types.put(boundName, new RODBinding(name, rodType));
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -297,10 +297,15 @@ public abstract class GenotypeLikelihoods implements Cloneable {
|
|||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
final static GenotypeLikelihoods[][][][][] CACHE =
|
||||
static GenotypeLikelihoods[][][][][] CACHE =
|
||||
new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||
static int cacheSize = 0;
|
||||
|
||||
public static void clearCache() {
|
||||
CACHE = new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||
cacheSize = 0;
|
||||
}
|
||||
|
||||
private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, int offset, GenotypeLikelihoods val ) {
|
||||
|
||||
|
|
|
|||
|
|
@ -103,7 +103,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
|
|||
* @return get the one minus error value
|
||||
*/
|
||||
@Override
|
||||
public double getLog10PError() {
|
||||
public double getNegLog10PError() {
|
||||
getBestGenotype();
|
||||
getAltGenotype();
|
||||
return Math.abs(mGenotypeLikelihoods.getPosterior(mGenotype) - mGenotypeLikelihoods.getPosterior(mCompareTo));
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
|||
import java.io.File;
|
||||
|
||||
@ReadFilters(ZeroMappingQualityReadFilter.class)
|
||||
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeWriter> {
|
||||
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, SingleSampleGenotyper.CallResult> {
|
||||
// Control output settings
|
||||
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
|
||||
public File VARIANTS_FILE = null;
|
||||
|
|
@ -26,7 +26,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
|||
|
||||
// Control what goes into the variants file and what format that file should have
|
||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
||||
public Double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||
|
||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||
public boolean GENOTYPE = false;
|
||||
|
|
@ -44,6 +44,22 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
|||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||
|
||||
public class CallResult {
|
||||
long nConfidentCalls = 0;
|
||||
long nNonConfidentCalls = 0;
|
||||
long nCalledBases = 0;
|
||||
GenotypeWriter writer;
|
||||
|
||||
CallResult(GenotypeWriter writer) {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
|
||||
nConfidentCalls, nNonConfidentCalls, nCalledBases);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
|
||||
*
|
||||
|
|
@ -59,6 +75,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
|||
|
||||
/** Initialize the walker with some sensible defaults */
|
||||
public void initialize() {
|
||||
GenotypeLikelihoods.clearCache();
|
||||
// nothing to do
|
||||
}
|
||||
|
||||
|
|
@ -111,11 +128,11 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
|||
*
|
||||
* @return an empty string
|
||||
*/
|
||||
public GenotypeWriter reduceInit() {
|
||||
public CallResult reduceInit() {
|
||||
if ( VARIANTS_FILE != null )
|
||||
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE);
|
||||
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE));
|
||||
else
|
||||
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out);
|
||||
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -126,18 +143,24 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
|||
*
|
||||
* @return an empty string
|
||||
*/
|
||||
public GenotypeWriter reduce(SSGenotypeCall call, GenotypeWriter sum) {
|
||||
public CallResult reduce(SSGenotypeCall call, CallResult sum) {
|
||||
sum.nCalledBases++;
|
||||
|
||||
if (call != null && (GENOTYPE || call.isVariant(call.getReference()))) {
|
||||
if (call.getLog10PError() >= LOD_THRESHOLD) {
|
||||
sum.addGenotypeCall(call);
|
||||
if (call.getNegLog10PError() >= LOD_THRESHOLD) {
|
||||
sum.nConfidentCalls++;
|
||||
//System.out.printf("Call %s%n", call);
|
||||
sum.writer.addGenotypeCall(call);
|
||||
} else {
|
||||
sum.nNonConfidentCalls++;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/** Close the variant file. */
|
||||
public void onTraversalDone(GenotypeWriter sum) {
|
||||
sum.close();
|
||||
public void onTraversalDone(CallResult sum) {
|
||||
sum.writer.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ public class ArtificialPoolContext {
|
|||
|
||||
public String genotypeAndConfidenceToString(int group, String spacer) {
|
||||
Genotype call = this.getGenotype(group);
|
||||
return (call.getBases() + spacer + call.getLog10PError()); // TODO: fix me
|
||||
return (call.getBases() + spacer + call.getNegLog10PError()); // TODO: fix me
|
||||
}
|
||||
|
||||
public Genotype getGenotype(int group) {
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ public interface Genotype {
|
|||
*
|
||||
* @return the log based error estimate
|
||||
*/
|
||||
public double getLog10PError();
|
||||
public double getNegLog10PError();
|
||||
|
||||
/**
|
||||
* get the bases that represent this
|
||||
|
|
|
|||
|
|
@ -0,0 +1,140 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.genotype.Variant;
|
||||
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||
import org.junit.Test;
|
||||
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.executive.Accumulator;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumMap;
|
||||
import java.security.MessageDigest;
|
||||
import java.math.BigInteger;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
public class SingleSampleGenotyperTest extends BaseTest {
|
||||
private final static double DELTA = 1e-8;
|
||||
|
||||
private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
|
||||
private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam";
|
||||
private static String gatkargs(final String bam) {
|
||||
final String GATKArgs =
|
||||
"<GATK-argument-collection>\n" +
|
||||
" <sam-files class=\"java.util.ArrayList\">\n" +
|
||||
" <file>%s</file>\n" +
|
||||
" </sam-files>\n" +
|
||||
" <reference-file>/broad/1KG/reference/human_b36_both.fasta</reference-file>\n" +
|
||||
" <analysis-name>SingleSampleGenotyper</analysis-name>\n" +
|
||||
"</GATK-argument-collection>";
|
||||
return String.format(GATKArgs, bam);
|
||||
}
|
||||
|
||||
private static class ExpectedResult {
|
||||
long nCalls;
|
||||
String md5sum;
|
||||
public ExpectedResult(long calls, String md5) {
|
||||
this.nCalls = calls;
|
||||
this.md5sum = md5;
|
||||
}
|
||||
}
|
||||
|
||||
public static EnumMap<BaseMismatchModel, ExpectedResult> expectedOutput = new EnumMap<BaseMismatchModel, ExpectedResult>(BaseMismatchModel.class);
|
||||
|
||||
static {
|
||||
expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9"));
|
||||
expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd"));
|
||||
expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca"));
|
||||
}
|
||||
|
||||
//private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected
|
||||
|
||||
public static long nExpectedCalls(BaseMismatchModel model) {
|
||||
return expectedOutput.get(model).nCalls;
|
||||
}
|
||||
|
||||
// public static double nExpectedCallsTolerance(long nBasesCalled) {
|
||||
// return nExpectedCalls(nBasesCalled) * callingTolerance;
|
||||
// }
|
||||
|
||||
public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) {
|
||||
Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls);
|
||||
}
|
||||
|
||||
public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) {
|
||||
try {
|
||||
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
|
||||
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
|
||||
BigInteger bigInt = new BigInteger(1, thedigest);
|
||||
String filemd5sum = bigInt.toString(16);
|
||||
Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum);
|
||||
} catch ( Exception e ) {
|
||||
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile);
|
||||
}
|
||||
}
|
||||
|
||||
public static byte[] getBytesFromFile(File file) throws IOException {
|
||||
InputStream is = new FileInputStream(file);
|
||||
|
||||
// Get the size of the file
|
||||
long length = file.length();
|
||||
|
||||
if (length > Integer.MAX_VALUE) {
|
||||
// File is too large
|
||||
}
|
||||
|
||||
// Create the byte array to hold the data
|
||||
byte[] bytes = new byte[(int)length];
|
||||
|
||||
// Read in the bytes
|
||||
int offset = 0;
|
||||
int numRead = 0;
|
||||
while (offset < bytes.length
|
||||
&& (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) {
|
||||
offset += numRead;
|
||||
}
|
||||
|
||||
// Ensure all the bytes have been read in
|
||||
if (offset < bytes.length) {
|
||||
throw new IOException("Could not completely read file "+file.getName());
|
||||
}
|
||||
|
||||
// Close the input stream and return bytes
|
||||
is.close();
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private void oneMBCallTest(BaseMismatchModel model) {
|
||||
logger.warn("Executing oneMBCallTest for " + model);
|
||||
|
||||
InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes());
|
||||
GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream);
|
||||
mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000");
|
||||
|
||||
SingleSampleGenotyper SSG = new SingleSampleGenotyper();
|
||||
SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||
SSG.LOD_THRESHOLD = 5.0;
|
||||
|
||||
try {
|
||||
SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" );
|
||||
} catch (IOException ex) {
|
||||
System.err.println("Cannot create temp file: " + ex.getMessage());
|
||||
}
|
||||
SSG.baseModel = model;
|
||||
|
||||
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel );
|
||||
Accumulator obj = (Accumulator)engine.execute(mycoll, SSG);
|
||||
SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal();
|
||||
logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases));
|
||||
assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE);
|
||||
assertGoodNumberOfCalls("testBasic", model, calls);
|
||||
}
|
||||
|
||||
@Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); }
|
||||
@Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); }
|
||||
@Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); }
|
||||
}
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
#!/bin/tcsh
|
||||
|
||||
foreach model (ONE_STATE THREE_STATE EMPIRICAL)
|
||||
java -Xmx4096m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:10,000,000-11,000,000 -R /broad/1KG/reference/human_b36_both.fasta -T SingleSampleGenotyper -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout ./$model.geli.calls -lod 5 -m $model
|
||||
end
|
||||
wc -l *.geli.calls
|
||||
md5sum *.geli.calls
|
||||
|
||||
|
||||
Loading…
Reference in New Issue