Renaming error to getNegLog10PError(); added Cached clearing method to GL; SSG now has a CallResult that counts calls; No more Adding class to System.out, now to logger.info; First major testing piece (and general approach too) to unit testing of a walker -- SingleSampleGenotyper now knows how many calls to make on a particular 1mb region on NA12878 for each call type and counts the number of calls *AND* the compares the geli MD5 sum to the expected one!
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1530 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3c2ae55859
commit
a08c68362e
|
|
@ -56,7 +56,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
if (Types.containsKey(boundName)) {
|
if (Types.containsKey(boundName)) {
|
||||||
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
|
throw new RuntimeException(String.format("GATK BUG: adding ROD module %s that is already bound", boundName));
|
||||||
}
|
}
|
||||||
System.out.printf("* Adding rod class %s%n", name);
|
logger.info(String.format("* Adding rod class %s%n", name));
|
||||||
Types.put(boundName, new RODBinding(name, rodType));
|
Types.put(boundName, new RODBinding(name, rodType));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -297,10 +297,15 @@ public abstract class GenotypeLikelihoods implements Cloneable {
|
||||||
//
|
//
|
||||||
//
|
//
|
||||||
// -----------------------------------------------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------------------------------------------
|
||||||
final static GenotypeLikelihoods[][][][][] CACHE =
|
static GenotypeLikelihoods[][][][][] CACHE =
|
||||||
new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||||
static int cacheSize = 0;
|
static int cacheSize = 0;
|
||||||
|
|
||||||
|
public static void clearCache() {
|
||||||
|
CACHE = new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||||
|
cacheSize = 0;
|
||||||
|
}
|
||||||
|
|
||||||
private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
|
private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
|
||||||
SAMRecord read, int offset, GenotypeLikelihoods val ) {
|
SAMRecord read, int offset, GenotypeLikelihoods val ) {
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -103,7 +103,7 @@ public class SSGenotypeCall implements Genotype, ReadBacked, GenotypesBacked, Li
|
||||||
* @return get the one minus error value
|
* @return get the one minus error value
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public double getLog10PError() {
|
public double getNegLog10PError() {
|
||||||
getBestGenotype();
|
getBestGenotype();
|
||||||
getAltGenotype();
|
getAltGenotype();
|
||||||
return Math.abs(mGenotypeLikelihoods.getPosterior(mGenotype) - mGenotypeLikelihoods.getPosterior(mCompareTo));
|
return Math.abs(mGenotypeLikelihoods.getPosterior(mGenotype) - mGenotypeLikelihoods.getPosterior(mCompareTo));
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
|
||||||
@ReadFilters(ZeroMappingQualityReadFilter.class)
|
@ReadFilters(ZeroMappingQualityReadFilter.class)
|
||||||
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeWriter> {
|
public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, SingleSampleGenotyper.CallResult> {
|
||||||
// Control output settings
|
// Control output settings
|
||||||
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
|
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false)
|
||||||
public File VARIANTS_FILE = null;
|
public File VARIANTS_FILE = null;
|
||||||
|
|
@ -26,7 +26,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
||||||
|
|
||||||
// Control what goes into the variants file and what format that file should have
|
// Control what goes into the variants file and what format that file should have
|
||||||
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
|
||||||
public Double LOD_THRESHOLD = Double.MIN_VALUE;
|
public double LOD_THRESHOLD = Double.MIN_VALUE;
|
||||||
|
|
||||||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes or just the variants?", required = false)
|
||||||
public boolean GENOTYPE = false;
|
public boolean GENOTYPE = false;
|
||||||
|
|
@ -44,6 +44,22 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
||||||
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
@Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false)
|
||||||
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
public EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform defaultPlatform = null;
|
||||||
|
|
||||||
|
public class CallResult {
|
||||||
|
long nConfidentCalls = 0;
|
||||||
|
long nNonConfidentCalls = 0;
|
||||||
|
long nCalledBases = 0;
|
||||||
|
GenotypeWriter writer;
|
||||||
|
|
||||||
|
CallResult(GenotypeWriter writer) {
|
||||||
|
this.writer = writer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return String.format("SSG: %d confident and %d non-confident calls were made at %d bases",
|
||||||
|
nConfidentCalls, nNonConfidentCalls, nCalledBases);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
|
* Filter out loci to ignore (at an ambiguous base in the reference or a locus with zero coverage).
|
||||||
*
|
*
|
||||||
|
|
@ -59,6 +75,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
||||||
|
|
||||||
/** Initialize the walker with some sensible defaults */
|
/** Initialize the walker with some sensible defaults */
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
GenotypeLikelihoods.clearCache();
|
||||||
// nothing to do
|
// nothing to do
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -111,11 +128,11 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
||||||
*
|
*
|
||||||
* @return an empty string
|
* @return an empty string
|
||||||
*/
|
*/
|
||||||
public GenotypeWriter reduceInit() {
|
public CallResult reduceInit() {
|
||||||
if ( VARIANTS_FILE != null )
|
if ( VARIANTS_FILE != null )
|
||||||
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE);
|
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), VARIANTS_FILE));
|
||||||
else
|
else
|
||||||
return GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out);
|
return new CallResult(GenotypeWriterFactory.create(VAR_FORMAT, GenomeAnalysisEngine.instance.getSAMFileHeader(), out));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -126,18 +143,24 @@ public class SingleSampleGenotyper extends LocusWalker<SSGenotypeCall, GenotypeW
|
||||||
*
|
*
|
||||||
* @return an empty string
|
* @return an empty string
|
||||||
*/
|
*/
|
||||||
public GenotypeWriter reduce(SSGenotypeCall call, GenotypeWriter sum) {
|
public CallResult reduce(SSGenotypeCall call, CallResult sum) {
|
||||||
|
sum.nCalledBases++;
|
||||||
|
|
||||||
if (call != null && (GENOTYPE || call.isVariant(call.getReference()))) {
|
if (call != null && (GENOTYPE || call.isVariant(call.getReference()))) {
|
||||||
if (call.getLog10PError() >= LOD_THRESHOLD) {
|
if (call.getNegLog10PError() >= LOD_THRESHOLD) {
|
||||||
sum.addGenotypeCall(call);
|
sum.nConfidentCalls++;
|
||||||
|
//System.out.printf("Call %s%n", call);
|
||||||
|
sum.writer.addGenotypeCall(call);
|
||||||
|
} else {
|
||||||
|
sum.nNonConfidentCalls++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Close the variant file. */
|
/** Close the variant file. */
|
||||||
public void onTraversalDone(GenotypeWriter sum) {
|
public void onTraversalDone(CallResult sum) {
|
||||||
sum.close();
|
sum.writer.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -251,7 +251,7 @@ public class ArtificialPoolContext {
|
||||||
|
|
||||||
public String genotypeAndConfidenceToString(int group, String spacer) {
|
public String genotypeAndConfidenceToString(int group, String spacer) {
|
||||||
Genotype call = this.getGenotype(group);
|
Genotype call = this.getGenotype(group);
|
||||||
return (call.getBases() + spacer + call.getLog10PError()); // TODO: fix me
|
return (call.getBases() + spacer + call.getNegLog10PError()); // TODO: fix me
|
||||||
}
|
}
|
||||||
|
|
||||||
public Genotype getGenotype(int group) {
|
public Genotype getGenotype(int group) {
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ public interface Genotype {
|
||||||
*
|
*
|
||||||
* @return the log based error estimate
|
* @return the log based error estimate
|
||||||
*/
|
*/
|
||||||
public double getLog10PError();
|
public double getNegLog10PError();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the bases that represent this
|
* get the bases that represent this
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,140 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.Variant;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.broadinstitute.sting.gatk.GATKArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.gatk.executive.Accumulator;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.security.MessageDigest;
|
||||||
|
import java.math.BigInteger;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
public class SingleSampleGenotyperTest extends BaseTest {
|
||||||
|
private final static double DELTA = 1e-8;
|
||||||
|
|
||||||
|
private static final String oneMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam";
|
||||||
|
private static final String fiveMB = "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_15_mb.SLX.bam";
|
||||||
|
private static String gatkargs(final String bam) {
|
||||||
|
final String GATKArgs =
|
||||||
|
"<GATK-argument-collection>\n" +
|
||||||
|
" <sam-files class=\"java.util.ArrayList\">\n" +
|
||||||
|
" <file>%s</file>\n" +
|
||||||
|
" </sam-files>\n" +
|
||||||
|
" <reference-file>/broad/1KG/reference/human_b36_both.fasta</reference-file>\n" +
|
||||||
|
" <analysis-name>SingleSampleGenotyper</analysis-name>\n" +
|
||||||
|
"</GATK-argument-collection>";
|
||||||
|
return String.format(GATKArgs, bam);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class ExpectedResult {
|
||||||
|
long nCalls;
|
||||||
|
String md5sum;
|
||||||
|
public ExpectedResult(long calls, String md5) {
|
||||||
|
this.nCalls = calls;
|
||||||
|
this.md5sum = md5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static EnumMap<BaseMismatchModel, ExpectedResult> expectedOutput = new EnumMap<BaseMismatchModel, ExpectedResult>(BaseMismatchModel.class);
|
||||||
|
|
||||||
|
static {
|
||||||
|
expectedOutput.put(BaseMismatchModel.ONE_STATE, new ExpectedResult(983L, "d5404668e76f206055f03d97162ea6d9"));
|
||||||
|
expectedOutput.put(BaseMismatchModel.THREE_STATE, new ExpectedResult(1055L, "46fb7b66da3dac341e9c342f751d74cd"));
|
||||||
|
expectedOutput.put(BaseMismatchModel.EMPIRICAL, new ExpectedResult(1070L, "ea0be2fd074a6c824a0670ad5b3e0aca"));
|
||||||
|
}
|
||||||
|
|
||||||
|
//private static double callingTolerance = 0.2; // I'm willing to tolerate +/- 10% variation in call rate from that expected
|
||||||
|
|
||||||
|
public static long nExpectedCalls(BaseMismatchModel model) {
|
||||||
|
return expectedOutput.get(model).nCalls;
|
||||||
|
}
|
||||||
|
|
||||||
|
// public static double nExpectedCallsTolerance(long nBasesCalled) {
|
||||||
|
// return nExpectedCalls(nBasesCalled) * callingTolerance;
|
||||||
|
// }
|
||||||
|
|
||||||
|
public static void assertGoodNumberOfCalls(final String name, BaseMismatchModel model, SingleSampleGenotyper.CallResult calls) {
|
||||||
|
Assert.assertEquals(name, nExpectedCalls(model), calls.nConfidentCalls);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void assertMatchingMD5(final String name, BaseMismatchModel model, final File resultsFile ) {
|
||||||
|
try {
|
||||||
|
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
|
||||||
|
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
|
||||||
|
BigInteger bigInt = new BigInteger(1, thedigest);
|
||||||
|
String filemd5sum = bigInt.toString(16);
|
||||||
|
Assert.assertEquals(name + "Mismatching MD5s", expectedOutput.get(model).md5sum, filemd5sum);
|
||||||
|
} catch ( Exception e ) {
|
||||||
|
throw new RuntimeException("Failed to read bytes from calls file: " + resultsFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte[] getBytesFromFile(File file) throws IOException {
|
||||||
|
InputStream is = new FileInputStream(file);
|
||||||
|
|
||||||
|
// Get the size of the file
|
||||||
|
long length = file.length();
|
||||||
|
|
||||||
|
if (length > Integer.MAX_VALUE) {
|
||||||
|
// File is too large
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the byte array to hold the data
|
||||||
|
byte[] bytes = new byte[(int)length];
|
||||||
|
|
||||||
|
// Read in the bytes
|
||||||
|
int offset = 0;
|
||||||
|
int numRead = 0;
|
||||||
|
while (offset < bytes.length
|
||||||
|
&& (numRead=is.read(bytes, offset, bytes.length-offset)) >= 0) {
|
||||||
|
offset += numRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure all the bytes have been read in
|
||||||
|
if (offset < bytes.length) {
|
||||||
|
throw new IOException("Could not completely read file "+file.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close the input stream and return bytes
|
||||||
|
is.close();
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void oneMBCallTest(BaseMismatchModel model) {
|
||||||
|
logger.warn("Executing oneMBCallTest for " + model);
|
||||||
|
|
||||||
|
InputStream stream = new ByteArrayInputStream(gatkargs(oneMB).getBytes());
|
||||||
|
GATKArgumentCollection mycoll = GATKArgumentCollection.unmarshal(stream);
|
||||||
|
mycoll.intervals = Arrays.asList("1:10,000,000-11,000,000");
|
||||||
|
|
||||||
|
SingleSampleGenotyper SSG = new SingleSampleGenotyper();
|
||||||
|
SSG.VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
|
||||||
|
SSG.LOD_THRESHOLD = 5.0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
SSG.VARIANTS_FILE = File.createTempFile("SSGTempTmpCalls.geli.calls", ".tmp" );
|
||||||
|
} catch (IOException ex) {
|
||||||
|
System.err.println("Cannot create temp file: " + ex.getMessage());
|
||||||
|
}
|
||||||
|
SSG.baseModel = model;
|
||||||
|
|
||||||
|
GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||||
|
System.out.printf("model is %s, SSG says %s%n", model, SSG.baseModel );
|
||||||
|
Accumulator obj = (Accumulator)engine.execute(mycoll, SSG);
|
||||||
|
SingleSampleGenotyper.CallResult calls = (SingleSampleGenotyper.CallResult)obj.finishTraversal();
|
||||||
|
logger.warn(String.format("SSG(%s): made %d calls at %d bases", SSG.baseModel, calls.nConfidentCalls, calls.nCalledBases));
|
||||||
|
assertMatchingMD5("testBasic", model, SSG.VARIANTS_FILE);
|
||||||
|
assertGoodNumberOfCalls("testBasic", model, calls);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void oneStateOneMBTest() { oneMBCallTest(BaseMismatchModel.ONE_STATE); }
|
||||||
|
@Test public void threeStateOneMBTest() { oneMBCallTest(BaseMismatchModel.THREE_STATE); }
|
||||||
|
@Test public void empiricalStateOneMBTest() { oneMBCallTest(BaseMismatchModel.EMPIRICAL); }
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,9 @@
|
||||||
|
#!/bin/tcsh
|
||||||
|
|
||||||
|
foreach model (ONE_STATE THREE_STATE EMPIRICAL)
|
||||||
|
java -Xmx4096m -jar ~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -L 1:10,000,000-11,000,000 -R /broad/1KG/reference/human_b36_both.fasta -T SingleSampleGenotyper -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout ./$model.geli.calls -lod 5 -m $model
|
||||||
|
end
|
||||||
|
wc -l *.geli.calls
|
||||||
|
md5sum *.geli.calls
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue