Now understanding GLFs for calculating genotyping results like callable bases, as well as avoids emitting stupid amounts of data when doing a genotype evaluation (i.e., ignores non-SNP() calls)

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1267 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-07-16 21:03:26 +00:00
parent c5f6ab3dd5
commit 0548026a2e
5 changed files with 139 additions and 5 deletions

View File

@ -0,0 +1,88 @@
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.LocusContext;
import java.io.PrintStream;
import java.util.List;
import java.util.Arrays;
import java.util.ArrayList;
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
public class CallableBasesAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis {
long all_bases = 0;
long all_calls = 0;
//final static double[] Qthresholds = { 10, 20, 30, 40, 50, 100, 200, 500, 1000 };
final static double[] thresholds = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100 };
long[] discoverable_bases = new long[thresholds.length];
long[] genotypable_bases = new long[thresholds.length];
public CallableBasesAnalysis() {
super("callable_bases");
}
public long nSites() { return all_bases; }
public long nCalls() { return all_calls; }
public long nDiscoverable(int index) { return discoverable_bases[index]; }
public double percentDiscoverable(int index) { return (100.0*nDiscoverable(index)) / nSites(); }
public long nGenotypable(int index) { return genotypable_bases[index]; }
public double percentGenotypable(int index) { return (100.0*nGenotypable(index)) / nSites(); }
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
all_bases++;
if ( eval == null ) // no data here!
return null;
// we actually have a record here
if ( ! eval.isGenotype() ) { // evaluation record isn't a genotype, die!
throw new RuntimeException("Evaluation track isn't an Genotype!");
}
all_calls++;
// For every threshold, updated discoverable and callable
for ( int i = 0; i < thresholds.length; i++ ) {
double threshold = thresholds[i];
// update discoverable
if ( eval.isSNP() && eval.getVariationConfidence() >= threshold )
discoverable_bases[i]++;
if ( eval.isReference() && eval.getConsensusConfidence() >= threshold )
discoverable_bases[i]++;
if ( eval.getConsensusConfidence() >= threshold )
genotypable_bases[i]++;
//System.out.printf("Updating %s SNP=%b, REF=%b VarConf=%f ConConf=%f where threshold=%f: discoverable = %d, genotypable = %d%n",
// eval.getLocation(), eval.isSNP(), eval.isReference(), eval.getVariationConfidence(),
// eval.getConsensusConfidence(), threshold, discoverable_bases[i], genotypable_bases[i]);
}
return null;
}
public List<String> done() {
List<String> s = new ArrayList<String>();
s.add(String.format("total_no_sites %d", nSites()));
s.add(String.format("total_no_calls %d", nCalls()));
s.add(String.format(""));
s.add(String.format("confidence_threshold\tdiscoverable_bases\tdiscoverable_bases_percent\tgenotypable_bases\tgenotypable_bases_percent"));
for ( int i = 0; i < thresholds.length; i++ ) {
double threshold = thresholds[i];
s.add(String.format("%6.2f\t%d\t%.6f\t%d\t%.6f", threshold, nDiscoverable(i), percentDiscoverable(i), nGenotypable(i), percentGenotypable(i)));
}
return s;
}
}

View File

@ -40,7 +40,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
String r = null;
if ( eval != null ) {
if ( eval != null && eval.isSNP() ) {
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();

View File

@ -39,7 +39,7 @@ public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implement
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
String r = null;
if ( eval != null ) {
if ( eval != null && eval.isSNP() ) {
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();

View File

@ -53,8 +53,8 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
// There are four cases here:
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
inc(dbsnp != null, eval != null);
return dbsnp == null && eval != null ? "Novel " + eval : null;
inc(dbsnp != null, eval != null && eval.isSNP());
return dbsnp == null && eval != null && eval.isSNP() ? "Novel " + eval : null;
}
/**

View File

@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.glf.GLFRecord;
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
@ -87,6 +91,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
analyses.add(new NeighborDistanceAnalysis());
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
analyses.add(new ClusterCounterAnalysis());
analyses.add(new CallableBasesAnalysis());
//
// Filter out analyzes inappropriate for our evaluation type Population or Genotype
@ -97,7 +102,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis);
boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis);
boolean disable = disableForGenotyping | disableForPopulation;
String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null);
String causeName = disableForGenotyping ? "population" : (disableForPopulation ? "genotype" : null);
if ( disable ) {
logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName));
iter.remove();
@ -141,9 +146,50 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
}
}
// OMG this is painful
private rodVariants glf2geli(char ref, RodGLF glf) {
SinglePointCall rec = (SinglePointCall)glf.mRecord;
// contig pos refBase depth maxMappingQ bestGenotype lodBtr lodBtnb genotypes
Integer[] sorted = Utils.SortPermutation(rec.getLikelihoods());
int bestIndex = sorted[0];
char[] refs = {ref, ref};
String homRef = new String(refs);
int refIndex = rodVariants.Genotype.valueOf(homRef).ordinal();
double refLikelihood = rec.getLikelihoods()[refIndex];
double bestLikelihood = rec.getLikelihoods()[bestIndex];
double secondBestLikelihood = rec.getLikelihoods()[sorted[1]];
rodVariants var = new rodVariants("eval");
var.loc = glf.getLocation();
var.refBase = ref;
var.depth = rec.getReadDepth();
var.maxMappingQuality = rec.getRmsMapQ();
var.bestGenotype = rodVariants.Genotype.values()[bestIndex].toString();
var.lodBtr = Math.abs((bestLikelihood - refLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR);
var.lodBtnb = Math.abs((bestLikelihood - secondBestLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR);
var.genotypeLikelihoods = rec.getLikelihoods();
for ( int i = 0; i < var.genotypeLikelihoods.length; i++ )
var.genotypeLikelihoods[i] /= GLFRecord.LIKELIHOOD_SCALE_FACTOR;
if ( false ) {
System.out.printf("Converting : %s%n", glf);
System.out.printf(" homRef: %s%n", homRef);
System.out.printf(" refindex : %d%n", refIndex);
System.out.printf(" bestIndex : %d%n", sorted[0]);
System.out.printf(" 2ndindex : %d%n", sorted[1]);
System.out.printf(" => %s%n", var);
}
return var;
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
nSites++;
if ( tracker.lookup("eval", null) instanceof RodGLF) {
tracker.bind("eval", glf2geli(ref, (RodGLF)tracker.lookup("eval", null)));
}
// Iterate over each analysis, and update it
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);