Now understanding GLFs for calculating genotyping results like callable bases, as well as avoids emitting stupid amounts of data when doing a genotype evaluation (i.e., ignores non-SNP() calls)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1267 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c5f6ab3dd5
commit
0548026a2e
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.varianteval;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
public class CallableBasesAnalysis extends BasicVariantAnalysis implements GenotypeAnalysis {
|
||||
long all_bases = 0;
|
||||
long all_calls = 0;
|
||||
//final static double[] Qthresholds = { 10, 20, 30, 40, 50, 100, 200, 500, 1000 };
|
||||
final static double[] thresholds = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100 };
|
||||
long[] discoverable_bases = new long[thresholds.length];
|
||||
long[] genotypable_bases = new long[thresholds.length];
|
||||
|
||||
public CallableBasesAnalysis() {
|
||||
super("callable_bases");
|
||||
}
|
||||
|
||||
public long nSites() { return all_bases; }
|
||||
public long nCalls() { return all_calls; }
|
||||
public long nDiscoverable(int index) { return discoverable_bases[index]; }
|
||||
public double percentDiscoverable(int index) { return (100.0*nDiscoverable(index)) / nSites(); }
|
||||
public long nGenotypable(int index) { return genotypable_bases[index]; }
|
||||
public double percentGenotypable(int index) { return (100.0*nGenotypable(index)) / nSites(); }
|
||||
|
||||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
all_bases++;
|
||||
|
||||
if ( eval == null ) // no data here!
|
||||
return null;
|
||||
|
||||
// we actually have a record here
|
||||
if ( ! eval.isGenotype() ) { // evaluation record isn't a genotype, die!
|
||||
throw new RuntimeException("Evaluation track isn't an Genotype!");
|
||||
}
|
||||
|
||||
all_calls++;
|
||||
// For every threshold, updated discoverable and callable
|
||||
for ( int i = 0; i < thresholds.length; i++ ) {
|
||||
double threshold = thresholds[i];
|
||||
|
||||
// update discoverable
|
||||
if ( eval.isSNP() && eval.getVariationConfidence() >= threshold )
|
||||
discoverable_bases[i]++;
|
||||
if ( eval.isReference() && eval.getConsensusConfidence() >= threshold )
|
||||
discoverable_bases[i]++;
|
||||
|
||||
if ( eval.getConsensusConfidence() >= threshold )
|
||||
genotypable_bases[i]++;
|
||||
|
||||
//System.out.printf("Updating %s SNP=%b, REF=%b VarConf=%f ConConf=%f where threshold=%f: discoverable = %d, genotypable = %d%n",
|
||||
// eval.getLocation(), eval.isSNP(), eval.isReference(), eval.getVariationConfidence(),
|
||||
// eval.getConsensusConfidence(), threshold, discoverable_bases[i], genotypable_bases[i]);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public List<String> done() {
|
||||
List<String> s = new ArrayList<String>();
|
||||
s.add(String.format("total_no_sites %d", nSites()));
|
||||
s.add(String.format("total_no_calls %d", nCalls()));
|
||||
s.add(String.format(""));
|
||||
s.add(String.format("confidence_threshold\tdiscoverable_bases\tdiscoverable_bases_percent\tgenotypable_bases\tgenotypable_bases_percent"));
|
||||
|
||||
for ( int i = 0; i < thresholds.length; i++ ) {
|
||||
double threshold = thresholds[i];
|
||||
s.add(String.format("%6.2f\t%d\t%.6f\t%d\t%.6f", threshold, nDiscoverable(i), percentDiscoverable(i), nGenotypable(i), percentGenotypable(i)));
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
|
@ -40,7 +40,7 @@ public class ClusterCounterAnalysis extends BasicVariantAnalysis implements Geno
|
|||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null ) {
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ public class NeighborDistanceAnalysis extends ViolationVariantAnalysis implement
|
|||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
String r = null;
|
||||
|
||||
if ( eval != null ) {
|
||||
if ( eval != null && eval.isSNP() ) {
|
||||
IntervalRod intervalROD = (IntervalRod)tracker.lookup("interval", null);
|
||||
GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation();
|
||||
|
||||
|
|
|
|||
|
|
@ -53,8 +53,8 @@ public class VariantDBCoverage extends BasicVariantAnalysis implements GenotypeA
|
|||
public String update(AllelicVariant eval, RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
// There are four cases here:
|
||||
AllelicVariant dbsnp = (AllelicVariant)tracker.lookup(dbName, null);
|
||||
inc(dbsnp != null, eval != null);
|
||||
return dbsnp == null && eval != null ? "Novel " + eval : null;
|
||||
inc(dbsnp != null, eval != null && eval.isSNP());
|
||||
return dbsnp == null && eval != null && eval.isSNP() ? "Novel " + eval : null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -5,6 +5,10 @@ import org.broadinstitute.sting.gatk.refdata.*;
|
|||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFRecord;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.SinglePointCall;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -87,6 +91,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
analyses.add(new NeighborDistanceAnalysis());
|
||||
analyses.add(new HardyWeinbergEquilibrium(badHWEThreshold));
|
||||
analyses.add(new ClusterCounterAnalysis());
|
||||
analyses.add(new CallableBasesAnalysis());
|
||||
|
||||
//
|
||||
// Filter out analyzes inappropriate for our evaluation type Population or Genotype
|
||||
|
|
@ -97,7 +102,7 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
boolean disableForGenotyping = evalContainsGenotypes && ! (analysis instanceof GenotypeAnalysis);
|
||||
boolean disableForPopulation = ! evalContainsGenotypes && ! (analysis instanceof PopulationAnalysis);
|
||||
boolean disable = disableForGenotyping | disableForPopulation;
|
||||
String causeName = disableForGenotyping ? "genotype" : (disableForPopulation ? "population" : null);
|
||||
String causeName = disableForGenotyping ? "population" : (disableForPopulation ? "genotype" : null);
|
||||
if ( disable ) {
|
||||
logger.info(String.format("Disabling %s-only analysis %s in set %s", causeName, analysis, setName));
|
||||
iter.remove();
|
||||
|
|
@ -141,9 +146,50 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
// OMG this is painful
|
||||
private rodVariants glf2geli(char ref, RodGLF glf) {
|
||||
SinglePointCall rec = (SinglePointCall)glf.mRecord;
|
||||
// contig pos refBase depth maxMappingQ bestGenotype lodBtr lodBtnb genotypes
|
||||
Integer[] sorted = Utils.SortPermutation(rec.getLikelihoods());
|
||||
int bestIndex = sorted[0];
|
||||
char[] refs = {ref, ref};
|
||||
String homRef = new String(refs);
|
||||
int refIndex = rodVariants.Genotype.valueOf(homRef).ordinal();
|
||||
double refLikelihood = rec.getLikelihoods()[refIndex];
|
||||
double bestLikelihood = rec.getLikelihoods()[bestIndex];
|
||||
double secondBestLikelihood = rec.getLikelihoods()[sorted[1]];
|
||||
|
||||
rodVariants var = new rodVariants("eval");
|
||||
var.loc = glf.getLocation();
|
||||
var.refBase = ref;
|
||||
var.depth = rec.getReadDepth();
|
||||
var.maxMappingQuality = rec.getRmsMapQ();
|
||||
var.bestGenotype = rodVariants.Genotype.values()[bestIndex].toString();
|
||||
var.lodBtr = Math.abs((bestLikelihood - refLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR);
|
||||
var.lodBtnb = Math.abs((bestLikelihood - secondBestLikelihood) / GLFRecord.LIKELIHOOD_SCALE_FACTOR);
|
||||
var.genotypeLikelihoods = rec.getLikelihoods();
|
||||
for ( int i = 0; i < var.genotypeLikelihoods.length; i++ )
|
||||
var.genotypeLikelihoods[i] /= GLFRecord.LIKELIHOOD_SCALE_FACTOR;
|
||||
|
||||
if ( false ) {
|
||||
System.out.printf("Converting : %s%n", glf);
|
||||
System.out.printf(" homRef: %s%n", homRef);
|
||||
System.out.printf(" refindex : %d%n", refIndex);
|
||||
System.out.printf(" bestIndex : %d%n", sorted[0]);
|
||||
System.out.printf(" 2ndindex : %d%n", sorted[1]);
|
||||
System.out.printf(" => %s%n", var);
|
||||
}
|
||||
|
||||
return var;
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
nSites++;
|
||||
|
||||
if ( tracker.lookup("eval", null) instanceof RodGLF) {
|
||||
tracker.bind("eval", glf2geli(ref, (RodGLF)tracker.lookup("eval", null)));
|
||||
}
|
||||
|
||||
// Iterate over each analysis, and update it
|
||||
AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue