A TiTv-free approach for cutting variants! Apparently much better than previous approach, and will work for indels and SV will truly minor modifications to the code. Will discuss with methods group on Monday.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4822 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
81290d238d
commit
abd6ce1c77
|
|
@ -47,17 +47,20 @@ import java.util.*;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class Tranche implements Comparable<Tranche> {
|
public class Tranche implements Comparable<Tranche> {
|
||||||
private static final int CURRENT_VERSION = 2;
|
private static final int CURRENT_VERSION = 3;
|
||||||
|
|
||||||
public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv;
|
public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv;
|
||||||
public int numKnown,numNovel;
|
public int numKnown,numNovel;
|
||||||
public String name;
|
public String name;
|
||||||
|
|
||||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv) {
|
int accessibleTruthSites = 0;
|
||||||
this(fdr, targetTiTv, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, "anonymous");
|
int callsAtTruthSites = 0;
|
||||||
|
|
||||||
|
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) {
|
||||||
|
this(fdr, targetTiTv, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous");
|
||||||
}
|
}
|
||||||
|
|
||||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, String name) {
|
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) {
|
||||||
this.fdr = fdr;
|
this.fdr = fdr;
|
||||||
this.targetTiTv = targetTiTv;
|
this.targetTiTv = targetTiTv;
|
||||||
this.minVQSLod = minVQSLod;
|
this.minVQSLod = minVQSLod;
|
||||||
|
|
@ -67,7 +70,10 @@ public class Tranche implements Comparable<Tranche> {
|
||||||
this.numKnown = numKnown;
|
this.numKnown = numKnown;
|
||||||
this.name = name;
|
this.name = name;
|
||||||
|
|
||||||
if ( fdr <= 0.0 )
|
this.accessibleTruthSites = accessibleTruthSites;
|
||||||
|
this.callsAtTruthSites = callsAtTruthSites;
|
||||||
|
|
||||||
|
if ( fdr < 0.0 )
|
||||||
throw new UserException("Target FDR is unreasonable " + fdr);
|
throw new UserException("Target FDR is unreasonable " + fdr);
|
||||||
|
|
||||||
if ( targetTiTv < 0.5 || targetTiTv > 10 )
|
if ( targetTiTv < 0.5 || targetTiTv > 10 )
|
||||||
|
|
@ -80,13 +86,17 @@ public class Tranche implements Comparable<Tranche> {
|
||||||
throw new ReviewedStingException("BUG -- name cannot be null");
|
throw new ReviewedStingException("BUG -- name cannot be null");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private double getTruthSensitivity() {
|
||||||
|
return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
public int compareTo(Tranche other) {
|
public int compareTo(Tranche other) {
|
||||||
return Double.compare(this.fdr, other.fdr);
|
return Double.compare(this.fdr, other.fdr);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return String.format("Tranche fdr=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) name=%s]",
|
return String.format("Tranche fdr=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]",
|
||||||
fdr, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, name);
|
fdr, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -105,13 +115,13 @@ public class Tranche implements Comparable<Tranche> {
|
||||||
|
|
||||||
stream.println("# Variant quality score tranches file");
|
stream.println("# Variant quality score tranches file");
|
||||||
stream.println("# Version number " + CURRENT_VERSION);
|
stream.println("# Version number " + CURRENT_VERSION);
|
||||||
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName");
|
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity");
|
||||||
|
|
||||||
Tranche prev = null;
|
Tranche prev = null;
|
||||||
for ( Tranche t : tranches ) {
|
for ( Tranche t : tranches ) {
|
||||||
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f%n",
|
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f,%d,%d,%.4f%n",
|
||||||
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod,
|
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod,
|
||||||
(prev == null ? 0.0 : prev.fdr), t.fdr);
|
(prev == null ? 0.0 : prev.fdr), t.fdr, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity());
|
||||||
prev = t;
|
prev = t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -161,7 +171,7 @@ public class Tranche implements Comparable<Tranche> {
|
||||||
if ( header.length == 5 )
|
if ( header.length == 5 )
|
||||||
// old style tranches file, throw an error
|
// old style tranches file, throw an error
|
||||||
throw new UserException.MalformedFile(f, "Unfortuanately, your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator");
|
throw new UserException.MalformedFile(f, "Unfortuanately, your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator");
|
||||||
if ( header.length != 8 )
|
if ( header.length != 8 && header.length != 11 )
|
||||||
throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line);
|
throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line);
|
||||||
} else {
|
} else {
|
||||||
if ( header.length != vals.length )
|
if ( header.length != vals.length )
|
||||||
|
|
@ -176,6 +186,8 @@ public class Tranche implements Comparable<Tranche> {
|
||||||
getDouble(bindings,"knownTiTv", false),
|
getDouble(bindings,"knownTiTv", false),
|
||||||
getInteger(bindings,"numNovel", true),
|
getInteger(bindings,"numNovel", true),
|
||||||
getDouble(bindings,"novelTiTv", true),
|
getDouble(bindings,"novelTiTv", true),
|
||||||
|
getInteger(bindings,"accessibleTruthSites", false),
|
||||||
|
getInteger(bindings,"callsAtTruthSites", false),
|
||||||
bindings.get("filterName")));
|
bindings.get("filterName")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
|
||||||
public boolean isKnown;
|
public boolean isKnown;
|
||||||
public double lod;
|
public double lod;
|
||||||
public double weight;
|
public double weight;
|
||||||
|
public boolean atTruthSite;
|
||||||
|
|
||||||
public int compareTo(VariantDatum other) {
|
public int compareTo(VariantDatum other) {
|
||||||
return Double.compare(this.lod, other.lod);
|
return Double.compare(this.lod, other.lod);
|
||||||
|
|
|
||||||
|
|
@ -460,26 +460,114 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
// Code to determine FDR tranches for VariantDatum[]
|
// Code to determine FDR tranches for VariantDatum[]
|
||||||
//
|
//
|
||||||
// ---------------------------------------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv ) {
|
public static abstract class SelectionMetric {
|
||||||
return findTranches( data, FDRtranches, targetTiTv, null );
|
String name = null;
|
||||||
|
|
||||||
|
public SelectionMetric(String name) {
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() { return name; }
|
||||||
|
|
||||||
|
public abstract double getThreshold(double tranche);
|
||||||
|
public abstract double getTarget();
|
||||||
|
public abstract void calculateRunningMetric(List<VariantDatum> data);
|
||||||
|
public abstract double getRunningMetric(int i);
|
||||||
|
public abstract int datumValue(VariantDatum d);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv, File debugFile ) {
|
public static class NovelTiTvMetric extends SelectionMetric {
|
||||||
logger.info(String.format("Finding tranches for %d variants with %d FDRs and a target TiTv of %.2f", data.length, FDRtranches.length, targetTiTv));
|
double[] runningTiTv;
|
||||||
|
double targetTiTv = 0;
|
||||||
|
|
||||||
|
public NovelTiTvMetric(double target) {
|
||||||
|
super("NovelTiTv");
|
||||||
|
targetTiTv = target; // compute the desired TiTv
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getThreshold(double tranche) {
|
||||||
|
return fdrToTiTv(tranche, targetTiTv);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getTarget() { return targetTiTv; }
|
||||||
|
|
||||||
|
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||||
|
int ti = 0, tv = 0;
|
||||||
|
runningTiTv = new double[data.size()];
|
||||||
|
|
||||||
|
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||||
|
VariantDatum datum = data.get(i);
|
||||||
|
if ( ! datum.isKnown ) {
|
||||||
|
if ( datum.isTransition ) { ti++; } else { tv++; }
|
||||||
|
runningTiTv[i] = ti / Math.max(1.0 * tv, 1.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getRunningMetric(int i) {
|
||||||
|
return runningTiTv[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
public int datumValue(VariantDatum d) {
|
||||||
|
return d.isTransition ? 1 : 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class TruthSensitivityMetric extends SelectionMetric {
|
||||||
|
double[] runningSensitivity;
|
||||||
|
double targetTiTv = 0;
|
||||||
|
int nTrueSites = 0;
|
||||||
|
|
||||||
|
public TruthSensitivityMetric(int nTrueSites) {
|
||||||
|
super("TruthSensitivity");
|
||||||
|
this.nTrueSites = nTrueSites;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getThreshold(double tranche) {
|
||||||
|
return tranche/100; // tranche of 1 => 99% sensivity target
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getTarget() { return 1.0; }
|
||||||
|
|
||||||
|
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||||
|
int nCalledAtTruth = 0;
|
||||||
|
runningSensitivity = new double[data.size()];
|
||||||
|
|
||||||
|
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||||
|
VariantDatum datum = data.get(i);
|
||||||
|
nCalledAtTruth += datum.atTruthSite ? 1 : 0;
|
||||||
|
runningSensitivity[i] = 1 - nCalledAtTruth / (1.0 * nTrueSites);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getRunningMetric(int i) {
|
||||||
|
return runningSensitivity[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
public int datumValue(VariantDatum d) {
|
||||||
|
return d.atTruthSite ? 1 : 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] tranches, SelectionMetric metric ) {
|
||||||
|
return findTranches( data, tranches, metric, null );
|
||||||
|
}
|
||||||
|
|
||||||
|
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] trancheThresholds, SelectionMetric metric, File debugFile ) {
|
||||||
|
logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.length));
|
||||||
|
|
||||||
List<VariantDatum> tranchesData = sortVariantsbyLod(data);
|
List<VariantDatum> tranchesData = sortVariantsbyLod(data);
|
||||||
double[] runningTiTv = calculateRunningTiTv(tranchesData);
|
metric.calculateRunningMetric(tranchesData);
|
||||||
|
|
||||||
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, runningTiTv); }
|
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, metric); }
|
||||||
|
|
||||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||||
for ( double fdr : FDRtranches ) {
|
for ( double trancheThreshold : trancheThresholds ) {
|
||||||
Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv);
|
Tranche t = findTranche(tranchesData, metric, trancheThreshold);
|
||||||
// todo -- should abort early when t's qual is 0 -- that's the lowest we'll get to
|
|
||||||
|
|
||||||
if ( t == null ) {
|
if ( t == null ) {
|
||||||
if ( tranches.size() == 0 )
|
if ( tranches.size() == 0 )
|
||||||
throw new UserException("Couldn't find any tranche containing variants with a TiTv > target of " + targetTiTv);
|
throw new UserException(String.format("Couldn't find any tranche containing variants with a %s > %.2f", metric.getName(), metric.getThreshold(trancheThreshold)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -489,13 +577,15 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
return tranches;
|
return tranches;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void writeTranchesDebuggingInfo(File f, List<VariantDatum> tranchesData, double[] runningTiTv ) {
|
private static void writeTranchesDebuggingInfo(File f, List<VariantDatum> tranchesData, SelectionMetric metric ) {
|
||||||
try {
|
try {
|
||||||
PrintStream out = new PrintStream(f);
|
PrintStream out = new PrintStream(f);
|
||||||
out.println("Qual isTransition runningTiTv");
|
out.println("Qual metricValue runningValue");
|
||||||
for ( int i = 0; i < runningTiTv.length; i++ ) {
|
for ( int i = 0; i < tranchesData.size(); i++ ) {
|
||||||
VariantDatum d = tranchesData.get(i);
|
VariantDatum d = tranchesData.get(i);
|
||||||
out.printf("%.4f %d %.4f%n", d.lod, d.isTransition ? 1 : 0, runningTiTv[i]);
|
int score = metric.datumValue(d);
|
||||||
|
double runningValue = metric.getRunningMetric(i);
|
||||||
|
out.printf("%.4f %d %.4f%n", d.lod, score, runningValue);
|
||||||
}
|
}
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new UserException.CouldNotCreateOutputFile(f, e);
|
throw new UserException.CouldNotCreateOutputFile(f, e);
|
||||||
|
|
@ -508,44 +598,46 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
return sorted;
|
return sorted;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double[] calculateRunningTiTv(List<VariantDatum> data) {
|
public final static Tranche findTranche( final List<VariantDatum> data, final SelectionMetric metric, final double trancheThreshold ) {
|
||||||
int ti = 0, tv = 0;
|
logger.info(String.format(" Tranche threshold %.2f => selection metric threshold %.3f", trancheThreshold, metric.getThreshold(trancheThreshold)));
|
||||||
double[] run = new double[data.size()];
|
|
||||||
|
|
||||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
double metricThreshold = metric.getThreshold(trancheThreshold);
|
||||||
VariantDatum datum = data.get(i);
|
int n = data.size();
|
||||||
if ( ! datum.isKnown ) {
|
for ( int i = 0; i < n; i++ ) {
|
||||||
if ( datum.isTransition ) { ti++; } else { tv++; }
|
if ( metric.getRunningMetric(i) >= metricThreshold ) {
|
||||||
run[i] = ti / Math.max(1.0 * tv, 1.0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return run;
|
|
||||||
}
|
|
||||||
|
|
||||||
public final static Tranche findTranche( final List<VariantDatum> data, double[] runningTiTv, final double desiredFDR, double targetTiTv ) {
|
|
||||||
if ( data.size() != runningTiTv.length ) throw new ReviewedStingException("BUG: data and running titv are of different sizes");
|
|
||||||
|
|
||||||
final double titvThreshold = fdrToTiTv(desiredFDR, targetTiTv); // compute the desired TiTv
|
|
||||||
|
|
||||||
for ( int i = 0; i < runningTiTv.length; i++ ) {
|
|
||||||
if ( runningTiTv[i] >= titvThreshold ) {
|
|
||||||
// we've found the largest group of variants with Ti/Tv >= our target titv
|
// we've found the largest group of variants with Ti/Tv >= our target titv
|
||||||
Tranche t = trancheOfVariants(data, i, desiredFDR, targetTiTv);
|
Tranche t = trancheOfVariants(data, i, trancheThreshold, metric.getTarget());
|
||||||
logger.info(String.format(" Found tranche for %.3f FDR = %.3f TiTv threshold starting with variant %d; running TiTv is %.2f ",
|
logger.info(String.format(" Found tranche for %.3f: %.3f threshold starting with variant %d; running score is %.3f ",
|
||||||
desiredFDR, titvThreshold, i, runningTiTv[i]));
|
trancheThreshold, metricThreshold, i, metric.getRunningMetric(i)));
|
||||||
logger.info(String.format(" Trance is %s", t));
|
logger.info(String.format(" Tranche is %s", t));
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
// we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
||||||
logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
|
//logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
|
||||||
desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
|
// desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
|
||||||
|
|
||||||
|
|
||||||
|
// for ( int i = 0; i < runningTiTv.length; i++ ) {
|
||||||
|
// if ( runningTiTv[i] >= titvThreshold ) {
|
||||||
|
// // we've found the largest group of variants with Ti/Tv >= our target titv
|
||||||
|
// Tranche t = trancheOfVariants(data, i, desiredFDR, targetTiTv);
|
||||||
|
// logger.info(String.format(" Found tranche for %.3f FDR = %.3f TiTv threshold starting with variant %d; running TiTv is %.2f ",
|
||||||
|
// desiredFDR, titvThreshold, i, metric.getRunningMetric(i)));
|
||||||
|
// logger.info(String.format(" Tranche is %s", t));
|
||||||
|
// return t;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// // we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
||||||
|
// logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
|
||||||
|
// desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
|
||||||
|
//
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double targetTiTv ) {
|
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double target ) {
|
||||||
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
||||||
|
|
||||||
double minLod = data.get(minI).lod;
|
double minLod = data.get(minI).lod;
|
||||||
|
|
@ -568,13 +660,34 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
||||||
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
||||||
|
|
||||||
return new Tranche(fdr, targetTiTv, minLod, numKnown, knownTiTv, numNovel, novelTiTv);
|
int accessibleTruthSites = countCallsAtTruth(data, Double.NEGATIVE_INFINITY);
|
||||||
|
int nCallsAtTruth = countCallsAtTruth(data, minLod);
|
||||||
|
|
||||||
|
|
||||||
|
return new Tranche(fdr, target, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
||||||
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int countCallsAtTruth(final VariantDatum[] data, double minLOD ) {
|
||||||
|
int n = 0;
|
||||||
|
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int countCallsAtTruth(final List<VariantDatum> data, double minLOD ) {
|
||||||
|
int n = 0;
|
||||||
|
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Code to evaluate vectors given an already trained model
|
||||||
|
//
|
||||||
|
// ---------------------------------------------------------------------------------------------------------
|
||||||
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
|
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {
|
||||||
|
|
||||||
final int numAnnotations = data[0].annotations.length;
|
final int numAnnotations = data[0].annotations.length;
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
|
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
|
||||||
private double PRIOR_1KG = 12.0;
|
private double PRIOR_1KG = 12.0;
|
||||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||||
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 5.0, 10.0};
|
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
|
||||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
|
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
|
||||||
private String PATH_TO_RSCRIPT = "Rscript";
|
private String PATH_TO_RSCRIPT = "Rscript";
|
||||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
||||||
|
|
@ -112,10 +112,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||||
private double QUAL_THRESHOLD = 0.0;
|
private double QUAL_THRESHOLD = 0.0;
|
||||||
|
|
||||||
@Hidden
|
@Hidden
|
||||||
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
||||||
private File DEBUG_FILE = null;
|
private File DEBUG_FILE = null;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
|
||||||
|
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
|
||||||
|
|
||||||
|
public enum SelectionMetricType {
|
||||||
|
NOVEL_TITV,
|
||||||
|
TRUTH_SENSITIVITY
|
||||||
|
}
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
@ -126,6 +136,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
private NestedHashMap priorCache = new NestedHashMap();
|
private NestedHashMap priorCache = new NestedHashMap();
|
||||||
private boolean trustACField = false;
|
private boolean trustACField = false;
|
||||||
private PrintStream tranchesStream = null;
|
private PrintStream tranchesStream = null;
|
||||||
|
private int nTruthSites = 0;
|
||||||
|
|
||||||
private static double round4(double num) {
|
private static double round4(double num) {
|
||||||
double result = num * 10000.0;
|
double result = num * 10000.0;
|
||||||
|
|
@ -217,6 +228,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
return mapList;
|
return mapList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, "truth", null, context.getLocation(), false, true);
|
||||||
|
final VariantContext vcTruth = ( vcsTruth.size() != 0 ? vcsTruth.iterator().next() : null );
|
||||||
|
nTruthSites += vcTruth != null && vcTruth.isVariant() ? 1 : 0;
|
||||||
|
|
||||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
||||||
if( vc != null && vc.isSNP() ) {
|
if( vc != null && vc.isSNP() ) {
|
||||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||||
|
|
@ -293,6 +308,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
variantDatum.pos = vc.getStart();
|
variantDatum.pos = vc.getStart();
|
||||||
variantDatum.lod = round4(lod);
|
variantDatum.lod = round4(lod);
|
||||||
|
|
||||||
|
// deal with the truth calculation
|
||||||
|
variantDatum.atTruthSite = vcTruth != null && vcTruth.isVariant();
|
||||||
|
|
||||||
mapList.add( variantDatum );
|
mapList.add( variantDatum );
|
||||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||||
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
|
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
|
||||||
|
|
@ -327,11 +345,22 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
||||||
}
|
}
|
||||||
|
|
||||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||||
|
|
||||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||||
|
|
||||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, TARGET_TITV, DEBUG_FILE );
|
// deal with truth information
|
||||||
|
int nCallsAtTruth = VariantGaussianMixtureModel.countCallsAtTruth(dataManager.data, Double.NEGATIVE_INFINITY);
|
||||||
|
logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
|
||||||
|
nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));
|
||||||
|
|
||||||
|
VariantGaussianMixtureModel.SelectionMetric metric = null;
|
||||||
|
switch ( SELECTION_METRIC_TYPE ) {
|
||||||
|
case NOVEL_TITV: metric = new VariantGaussianMixtureModel.NovelTiTvMetric(TARGET_TITV); break;
|
||||||
|
case TRUTH_SENSITIVITY: metric = new VariantGaussianMixtureModel.TruthSensitivityMetric(nCallsAtTruth); break;
|
||||||
|
default: throw new ReviewedStingException("BUG: unexpected SelectionMetricType " + SELECTION_METRIC_TYPE);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, metric, DEBUG_FILE );
|
||||||
tranchesStream.print(Tranche.tranchesString(tranches));
|
tranchesStream.print(Tranche.tranchesString(tranches));
|
||||||
|
|
||||||
// Execute Rscript command to plot the optimization curve
|
// Execute Rscript command to plot the optimization curve
|
||||||
|
|
|
||||||
|
|
@ -122,10 +122,15 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final List<Tranche> findMyTranches(List<VariantDatum> vd, double[] fdrs, double targetTiTv) {
|
||||||
|
VariantGaussianMixtureModel.SelectionMetric metric = new VariantGaussianMixtureModel.NovelTiTvMetric(targetTiTv);
|
||||||
|
return VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), fdrs, metric);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public final void testFindTranches1() {
|
public final void testFindTranches1() {
|
||||||
List<VariantDatum> vd = readData();
|
List<VariantDatum> vd = readData();
|
||||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, TARGET_TITV);
|
List<Tranche> tranches = findMyTranches(vd, FDRS, TARGET_TITV);
|
||||||
System.out.printf(Tranche.tranchesString(tranches));
|
System.out.printf(Tranche.tranchesString(tranches));
|
||||||
assertTranchesAreTheSame(read(EXPECTED_TRANCHES_NEW), tranches, true, false);
|
assertTranchesAreTheSame(read(EXPECTED_TRANCHES_NEW), tranches, true, false);
|
||||||
}
|
}
|
||||||
|
|
@ -133,12 +138,12 @@ public final class VariantGaussianMixtureModelUnitTest extends BaseTest {
|
||||||
@Test(expectedExceptions = {UserException.class})
|
@Test(expectedExceptions = {UserException.class})
|
||||||
public final void testBadFDR() {
|
public final void testBadFDR() {
|
||||||
List<VariantDatum> vd = readData();
|
List<VariantDatum> vd = readData();
|
||||||
VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), new double[]{-1}, TARGET_TITV);
|
List<Tranche> tranches = findMyTranches(vd, new double[]{-1}, TARGET_TITV);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions = {UserException.class})
|
@Test(expectedExceptions = {UserException.class})
|
||||||
public final void testBadTargetTiTv() {
|
public final void testBadTargetTiTv() {
|
||||||
List<VariantDatum> vd = readData();
|
List<VariantDatum> vd = readData();
|
||||||
VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDRS, 0.1);
|
List<Tranche> tranches = findMyTranches(vd, FDRS, 0.1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue