Initial commit, without any tool changes, of a new infrastructure for determining tranches. This new version walks up from the lowest-quality SNPs and determines Ti/Tv. This is marginally more stable than moving in the other direction when there are few novel variants (exomes), and it can make a substantial difference in the size of the call set (10-20%). I'll hook it into the main system now. Includes a new class, Tranche, with isolated reading/writing utilities that are currently being tested in TestVariantRecalibrator, which should be moved to a UnitTest as soon as I can figure out how to do this on my Mac.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4654 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ed6396ed43
commit
ec83a4b765
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.dbsnp.DbSNPFeature;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 17, 2010
|
||||
*
|
||||
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*/
|
||||
|
||||
public class TestVariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="dataFile", shortName="dataFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
|
||||
private File dataFile;
|
||||
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true)
|
||||
private double TARGET_TITV = 2.07;
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private double[] FDR_TRANCHES = null;
|
||||
|
||||
@Argument(fullName="maxElements", shortName="maxElements", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
|
||||
private double maxElements = -1;
|
||||
|
||||
@Argument(fullName="sixElementFile", shortName="sixElementFile", doc="", required=false)
|
||||
private File sixElementFile = null;
|
||||
|
||||
@Argument(fullName="fourElementFile", shortName="fourElementFile", doc="", required=false)
|
||||
private File fourElementFile = null;
|
||||
|
||||
public void initialize() {
|
||||
List<VariantDatum> vd = new ArrayList<VariantDatum>();
|
||||
try {
|
||||
for ( String line : new XReadLines(dataFile, true) ) {
|
||||
String[] parts = line.split(" ");
|
||||
if ( ! parts[0].equals("QUAL") ) {
|
||||
VariantDatum datum = new VariantDatum();
|
||||
datum.qual = Double.valueOf(parts[0]);
|
||||
datum.isTransition = parts[1].equals("1");
|
||||
datum.isKnown = parts[2].equals("1");
|
||||
vd.add(datum);
|
||||
|
||||
if ( maxElements != -1 && vd.size() > maxElements )
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("foo", e);
|
||||
}
|
||||
|
||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDR_TRANCHES, TARGET_TITV);
|
||||
System.out.printf(Tranche.tranchesString(tranches));
|
||||
|
||||
if ( sixElementFile != null ) {
|
||||
List<Tranche> six = Tranche.readTraches(sixElementFile);
|
||||
System.out.printf("six%n");
|
||||
System.out.printf(Tranche.tranchesString(six));
|
||||
}
|
||||
|
||||
if ( fourElementFile != null ) {
|
||||
List<Tranche> four = Tranche.readTraches(fourElementFile);
|
||||
System.out.printf("four%n");
|
||||
System.out.printf(Tranche.tranchesString(four));
|
||||
}
|
||||
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
return null;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*/
|
||||
|
||||
public class Tranche {
|
||||
public double fdr, pCut, knownTiTv, novelTiTv;
|
||||
public int numKnown,numNovel;
|
||||
|
||||
public Tranche(double fdr, double pCut, double novelTiTv, int numNovel) {
|
||||
this(fdr,pCut,-1, -1, numNovel, novelTiTv);
|
||||
}
|
||||
|
||||
public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) {
|
||||
this.fdr = fdr;
|
||||
this.pCut = pCut;
|
||||
this.novelTiTv = novelTiTv;
|
||||
this.numNovel = numNovel;
|
||||
this.knownTiTv = knownTiTv;
|
||||
this.numKnown = numKnown;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("[Tranche cut = %.3f with %d novels @ %.2f]", pCut, numNovel, novelTiTv);
|
||||
}
|
||||
|
||||
public static String tranchesString(List<Tranche> tranches) {
|
||||
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
|
||||
PrintStream stream = new PrintStream(bytes);
|
||||
|
||||
stream.println("FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName");
|
||||
|
||||
Tranche prev = null;
|
||||
for ( Tranche t : tranches ) {
|
||||
stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f%n",
|
||||
t.fdr,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut,
|
||||
(prev == null ? 0.0 : prev.fdr), t.fdr);
|
||||
prev = t;
|
||||
}
|
||||
|
||||
return bytes.toString();
|
||||
}
|
||||
|
||||
private static double getDouble(Map<String,String> bindings, String key) {
|
||||
return bindings.containsKey(key) ? Double.valueOf(bindings.get(key)) : -1.0;
|
||||
}
|
||||
|
||||
private static int getInteger(Map<String,String> bindings, String key) {
|
||||
return bindings.containsKey(key) ? Integer.valueOf(bindings.get(key)) : -1;
|
||||
}
|
||||
|
||||
public static List<Tranche> readTraches(File f) {
|
||||
String[] header = null;
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
|
||||
try {
|
||||
for( final String line : new XReadLines(f) ) {
|
||||
final String[] vals = line.split(",");
|
||||
if( header == null ) {
|
||||
header = vals;
|
||||
} else {
|
||||
Map<String,String> bindings = new HashMap<String, String>();
|
||||
for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]);
|
||||
tranches.add(new Tranche(getDouble(bindings,"FDRtranche"),
|
||||
getDouble(bindings,"pCut"),
|
||||
getInteger(bindings,"numKnown"),
|
||||
getDouble(bindings,"knownTiTv"),
|
||||
getInteger(bindings,"numNovel"),
|
||||
getDouble(bindings,"novelTiTv")));
|
||||
}
|
||||
}
|
||||
|
||||
return tranches;
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(f, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -31,10 +31,14 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
public class VariantDatum {
|
||||
/**
 * Plain data holder for one variant call. Instances order themselves by ascending qual, so
 * sorting a collection of them puts the lowest-quality variants first.
 */
public class VariantDatum implements Comparable<VariantDatum> {
    // NOTE(review): presumably the per-variant annotation values used for clustering - confirm with callers
    public double[] annotations;
    public boolean isTransition, isKnown;
    // qual is the sort key used by compareTo
    public double qual, weight;

    /**
     * Orders two data by their qual field only.
     *
     * @param that the datum to compare against
     * @return a negative, zero or positive value as this qual is less than, equal to or greater than that qual
     */
    @Override
    public int compareTo(final VariantDatum that) {
        return Double.compare(qual, that.qual);
    }
}
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ import Jama.*;
|
|||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Random;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
|
|
@ -462,6 +462,91 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
return sum;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to determine FDR tranches for VariantDatum[]
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv ) {
|
||||
List<VariantDatum> tranchesData = sortVariantsbyQual(data);
|
||||
double[] runningTiTv = calculateRunningTiTv(tranchesData);
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
for ( double fdr : FDRtranches ) {
|
||||
Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv);
|
||||
|
||||
if ( t == null ) {
|
||||
if ( tranches.size() == 0 )
|
||||
throw new UserException("Couldn't find any tranche containing variants with a TiTv > target of " + targetTiTv);
|
||||
break;
|
||||
}
|
||||
|
||||
tranches.add(t);
|
||||
}
|
||||
|
||||
return tranches;
|
||||
}
|
||||
|
||||
private final static List<VariantDatum> sortVariantsbyQual(final VariantDatum[] data) {
|
||||
List<VariantDatum> sorted = new ArrayList<VariantDatum>(Arrays.asList(data));
|
||||
Collections.sort(sorted);
|
||||
return sorted;
|
||||
}
|
||||
|
||||
private static double[] calculateRunningTiTv(List<VariantDatum> data) {
|
||||
int ti = 0, tv = 0;
|
||||
double[] run = new double[data.size()];
|
||||
|
||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
if ( ! datum.isKnown ) {
|
||||
if ( datum.isTransition ) { ti++; } else { tv++; }
|
||||
run[i] = ti / Math.max(1.0 * tv, 1.0);
|
||||
}
|
||||
}
|
||||
|
||||
return run;
|
||||
}
|
||||
|
||||
public final static Tranche findTranche( final List<VariantDatum> data, double[] runningTiTv, final double desiredFDR, double targetTiTv ) {
|
||||
final double titvThreshold = fdrToTiTv(desiredFDR, targetTiTv); // compute the desired TiTv
|
||||
|
||||
for ( int i = 0; i < runningTiTv.length; i++ ) {
|
||||
if ( runningTiTv[i] >= titvThreshold ) {
|
||||
// we've found the largest group of variants with Ti/Tv >= our target titv
|
||||
return trancheOfVariants(data, i, desiredFDR);
|
||||
}
|
||||
}
|
||||
|
||||
// we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
||||
return null;
|
||||
}
|
||||
|
||||
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr ) {
|
||||
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
||||
|
||||
for ( int i = minI; i < data.size(); i++ ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
if ( datum.isKnown ) {
|
||||
numKnown++;
|
||||
if ( datum.isTransition ) { knownTi++; } else { knownTv++; }
|
||||
} else {
|
||||
numNovel++;
|
||||
if ( datum.isTransition ) { novelTi++; } else { novelTv++; }
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
||||
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
||||
|
||||
return new Tranche(fdr, data.get(minI).qual, numKnown, knownTiTv, numNovel, novelTiTv);
|
||||
}
|
||||
|
||||
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
||||
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream tranchesOutputFile,
|
||||
final int desiredNumVariants, final Double[] FDRtranches, final double MAX_QUAL ) {
|
||||
final int numVariants = data.length;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
FDRtranche,novelTITV,pCut,numNovel,filterName
|
||||
0.10,2.8000,1.1800,893,FDRtranche0.00to0.10
|
||||
1.00,2.7782,1.0600,903,FDRtranche0.10to1.00
|
||||
10.00,2.5714,0.3800,975,FDRtranche1.00to10.00
|
||||
100.00,1.6091,0.0000,1602,FDRtranche10.00to100.00
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName
|
||||
0.10,15791,893,3.3086,2.8000,1.1800,FDRtranche0.00to0.10
|
||||
1.00,15823,903,3.3044,2.7782,1.0600,FDRtranche0.10to1.00
|
||||
10.00,16035,975,3.2932,2.5714,0.3800,FDRtranche1.00to10.00
|
||||
100.00,16578,1602,3.2291,1.6091,0.0000,FDRtranche10.00to100.00
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue