Initial commit, without any tool changes, of a new infrastructure for determining tranches. This new version walks up from the lowest-quality SNPs and determines Ti/Tv. This is marginally more stable than moving in the other direction when there are few novel variants (exomes). It can make a substantial difference in the size of the call set (10-20%). I'll hook it into the main system now. Includes a new class, Tranche, and isolated read/write utilities that are now tested in TestVariantRecalibrator, which should be moved to UnitTest as soon as I can figure out how to do this on my Mac.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4654 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-11-11 23:52:49 +00:00
parent ed6396ed43
commit ec83a4b765
7 changed files with 18544 additions and 2 deletions

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*
* @author rpoplin
* @since Mar 17, 2010
*
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
*/
/**
 * Test harness walker for the new tranche-finding infrastructure: reads a
 * whitespace-separated data file of (qual, isTransition, isKnown) triples,
 * computes FDR tranches from them, prints the resulting tranche table, and
 * optionally prints tranches read back from comparison files.
 *
 * @author rpoplin
 * @since Mar 17, 2010
 *
 * @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
 */
public class TestVariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {

    /////////////////////////////
    // Inputs
    /////////////////////////////
    @Input(fullName="dataFile", shortName="dataFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
    private File dataFile;

    @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=true)
    private double TARGET_TITV = 2.07;

    @Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
    private double[] FDR_TRANCHES = null;

    // NOTE(review): declared as double only so the -1 sentinel matches the other
    // arguments' style; semantically this is an integer record cap.
    @Argument(fullName="maxElements", shortName="maxElements", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
    private double maxElements = -1;

    @Argument(fullName="sixElementFile", shortName="sixElementFile", doc="", required=false)
    private File sixElementFile = null;

    @Argument(fullName="fourElementFile", shortName="fourElementFile", doc="", required=false)
    private File fourElementFile = null;

    /**
     * Loads the data file, computes and prints the tranches, then exits the JVM --
     * this walker only exercises the tranche machinery and never traverses.
     */
    public void initialize() {
        List<VariantDatum> vd = new ArrayList<VariantDatum>();

        try {
            for ( String line : new XReadLines(dataFile, true) ) {
                String[] parts = line.split(" ");
                if ( ! parts[0].equals("QUAL") ) { // skip the header row
                    VariantDatum datum = new VariantDatum();
                    datum.qual = Double.valueOf(parts[0]);
                    datum.isTransition = parts[1].equals("1");
                    datum.isKnown = parts[2].equals("1");
                    vd.add(datum);
                    if ( maxElements != -1 && vd.size() > maxElements )
                        break; // honor the optional cap on records read
                }
            }
        } catch (FileNotFoundException e) {
            // was: StingException("foo", e) -- give the user an actionable message
            throw new StingException("Could not find tranche input data file: " + dataFile, e);
        }

        List<Tranche> tranches = VariantGaussianMixtureModel.findTranches(vd.toArray(new VariantDatum[0]), FDR_TRANCHES, TARGET_TITV);
        // print(), not printf(): the tranche table is data, not a format string --
        // a stray '%' in it would make printf throw
        System.out.print(Tranche.tranchesString(tranches));

        if ( sixElementFile != null ) {
            List<Tranche> six = Tranche.readTraches(sixElementFile);
            System.out.printf("six%n");
            System.out.print(Tranche.tranchesString(six));
        }

        if ( fourElementFile != null ) {
            List<Tranche> four = Tranche.readTraches(fourElementFile);
            System.out.printf("four%n");
            System.out.print(Tranche.tranchesString(four));
        }

        // Deliberate short-circuit: all the work happens in initialize().
        System.exit(0);
    }

    /** Unused -- traversal never runs because initialize() exits. */
    public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        return null;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    public ExpandingArrayList<VariantDatum> reduceInit() {
        return new ExpandingArrayList<VariantDatum>();
    }

    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
        reduceSum.addAll( mapValue );
        return reduceSum;
    }

    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
    }
}

View File

@ -0,0 +1,120 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
import java.util.*;
/**
*/
/**
 * Record of one FDR tranche: the quality cutoff (pCut) together with the
 * known/novel counts and Ti/Tv ratios of the variants above that cutoff.
 * Also provides read/write helpers for the tranches CSV format.
 */
public class Tranche {
    public double fdr, pCut, knownTiTv, novelTiTv;
    public int numKnown,numNovel;

    /** Convenience constructor for tranches without known-variant statistics (recorded as -1). */
    public Tranche(double fdr, double pCut, double novelTiTv, int numNovel) {
        this(fdr,pCut,-1, -1, numNovel, novelTiTv);
    }

    public Tranche(double fdr, double pCut, int numKnown, double knownTiTv, int numNovel, double novelTiTv) {
        this.fdr = fdr;
        this.pCut = pCut;
        this.novelTiTv = novelTiTv;
        this.numNovel = numNovel;
        this.knownTiTv = knownTiTv;
        this.numKnown = numKnown;
    }

    public String toString() {
        return String.format("[Tranche cut = %.3f with %d novels @ %.2f]", pCut, numNovel, novelTiTv);
    }

    /**
     * Renders tranches as a CSV table (header + one row per tranche). Each row's
     * filter name encodes the FDR interval from the previous tranche (or 0.0 for
     * the first) up to this tranche's FDR.
     */
    public static String tranchesString(List<Tranche> tranches) {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        PrintStream stream = new PrintStream(bytes);

        stream.println("FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName");
        Tranche prev = null;
        for ( Tranche t : tranches ) {
            stream.printf("%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f%n",
                    t.fdr,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.pCut,
                    (prev == null ? 0.0 : prev.fdr), t.fdr);
            prev = t;
        }

        return bytes.toString();
    }

    // Column accessors: absent columns default to -1 / -1.0 so the four-element
    // and six-element file formats can share one reader.
    private static double getDouble(Map<String,String> bindings, String key) {
        return bindings.containsKey(key) ? Double.valueOf(bindings.get(key)) : -1.0;
    }

    private static int getInteger(Map<String,String> bindings, String key) {
        return bindings.containsKey(key) ? Integer.valueOf(bindings.get(key)) : -1;
    }

    /**
     * Reads a tranches CSV file as written by {@link #tranchesString}. The first
     * line is taken as the header; missing columns default to -1.
     */
    public static List<Tranche> readTranches(File f) {
        String[] header = null;
        List<Tranche> tranches = new ArrayList<Tranche>();

        try {
            for( final String line : new XReadLines(f) ) {
                final String[] vals = line.split(",");
                if( header == null ) {
                    header = vals;
                } else {
                    Map<String,String> bindings = new HashMap<String, String>();
                    // bound by both lengths so a malformed row with extra fields
                    // doesn't throw ArrayIndexOutOfBoundsException on header[i]
                    for ( int i = 0; i < vals.length && i < header.length; i++ ) bindings.put(header[i], vals[i]);
                    tranches.add(new Tranche(getDouble(bindings,"FDRtranche"),
                            getDouble(bindings,"pCut"),
                            getInteger(bindings,"numKnown"),
                            getDouble(bindings,"knownTiTv"),
                            getInteger(bindings,"numNovel"),
                            getDouble(bindings,"novelTiTv")));
                }
            }

            return tranches;
        } catch( FileNotFoundException e ) {
            // NOTE(review): CouldNotCreateOutputFile is a misleading wrapper for a
            // *read* failure -- switch to a read-oriented UserException subtype if
            // one exists in this codebase.
            throw new UserException.CouldNotCreateOutputFile(f, e);
        }
    }

    /**
     * @deprecated misspelled name kept for existing callers; use {@link #readTranches(File)}.
     */
    @Deprecated
    public static List<Tranche> readTraches(File f) {
        return readTranches(f);
    }
}

View File

@ -31,10 +31,14 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
* Date: Feb 24, 2010
*/
public class VariantDatum {
/**
 * One variant's worth of data for recalibration: its annotation vector plus the
 * flags and quality score the tranche machinery needs. Natural ordering is by
 * quality score, ascending.
 */
public class VariantDatum implements Comparable<VariantDatum> {
    public double[] annotations;   // per-annotation values for this variant
    public boolean isTransition;   // true if the SNP is a transition (vs. transversion)
    public boolean isKnown;        // true if the site is in a known-variants resource
    public double qual;            // variant quality score
    public double weight;          // weight assigned by the model

    /** Orders variants by qual, ascending; NaN-safe via Double.compare. */
    @Override
    public int compareTo(final VariantDatum that) {
        return Double.compare(qual, that.qual);
    }
}

View File

@ -40,7 +40,7 @@ import Jama.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.Random;
import java.util.*;
import java.util.regex.Pattern;
/**
@ -462,6 +462,91 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return sum;
}
// ---------------------------------------------------------------------------------------------------------
//
// Code to determine FDR tranches for VariantDatum[]
//
// ---------------------------------------------------------------------------------------------------------

/**
 * Computes one tranche per requested FDR level by walking up from the
 * lowest-quality variants. Stops at the first level for which no qualifying
 * tranche exists; it is a user error if not even the first level can be met.
 *
 * @param data        the variants to slice; the input array is not modified
 * @param FDRtranches the FDR levels (in percent) to slice at; must be non-empty
 * @param targetTiTv  expected novel Ti/Tv, used to translate FDR into a Ti/Tv threshold
 * @return the tranches found, in the order of FDRtranches
 */
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv ) {
    // Fail with a clear message instead of an NPE: the walker's FDRtranche
    // argument is optional and defaults to null.
    if ( FDRtranches == null || FDRtranches.length == 0 )
        throw new UserException("No FDR tranche levels were provided; supply at least one level to slice the data at");

    List<VariantDatum> tranchesData = sortVariantsbyQual(data);
    double[] runningTiTv = calculateRunningTiTv(tranchesData);

    List<Tranche> tranches = new ArrayList<Tranche>();
    for ( double fdr : FDRtranches ) {
        Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv);

        if ( t == null ) {
            if ( tranches.size() == 0 )
                throw new UserException("Couldn't find any tranche containing variants with a TiTv > target of " + targetTiTv);
            break; // stop at the first level we can't satisfy, keeping what we have
        }

        tranches.add(t);
    }

    return tranches;
}
/**
 * Returns a fresh list containing the input variants sorted by qual, ascending
 * (VariantDatum's natural order); the input array itself is left untouched.
 */
private final static List<VariantDatum> sortVariantsbyQual(final VariantDatum[] data) {
    final List<VariantDatum> byQual = new ArrayList<VariantDatum>(data.length);
    Collections.addAll(byQual, data);
    Collections.sort(byQual);
    return byQual;
}
/**
 * For each index i, computes the novel Ti/Tv ratio of the variants at positions
 * >= i (i.e. with qual >= data[i].qual, since data is sorted by qual ascending),
 * by sweeping from the high-quality end down to the low-quality end.
 *
 * NOTE(review): entries whose datum is *known* are never assigned and stay 0.0,
 * so findTranche can only start a tranche at a novel variant's index -- confirm
 * this is intentional rather than an oversight.
 */
private static double[] calculateRunningTiTv(List<VariantDatum> data) {
int ti = 0, tv = 0;
double[] run = new double[data.size()];
// walk from best qual to worst, accumulating novel transition/transversion counts
for ( int i = data.size() - 1; i >= 0; i-- ) {
VariantDatum datum = data.get(i);
if ( ! datum.isKnown ) {
if ( datum.isTransition ) { ti++; } else { tv++; }
run[i] = ti / Math.max(1.0 * tv, 1.0); // max() guards against division by zero when tv == 0
}
}
return run;
}
/**
 * Finds the largest suffix of the qual-sorted data whose running novel Ti/Tv
 * meets the ratio implied by the desired FDR, and returns it as a Tranche.
 *
 * @return the tranche, or null when no suffix reaches the required Ti/Tv
 */
public final static Tranche findTranche( final List<VariantDatum> data, double[] runningTiTv, final double desiredFDR, double targetTiTv ) {
    // Translate the requested FDR into the novel Ti/Tv it implies.
    final double minTiTv = fdrToTiTv(desiredFDR, targetTiTv);

    // Scan up from the lowest-quality end for the first index whose suffix qualifies.
    int start = 0;
    while ( start < runningTiTv.length && runningTiTv[start] < minTiTv )
        start++;

    // No qualifying suffix at all -> signal failure to the caller with null.
    if ( start == runningTiTv.length )
        return null;

    return trancheOfVariants(data, start, desiredFDR);
}
/**
 * Builds the Tranche consisting of every variant at index >= minI in the
 * qual-sorted data, tallying known/novel counts and Ti/Tv ratios over that
 * suffix; the tranche's pCut is the qual of the first variant included.
 */
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr ) {
    int knownCount = 0, novelCount = 0;
    int knownTransitions = 0, knownTransversions = 0;
    int novelTransitions = 0, novelTransversions = 0;

    for ( final VariantDatum d : data.subList(minI, data.size()) ) {
        if ( d.isKnown ) {
            knownCount++;
            if ( d.isTransition ) knownTransitions++; else knownTransversions++;
        } else {
            novelCount++;
            if ( d.isTransition ) novelTransitions++; else novelTransversions++;
        }
    }

    // Math.max keeps the denominator >= 1 so a transversion-free suffix
    // doesn't divide by zero.
    final double knownRatio = knownTransitions / Math.max(1.0 * knownTransversions, 1.0);
    final double novelRatio = novelTransitions / Math.max(1.0 * novelTransversions, 1.0);

    return new Tranche(fdr, data.get(minI).qual, knownCount, knownRatio, novelCount, novelRatio);
}
/**
 * Maps a desired novel FDR (percent) to the novel Ti/Tv ratio it implies:
 * linear interpolation between 0.5 (the ratio of pure noise, 100% FDR) and
 * the target Ti/Tv (0% FDR).
 */
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
    final double fractionTruePositives = 1.0 - desiredFDR / 100.0;
    return 0.5 + fractionTruePositives * (targetTiTv - 0.5);
}
public final void outputOptimizationCurve( final VariantDatum[] data, final PrintStream outputReportDatFile, final PrintStream tranchesOutputFile,
final int desiredNumVariants, final Double[] FDRtranches, final double MAX_QUAL ) {
final int numVariants = data.length;

View File

@ -0,0 +1,5 @@
FDRtranche,novelTITV,pCut,numNovel,filterName
0.10,2.8000,1.1800,893,FDRtranche0.00to0.10
1.00,2.7782,1.0600,903,FDRtranche0.10to1.00
10.00,2.5714,0.3800,975,FDRtranche1.00to10.00
100.00,1.6091,0.0000,1602,FDRtranche10.00to100.00

View File

@ -0,0 +1,5 @@
FDRtranche,numKnown,numNovel,knownTiTv,novelTiTv,pCut,filterName
0.10,15791,893,3.3086,2.8000,1.1800,FDRtranche0.00to0.10
1.00,15823,903,3.3044,2.7782,1.0600,FDRtranche0.10to1.00
10.00,16035,975,3.2932,2.5714,0.3800,FDRtranche1.00to10.00
100.00,16578,1602,3.2291,1.6091,0.0000,FDRtranche10.00to100.00

File diff suppressed because it is too large Load Diff