Variant optimizer now outputs VCF files via ApplyVariantClustersWalker. Documentation to be added to the wiki. It is ready to be used by other people but only with great caution.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-03-17 20:41:42 +00:00
parent d9398dc347
commit 58a31bab6a
7 changed files with 145 additions and 73 deletions

View File

@ -11,13 +11,14 @@ data = read.table(input,sep=",",head=T)
maxVars = max(data$numKnown, data$numNovel)
maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV)
minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV)
maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0])
outfile = paste(input, ".optimizationCurve.pdf", sep="")
pdf(outfile, height=7, width=8)
par(mar=c(4,4,1,4),cex=1.3)
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with p(true) > X",ylab="",ylim=c(minTITV,maxTITV),col="Blue",pch=20)
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20)
points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20)
abline(h=targetTITV,lty=3,col="Blue")
axis(side=2,col="DarkBlue")
@ -25,7 +26,7 @@ axis(side=1)
mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4)
legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7)
par(new=T)
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),col="Green",pch=20)
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20)
points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
axis(side=4,col="DarkGreen")
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)

View File

@ -3,10 +3,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
/*
* Copyright (c) 2010 The Broad Institute
@ -47,8 +56,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
/////////////////////////////
// Command Line Arguments
/////////////////////////////
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
//private double TARGET_TITV = 2.1;
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
@ -67,7 +76,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
/////////////////////////////
// Private Member Variables
/////////////////////////////
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
//---------------------------------------------------------------------------------------------------------------
//
@ -78,7 +88,6 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
VariantOptimizationModel theModel = null;
switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
@ -89,6 +98,18 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
default:
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
}
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
}
//---------------------------------------------------------------------------------------------------------------
@ -99,6 +120,35 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
return mapList;
}
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod instanceof RodVCF ) {
final RodVCF rodVCF = ((RodVCF) rod);
if( rodVCF.isSNP() && (IGNORE_INPUT_FILTERS || !rodVCF.isFiltered()) ) {
final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual() );
final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) );
rodVCF.mCurrentRecord.addInfoField("OQ", ((Double)rodVCF.getQual()).toString() );
rodVCF.mCurrentRecord.setQual( recalQual );
vcfWriter.addRecord( rodVCF.mCurrentRecord );
VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = BaseUtils.isTransition((byte)rodVCF.getAlternativeBaseForSNP(), (byte)rodVCF.getReferenceForSNP()); //vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !rodVCF.isNovel(); //!vc.getAttribute("ID").equals(".");
variantDatum.qual = recalQual;
mapList.add( variantDatum );
} else { // not a SNP or is filtered so just dump it out to the VCF file
vcfWriter.addRecord( rodVCF.mCurrentRecord );
}
}
}
return mapList;
}
@ -119,7 +169,22 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptCommandLine );
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}
}
}

View File

@ -4,8 +4,6 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.regex.Pattern;
/*
* Copyright (c) 2010 The Broad Institute
@ -43,9 +41,9 @@ public class VariantDataManager {
public final int numVariants;
public final int numAnnotations;
public final double[] meanVector;
public final double[] varianceVector;
public final double[] varianceVector; // This is really the standard deviation
public boolean isNormalized;
private final ExpandingArrayList<String> annotationKeys;
public final ExpandingArrayList<String> annotationKeys;
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
numVariants = dataList.size();
@ -53,7 +51,7 @@ public class VariantDataManager {
if( numVariants <= 0 ) {
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
}
numAnnotations = data[0].annotations.length;
numAnnotations = _annotationKeys.size() + 1; // +1 for QUAL
if( numAnnotations <= 0 ) {
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
}
@ -91,7 +89,7 @@ public class VariantDataManager {
throw new StingException("Zero variance is a problem: standard deviation = " + theSTD);
}
meanVector[jjj] = theMean;
varianceVector[jjj] = theSTD * theSTD;
varianceVector[jjj] = theSTD;
for( int iii = 0; iii < numVariants; iii++ ) {
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
}

View File

@ -35,5 +35,5 @@ public class VariantDatum {
public double[] annotations;
public boolean isTransition;
public boolean isKnown;
public double pTrue;
public double qual;
}

View File

@ -7,7 +7,7 @@ import org.broadinstitute.sting.utils.xReadLines;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;
@ -44,7 +44,7 @@ import java.util.regex.Pattern;
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
private final VariantDataManager dataManager;
public final VariantDataManager dataManager;
private final int numGaussians;
private final int numIterations;
private final long RANDOM_SEED = 91801305;
@ -59,9 +59,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final int[] numMaxClusterNovel;
private final double[] clusterTITV;
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
//private final int desiredNumVariants;
private final int minVarInCluster;
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
@ -78,7 +78,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterNovel = new int[numGaussians];
clusterTITV = new double[numGaussians];
clusterTruePositiveRate = new double[numGaussians];
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = _minVarInCluster;
}
@ -107,14 +106,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterKnown = null;
numMaxClusterNovel = null;
clusterTITV = null;
clusterTruePositiveRate = null;
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = 0;
numGaussians = clusterLines.size();
mu = new double[numGaussians][];
sigma = new double[numGaussians][];
mu = new double[numGaussians][dataManager.numAnnotations];
sigma = new double[numGaussians][dataManager.numAnnotations];
clusterTruePositiveRate = new double[numGaussians];
int kkk = 0;
for( String line : clusterLines ) {
String[] vals = line.split(",");
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[4]);
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
mu[kkk][jjj] = Double.parseDouble(vals[5+jjj]);
sigma[kkk][jjj] = Double.parseDouble(vals[5+dataManager.numAnnotations+jjj]);
}
kkk++;
}
System.out.println("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
}
public final void run( final String clusterFileName ) {
@ -278,27 +288,45 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
/*
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
public final double evaluateVariant( final Map<String,String> annotationMap, final double qualityScore ) {
final double[] pVarInCluster = new double[numGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
double value = 0.0;
if( dataManager.annotationKeys.get(jjj).equals("QUAL") ) {
value = qualityScore;
} else {
try {
final String stringValue = annotationMap.get( dataManager.annotationKeys.get(jjj) );
if( stringValue != null ) {
value = Double.parseDouble( stringValue );
if( Double.isInfinite(value) ) {
value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
}
}
} catch( NumberFormatException e ) {
// do nothing, default value is 0.0
}
}
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
}
evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
double sum = 0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
}
return sum;
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
final int numVariants = data.length;
final double[] pTrueVariant = new double[numVariants];
final boolean[] markedVariant = new boolean[numVariants];
final double[] pVarInCluster = new double[numGaussians];
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for( int iii = 0; iii < numVariants; iii++ ) {
evaluateGaussiansForSingleVariant( data[iii], pVarInCluster );
pTrueVariant[iii] = 0.0;
markedVariant[iii] = false;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
}
}
PrintStream outputFile = null;
try {
@ -316,10 +344,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int numNovelTv = 0;
boolean foundDesiredNumVariants = false;
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) {
for( int iii = 0; iii < numVariants; iii++ ) {
if( !markedVariant[iii] ) {
if( pTrueVariant[iii] >= pCut ) {
if( data[iii].qual >= pCut ) {
markedVariant[iii] = true;
if( data[iii].isKnown ) { // known
numKnown++;
@ -340,7 +368,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
System.out.println( "Keeping variants with p(true) >= " + String.format("%.1f",pCut) + " results in a filtered set with: " );
System.out.println("\t" + numKnown + " known variants");
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
@ -353,7 +381,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
}
}
*/
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
@ -386,18 +413,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
private void evaluateGaussiansForSingleVariant( final VariantDatum datum, final double[] pVarInCluster ) {
private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) {
final int numAnnotations = datum.annotations.length;
final int numAnnotations = annotations.length;
double sumProb = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) )
/ sigma[kkk][jjj];
}
pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
//BUGBUG: removed pCluster[kkk]*, for the second pass
//pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
pVarInCluster[kkk] = Math.exp( -0.5 * sum );
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
pVarInCluster[kkk] = MIN_PROB;

View File

@ -53,8 +53,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1;
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
//private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_INPUT_FILTERS = false;
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
@ -74,10 +72,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
//private int NUM_KNN = 2000;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
//private String PATH_TO_RESOURCES = "R/";
/////////////////////////////
// Private Member Variables
@ -115,7 +109,7 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
for( final VariantContext vc : tracker.getAllVariantContexts(null, context.getLocation(), false, false) ) {
if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) {
if( vc != null && vc.isSNP() && (IGNORE_INPUT_FILTERS || !vc.isFiltered()) ) {
if( firstVariant ) { // This is the first variant encountered so set up the list of annotations
annotationKeys.addAll( vc.getAttributes().keySet() );
if( annotationKeys.contains("ID") ) { annotationKeys.remove("ID"); } // ID field is added to the vc's INFO field?
@ -203,21 +197,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
}
theModel.run( CLUSTER_FILENAME );
// Functionality moved to different walker
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
//System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
//try {
// Runtime.getRuntime().exec( rScriptCommandLine );
//} catch ( IOException e ) {
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
//}
}
}

View File

@ -56,7 +56,7 @@ public class QualityUtils {
}
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
* Convert a probability to a quality score. Note, this is capped at a quality score which is determined by _eps_.
*
* @param prob a probability (0.0-1.0)
* @param eps min probabilty allowed (0.0-1.0)