Variant optimizer now outputs VCF files via ApplyVariantClustersWalker. Documentation to be added to the wiki. It is ready to be used by other people but only with great caution.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-03-17 20:41:42 +00:00
parent d9398dc347
commit 58a31bab6a
7 changed files with 145 additions and 73 deletions

View File

@ -11,13 +11,14 @@ data = read.table(input,sep=",",head=T)
maxVars = max(data$numKnown, data$numNovel) maxVars = max(data$numKnown, data$numNovel)
maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV) maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV)
minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV) minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV)
maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0])
outfile = paste(input, ".optimizationCurve.pdf", sep="") outfile = paste(input, ".optimizationCurve.pdf", sep="")
pdf(outfile, height=7, width=8) pdf(outfile, height=7, width=8)
par(mar=c(4,4,1,4),cex=1.3) par(mar=c(4,4,1,4),cex=1.3)
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with p(true) > X",ylab="",ylim=c(minTITV,maxTITV),col="Blue",pch=20) plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20)
points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20) points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20)
abline(h=targetTITV,lty=3,col="Blue") abline(h=targetTITV,lty=3,col="Blue")
axis(side=2,col="DarkBlue") axis(side=2,col="DarkBlue")
@ -25,7 +26,7 @@ axis(side=1)
mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4) mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4)
legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7) legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7)
par(new=T) par(new=T)
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),col="Green",pch=20) plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20)
points(data$pCut, data$numNovel,col="DarkGreen",pch=20) points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
axis(side=4,col="DarkGreen") axis(side=4,col="DarkGreen")
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4) mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)

View File

@ -3,10 +3,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.RodVCF;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.ExpandingArrayList; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
/* /*
* Copyright (c) 2010 The Broad Institute * Copyright (c) 2010 The Broad Institute
@ -47,8 +56,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
///////////////////////////// /////////////////////////////
// Command Line Arguments // Command Line Arguments
///////////////////////////// /////////////////////////////
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false) @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
//private double TARGET_TITV = 2.1; private double TARGET_TITV = 2.1;
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false) @Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
private int DESIRED_NUM_VARIANTS = 0; private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
@ -67,7 +76,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
///////////////////////////// /////////////////////////////
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>(); private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
// //
@ -78,7 +88,6 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public void initialize() { public void initialize() {
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
VariantOptimizationModel theModel = null;
switch (OPTIMIZATION_MODEL) { switch (OPTIMIZATION_MODEL) {
case GAUSSIAN_MIXTURE_MODEL: case GAUSSIAN_MIXTURE_MODEL:
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME ); theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
@ -89,6 +98,18 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
default: default:
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" ); throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
} }
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -99,6 +120,35 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>(); final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
return mapList;
}
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
if( rod != null && rod instanceof RodVCF ) {
final RodVCF rodVCF = ((RodVCF) rod);
if( rodVCF.isSNP() && (IGNORE_INPUT_FILTERS || !rodVCF.isFiltered()) ) {
final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual() );
final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) );
rodVCF.mCurrentRecord.addInfoField("OQ", ((Double)rodVCF.getQual()).toString() );
rodVCF.mCurrentRecord.setQual( recalQual );
vcfWriter.addRecord( rodVCF.mCurrentRecord );
VariantDatum variantDatum = new VariantDatum();
variantDatum.isTransition = BaseUtils.isTransition((byte)rodVCF.getAlternativeBaseForSNP(), (byte)rodVCF.getReferenceForSNP()); //vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
variantDatum.isKnown = !rodVCF.isNovel(); //!vc.getAttribute("ID").equals(".");
variantDatum.qual = recalQual;
mapList.add( variantDatum );
} else { // not a SNP or is filtered so just dump it out to the VCF file
vcfWriter.addRecord( rodVCF.mCurrentRecord );
}
}
}
return mapList; return mapList;
} }
@ -119,7 +169,22 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) { public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys ); final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
try {
Runtime.getRuntime().exec( rScriptCommandLine );
} catch ( IOException e ) {
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
}
} }
} }

View File

@ -4,8 +4,6 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.ArrayList;
import java.util.regex.Pattern;
/* /*
* Copyright (c) 2010 The Broad Institute * Copyright (c) 2010 The Broad Institute
@ -43,9 +41,9 @@ public class VariantDataManager {
public final int numVariants; public final int numVariants;
public final int numAnnotations; public final int numAnnotations;
public final double[] meanVector; public final double[] meanVector;
public final double[] varianceVector; public final double[] varianceVector; // This is really the standard deviation
public boolean isNormalized; public boolean isNormalized;
private final ExpandingArrayList<String> annotationKeys; public final ExpandingArrayList<String> annotationKeys;
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) { public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
numVariants = dataList.size(); numVariants = dataList.size();
@ -53,7 +51,7 @@ public class VariantDataManager {
if( numVariants <= 0 ) { if( numVariants <= 0 ) {
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" ); throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
} }
numAnnotations = data[0].annotations.length; numAnnotations = _annotationKeys.size() + 1; // +1 for QUAL
if( numAnnotations <= 0 ) { if( numAnnotations <= 0 ) {
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" ); throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
} }
@ -91,7 +89,7 @@ public class VariantDataManager {
throw new StingException("Zero variance is a problem: standard deviation = " + theSTD); throw new StingException("Zero variance is a problem: standard deviation = " + theSTD);
} }
meanVector[jjj] = theMean; meanVector[jjj] = theMean;
varianceVector[jjj] = theSTD * theSTD; varianceVector[jjj] = theSTD;
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD; data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
} }

View File

@ -35,5 +35,5 @@ public class VariantDatum {
public double[] annotations; public double[] annotations;
public boolean isTransition; public boolean isTransition;
public boolean isKnown; public boolean isKnown;
public double pTrue; public double qual;
} }

View File

@ -7,7 +7,7 @@ import org.broadinstitute.sting.utils.xReadLines;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.ArrayList; import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -44,7 +44,7 @@ import java.util.regex.Pattern;
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel { public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
private final VariantDataManager dataManager; public final VariantDataManager dataManager;
private final int numGaussians; private final int numGaussians;
private final int numIterations; private final int numIterations;
private final long RANDOM_SEED = 91801305; private final long RANDOM_SEED = 91801305;
@ -59,9 +59,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
private final int[] numMaxClusterNovel; private final int[] numMaxClusterNovel;
private final double[] clusterTITV; private final double[] clusterTITV;
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
//private final int desiredNumVariants;
private final int minVarInCluster; private final int minVarInCluster;
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*"); private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*"); private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
@ -78,7 +78,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterNovel = new int[numGaussians]; numMaxClusterNovel = new int[numGaussians];
clusterTITV = new double[numGaussians]; clusterTITV = new double[numGaussians];
clusterTruePositiveRate = new double[numGaussians]; clusterTruePositiveRate = new double[numGaussians];
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = _minVarInCluster; minVarInCluster = _minVarInCluster;
} }
@ -107,14 +106,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
numMaxClusterKnown = null; numMaxClusterKnown = null;
numMaxClusterNovel = null; numMaxClusterNovel = null;
clusterTITV = null; clusterTITV = null;
clusterTruePositiveRate = null;
//desiredNumVariants = _desiredNumVariants;
minVarInCluster = 0; minVarInCluster = 0;
numGaussians = clusterLines.size(); numGaussians = clusterLines.size();
mu = new double[numGaussians][]; mu = new double[numGaussians][dataManager.numAnnotations];
sigma = new double[numGaussians][]; sigma = new double[numGaussians][dataManager.numAnnotations];
clusterTruePositiveRate = new double[numGaussians];
int kkk = 0;
for( String line : clusterLines ) {
String[] vals = line.split(",");
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[4]);
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
mu[kkk][jjj] = Double.parseDouble(vals[5+jjj]);
sigma[kkk][jjj] = Double.parseDouble(vals[5+dataManager.numAnnotations+jjj]);
}
kkk++;
}
System.out.println("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
} }
public final void run( final String clusterFileName ) { public final void run( final String clusterFileName ) {
@ -278,27 +288,45 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
/* public final double evaluateVariant( final Map<String,String> annotationMap, final double qualityScore ) {
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) { final double[] pVarInCluster = new double[numGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
double value = 0.0;
if( dataManager.annotationKeys.get(jjj).equals("QUAL") ) {
value = qualityScore;
} else {
try {
final String stringValue = annotationMap.get( dataManager.annotationKeys.get(jjj) );
if( stringValue != null ) {
value = Double.parseDouble( stringValue );
if( Double.isInfinite(value) ) {
value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
}
}
} catch( NumberFormatException e ) {
// do nothing, default value is 0.0
}
}
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
}
evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
double sum = 0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
}
return sum;
}
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
final int numVariants = data.length; final int numVariants = data.length;
final double[] pTrueVariant = new double[numVariants];
final boolean[] markedVariant = new boolean[numVariants]; final boolean[] markedVariant = new boolean[numVariants];
final double[] pVarInCluster = new double[numGaussians];
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for( int iii = 0; iii < numVariants; iii++ ) {
evaluateGaussiansForSingleVariant( data[iii], pVarInCluster );
pTrueVariant[iii] = 0.0;
markedVariant[iii] = false;
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
}
}
PrintStream outputFile = null; PrintStream outputFile = null;
try { try {
@ -316,10 +344,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
int numNovelTv = 0; int numNovelTv = 0;
boolean foundDesiredNumVariants = false; boolean foundDesiredNumVariants = false;
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV"); outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) { for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) {
for( int iii = 0; iii < numVariants; iii++ ) { for( int iii = 0; iii < numVariants; iii++ ) {
if( !markedVariant[iii] ) { if( !markedVariant[iii] ) {
if( pTrueVariant[iii] >= pCut ) { if( data[iii].qual >= pCut ) {
markedVariant[iii] = true; markedVariant[iii] = true;
if( data[iii].isKnown ) { // known if( data[iii].isKnown ) { // known
numKnown++; numKnown++;
@ -340,7 +368,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
} }
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) { if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " ); System.out.println( "Keeping variants with p(true) >= " + String.format("%.1f",pCut) + " results in a filtered set with: " );
System.out.println("\t" + numKnown + " known variants"); System.out.println("\t" + numKnown + " known variants");
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)"); System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv))); System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
@ -353,7 +381,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) ))); ( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
} }
} }
*/
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) { private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
@ -386,18 +413,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
private void evaluateGaussiansForSingleVariant( final VariantDatum datum, final double[] pVarInCluster ) { private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) {
final int numAnnotations = datum.annotations.length; final int numAnnotations = annotations.length;
double sumProb = 0.0; double sumProb = 0.0;
for( int kkk = 0; kkk < numGaussians; kkk++ ) { for( int kkk = 0; kkk < numGaussians; kkk++ ) {
double sum = 0.0; double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) { for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) ) sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) )
/ sigma[kkk][jjj]; / sigma[kkk][jjj];
} }
pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum ); //BUGBUG: removed pCluster[kkk]*, for the second pass
//pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
pVarInCluster[kkk] = Math.exp( -0.5 * sum );
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
pVarInCluster[kkk] = MIN_PROB; pVarInCluster[kkk] = MIN_PROB;

View File

@ -53,8 +53,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
///////////////////////////// /////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true) @Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
private double TARGET_TITV = 2.1; private double TARGET_TITV = 2.1;
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
//private int DESIRED_NUM_VARIANTS = 0;
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false) @Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
private boolean IGNORE_INPUT_FILTERS = false; private boolean IGNORE_INPUT_FILTERS = false;
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false) @Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
@ -74,10 +72,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
//private int NUM_KNN = 2000; //private int NUM_KNN = 2000;
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false) //@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL; private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
//private String PATH_TO_RESOURCES = "R/";
///////////////////////////// /////////////////////////////
// Private Member Variables // Private Member Variables
@ -203,21 +197,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
} }
theModel.run( CLUSTER_FILENAME ); theModel.run( CLUSTER_FILENAME );
// Functionality moved to different walker
// Execute Rscript command to plot the optimization curve
// Print out the command line to make it clear to the user what is being executed and how one might modify it
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
//System.out.println( rScriptCommandLine );
// Execute the RScript command to plot the table of truth values
//try {
// Runtime.getRuntime().exec( rScriptCommandLine );
//} catch ( IOException e ) {
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
//}
} }
} }

View File

@ -56,7 +56,7 @@ public class QualityUtils {
} }
/** /**
* Convert a probability to a quality score. Note, this is capped at Q40. * Convert a probability to a quality score. Note, this is capped at a quality score which is determined by _eps_.
* *
* @param prob a probability (0.0-1.0) * @param prob a probability (0.0-1.0)
* @param eps min probabilty allowed (0.0-1.0) * @param eps min probabilty allowed (0.0-1.0)