Variant optimizer now outputs VCF files via ApplyVariantClustersWalker. Documentation to be added to the wiki. It is ready to be used by other people but only with great caution.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3028 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d9398dc347
commit
58a31bab6a
|
|
@ -11,13 +11,14 @@ data = read.table(input,sep=",",head=T)
|
||||||
maxVars = max(data$numKnown, data$numNovel)
|
maxVars = max(data$numKnown, data$numNovel)
|
||||||
maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV)
|
maxTITV = max(data$knownTITV[is.finite(data$knownTITV) & data$numKnown>2000], data$novelTITV[is.finite(data$novelTITV) & data$numNovel > 2000], targetTITV)
|
||||||
minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV)
|
minTITV = min(data$knownTITV[length(data$knownTITV)], data$novelTITV[length(data$novelTITV)], targetTITV)
|
||||||
|
maxPCut = max(data$pCut[data$numKnown>0 | data$numNovel>0])
|
||||||
|
|
||||||
|
|
||||||
outfile = paste(input, ".optimizationCurve.pdf", sep="")
|
outfile = paste(input, ".optimizationCurve.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=8)
|
pdf(outfile, height=7, width=8)
|
||||||
|
|
||||||
par(mar=c(4,4,1,4),cex=1.3)
|
par(mar=c(4,4,1,4),cex=1.3)
|
||||||
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with p(true) > X",ylab="",ylim=c(minTITV,maxTITV),col="Blue",pch=20)
|
plot(data$pCut, data$knownTITV, axes=F,xlab="Keep variants with QUAL >= X",ylab="",ylim=c(minTITV,maxTITV),xlim=c(0,maxPCut),col="Blue",pch=20)
|
||||||
points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20)
|
points(data$pCut, data$novelTITV,,col="DarkBlue",pch=20)
|
||||||
abline(h=targetTITV,lty=3,col="Blue")
|
abline(h=targetTITV,lty=3,col="Blue")
|
||||||
axis(side=2,col="DarkBlue")
|
axis(side=2,col="DarkBlue")
|
||||||
|
|
@ -25,7 +26,7 @@ axis(side=1)
|
||||||
mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4)
|
mtext("Ti/Tv Ratio", side=2, line=2, col="blue",cex=1.4)
|
||||||
legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7)
|
legend("left", c("Known Ti/Tv","Novel Ti/Tv"), col=c("Blue","DarkBlue"), pch=c(20,20),cex=0.7)
|
||||||
par(new=T)
|
par(new=T)
|
||||||
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),col="Green",pch=20)
|
plot(data$pCut, data$numKnown, axes=F,xlab="",ylab="",ylim=c(0,maxVars),xlim=c(0,maxPCut),col="Green",pch=20)
|
||||||
points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
|
points(data$pCut, data$numNovel,col="DarkGreen",pch=20)
|
||||||
axis(side=4,col="DarkGreen")
|
axis(side=4,col="DarkGreen")
|
||||||
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
|
mtext("Number of Variants", side=4, line=2, col="DarkGreen",cex=1.4)
|
||||||
|
|
|
||||||
|
|
@ -3,10 +3,19 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantoptimizer;
|
||||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RodVCF;
|
||||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
import org.broadinstitute.sting.utils.ExpandingArrayList;
|
import org.broadinstitute.sting.utils.*;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
|
@ -47,8 +56,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Command Line Arguments
|
// Command Line Arguments
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
//@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=false)
|
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
||||||
//private double TARGET_TITV = 2.1;
|
private double TARGET_TITV = 2.1;
|
||||||
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
||||||
private int DESIRED_NUM_VARIANTS = 0;
|
private int DESIRED_NUM_VARIANTS = 0;
|
||||||
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||||
|
|
@ -67,7 +76,8 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
private final ExpandingArrayList<String> annotationKeys = new ExpandingArrayList<String>();
|
private VariantGaussianMixtureModel theModel = null;
|
||||||
|
private VCFWriter vcfWriter;
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
|
|
@ -78,7 +88,6 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||||
|
|
||||||
VariantOptimizationModel theModel = null;
|
|
||||||
switch (OPTIMIZATION_MODEL) {
|
switch (OPTIMIZATION_MODEL) {
|
||||||
case GAUSSIAN_MIXTURE_MODEL:
|
case GAUSSIAN_MIXTURE_MODEL:
|
||||||
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
|
theModel = new VariantGaussianMixtureModel( CLUSTER_FILENAME );
|
||||||
|
|
@ -89,6 +98,18 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
default:
|
default:
|
||||||
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
throw new StingException( "Variant Optimization Model is unrecognized. Implemented options are GAUSSIAN_MIXTURE_MODEL and K_NEAREST_NEIGHBORS" );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// setup the header fields
|
||||||
|
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||||
|
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
|
||||||
|
hInfo.add(new VCFInfoHeaderLine("OQ", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "The original variant quality score"));
|
||||||
|
hInfo.add(new VCFHeaderLine("source", "VariantOptimizer"));
|
||||||
|
vcfWriter = new VCFWriter( new File(OUTPUT_PREFIX + ".vcf") );
|
||||||
|
TreeSet<String> samples = new TreeSet<String>();
|
||||||
|
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
|
||||||
|
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||||
|
vcfWriter.writeHeader(vcfHeader);
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------------------------------------------------------------------------------------------------------------
|
//---------------------------------------------------------------------------------------------------------------
|
||||||
|
|
@ -99,6 +120,35 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
|
|
||||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||||
|
|
||||||
|
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||||
|
return mapList;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for( ReferenceOrderedDatum rod : tracker.getAllRods() ) {
|
||||||
|
if( rod != null && rod instanceof RodVCF ) {
|
||||||
|
final RodVCF rodVCF = ((RodVCF) rod);
|
||||||
|
if( rodVCF.isSNP() && (IGNORE_INPUT_FILTERS || !rodVCF.isFiltered()) ) {
|
||||||
|
final double pTrue = theModel.evaluateVariant( rodVCF.getInfoValues(), rodVCF.getQual() );
|
||||||
|
final double recalQual = QualityUtils.phredScaleErrorRate( Math.max( 1.0 - pTrue, 0.000000001) );
|
||||||
|
rodVCF.mCurrentRecord.addInfoField("OQ", ((Double)rodVCF.getQual()).toString() );
|
||||||
|
rodVCF.mCurrentRecord.setQual( recalQual );
|
||||||
|
vcfWriter.addRecord( rodVCF.mCurrentRecord );
|
||||||
|
|
||||||
|
VariantDatum variantDatum = new VariantDatum();
|
||||||
|
variantDatum.isTransition = BaseUtils.isTransition((byte)rodVCF.getAlternativeBaseForSNP(), (byte)rodVCF.getReferenceForSNP()); //vc.getSNPSubstitutionType().compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||||
|
variantDatum.isKnown = !rodVCF.isNovel(); //!vc.getAttribute("ID").equals(".");
|
||||||
|
variantDatum.qual = recalQual;
|
||||||
|
mapList.add( variantDatum );
|
||||||
|
|
||||||
|
} else { // not a SNP or is filtered so just dump it out to the VCF file
|
||||||
|
vcfWriter.addRecord( rodVCF.mCurrentRecord );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
return mapList;
|
return mapList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -119,7 +169,22 @@ public class ApplyVariantClustersWalker extends RodWalker<ExpandingArrayList<Var
|
||||||
|
|
||||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||||
|
|
||||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||||
|
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||||
|
|
||||||
|
theModel.outputOptimizationCurve( dataManager.data, OUTPUT_PREFIX, DESIRED_NUM_VARIANTS );
|
||||||
|
|
||||||
|
// Execute Rscript command to plot the optimization curve
|
||||||
|
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||||
|
final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
||||||
|
System.out.println( rScriptCommandLine );
|
||||||
|
|
||||||
|
// Execute the RScript command to plot the table of truth values
|
||||||
|
try {
|
||||||
|
Runtime.getRuntime().exec( rScriptCommandLine );
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,6 @@ import org.broadinstitute.sting.utils.ExpandingArrayList;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2010 The Broad Institute
|
* Copyright (c) 2010 The Broad Institute
|
||||||
|
|
@ -43,9 +41,9 @@ public class VariantDataManager {
|
||||||
public final int numVariants;
|
public final int numVariants;
|
||||||
public final int numAnnotations;
|
public final int numAnnotations;
|
||||||
public final double[] meanVector;
|
public final double[] meanVector;
|
||||||
public final double[] varianceVector;
|
public final double[] varianceVector; // This is really the standard deviation
|
||||||
public boolean isNormalized;
|
public boolean isNormalized;
|
||||||
private final ExpandingArrayList<String> annotationKeys;
|
public final ExpandingArrayList<String> annotationKeys;
|
||||||
|
|
||||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
|
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
|
||||||
numVariants = dataList.size();
|
numVariants = dataList.size();
|
||||||
|
|
@ -53,7 +51,7 @@ public class VariantDataManager {
|
||||||
if( numVariants <= 0 ) {
|
if( numVariants <= 0 ) {
|
||||||
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
|
throw new StingException( "There are zero variants! (or possibly a problem with integer overflow)" );
|
||||||
}
|
}
|
||||||
numAnnotations = data[0].annotations.length;
|
numAnnotations = _annotationKeys.size() + 1; // +1 for QUAL
|
||||||
if( numAnnotations <= 0 ) {
|
if( numAnnotations <= 0 ) {
|
||||||
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
|
throw new StingException( "There are zero annotations! (or possibly a problem with integer overflow)" );
|
||||||
}
|
}
|
||||||
|
|
@ -91,7 +89,7 @@ public class VariantDataManager {
|
||||||
throw new StingException("Zero variance is a problem: standard deviation = " + theSTD);
|
throw new StingException("Zero variance is a problem: standard deviation = " + theSTD);
|
||||||
}
|
}
|
||||||
meanVector[jjj] = theMean;
|
meanVector[jjj] = theMean;
|
||||||
varianceVector[jjj] = theSTD * theSTD;
|
varianceVector[jjj] = theSTD;
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||||
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
|
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -35,5 +35,5 @@ public class VariantDatum {
|
||||||
public double[] annotations;
|
public double[] annotations;
|
||||||
public boolean isTransition;
|
public boolean isTransition;
|
||||||
public boolean isKnown;
|
public boolean isKnown;
|
||||||
public double pTrue;
|
public double qual;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ import org.broadinstitute.sting.utils.xReadLines;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.ArrayList;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
|
@ -44,7 +44,7 @@ import java.util.regex.Pattern;
|
||||||
|
|
||||||
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
public final class VariantGaussianMixtureModel extends VariantOptimizationModel implements VariantClusteringModel {
|
||||||
|
|
||||||
private final VariantDataManager dataManager;
|
public final VariantDataManager dataManager;
|
||||||
private final int numGaussians;
|
private final int numGaussians;
|
||||||
private final int numIterations;
|
private final int numIterations;
|
||||||
private final long RANDOM_SEED = 91801305;
|
private final long RANDOM_SEED = 91801305;
|
||||||
|
|
@ -59,9 +59,9 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
private final int[] numMaxClusterNovel;
|
private final int[] numMaxClusterNovel;
|
||||||
private final double[] clusterTITV;
|
private final double[] clusterTITV;
|
||||||
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
private final double[] clusterTruePositiveRate; // The true positive rate implied by the cluster's Ti/Tv ratio
|
||||||
//private final int desiredNumVariants;
|
|
||||||
private final int minVarInCluster;
|
private final int minVarInCluster;
|
||||||
|
|
||||||
|
private static final double INFINITE_ANNOTATION_VALUE = 10000.0;
|
||||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||||
|
|
||||||
|
|
@ -78,7 +78,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
numMaxClusterNovel = new int[numGaussians];
|
numMaxClusterNovel = new int[numGaussians];
|
||||||
clusterTITV = new double[numGaussians];
|
clusterTITV = new double[numGaussians];
|
||||||
clusterTruePositiveRate = new double[numGaussians];
|
clusterTruePositiveRate = new double[numGaussians];
|
||||||
//desiredNumVariants = _desiredNumVariants;
|
|
||||||
minVarInCluster = _minVarInCluster;
|
minVarInCluster = _minVarInCluster;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -107,14 +106,25 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
numMaxClusterKnown = null;
|
numMaxClusterKnown = null;
|
||||||
numMaxClusterNovel = null;
|
numMaxClusterNovel = null;
|
||||||
clusterTITV = null;
|
clusterTITV = null;
|
||||||
clusterTruePositiveRate = null;
|
|
||||||
//desiredNumVariants = _desiredNumVariants;
|
|
||||||
minVarInCluster = 0;
|
minVarInCluster = 0;
|
||||||
|
|
||||||
numGaussians = clusterLines.size();
|
numGaussians = clusterLines.size();
|
||||||
mu = new double[numGaussians][];
|
mu = new double[numGaussians][dataManager.numAnnotations];
|
||||||
sigma = new double[numGaussians][];
|
sigma = new double[numGaussians][dataManager.numAnnotations];
|
||||||
|
clusterTruePositiveRate = new double[numGaussians];
|
||||||
|
|
||||||
|
int kkk = 0;
|
||||||
|
for( String line : clusterLines ) {
|
||||||
|
String[] vals = line.split(",");
|
||||||
|
clusterTruePositiveRate[kkk] = Double.parseDouble(vals[4]);
|
||||||
|
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||||
|
mu[kkk][jjj] = Double.parseDouble(vals[5+jjj]);
|
||||||
|
sigma[kkk][jjj] = Double.parseDouble(vals[5+dataManager.numAnnotations+jjj]);
|
||||||
|
}
|
||||||
|
kkk++;
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Found " + numGaussians + " clusters and using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final void run( final String clusterFileName ) {
|
public final void run( final String clusterFileName ) {
|
||||||
|
|
@ -278,27 +288,45 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
public final double evaluateVariant( final Map<String,String> annotationMap, final double qualityScore ) {
|
||||||
public final void applyClusters( final VariantDatum[] data, final String outputPrefix ) {
|
final double[] pVarInCluster = new double[numGaussians];
|
||||||
|
final double[] annotations = new double[dataManager.numAnnotations];
|
||||||
|
|
||||||
|
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||||
|
double value = 0.0;
|
||||||
|
if( dataManager.annotationKeys.get(jjj).equals("QUAL") ) {
|
||||||
|
value = qualityScore;
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
final String stringValue = annotationMap.get( dataManager.annotationKeys.get(jjj) );
|
||||||
|
if( stringValue != null ) {
|
||||||
|
value = Double.parseDouble( stringValue );
|
||||||
|
if( Double.isInfinite(value) ) {
|
||||||
|
value = ( value > 0 ? 1.0 : -1.0 ) * INFINITE_ANNOTATION_VALUE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch( NumberFormatException e ) {
|
||||||
|
// do nothing, default value is 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
||||||
|
}
|
||||||
|
|
||||||
|
evaluateGaussiansForSingleVariant( annotations, pVarInCluster );
|
||||||
|
|
||||||
|
double sum = 0;
|
||||||
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
|
sum += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void outputOptimizationCurve( final VariantDatum[] data, final String outputPrefix, final int desiredNumVariants ) {
|
||||||
|
|
||||||
final int numVariants = data.length;
|
final int numVariants = data.length;
|
||||||
|
|
||||||
final double[] pTrueVariant = new double[numVariants];
|
|
||||||
final boolean[] markedVariant = new boolean[numVariants];
|
final boolean[] markedVariant = new boolean[numVariants];
|
||||||
final double[] pVarInCluster = new double[numGaussians];
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Evaluate each variant using the probability of being in each cluster and that cluster's true positive rate
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
|
||||||
evaluateGaussiansForSingleVariant( data[iii], pVarInCluster );
|
|
||||||
|
|
||||||
pTrueVariant[iii] = 0.0;
|
|
||||||
markedVariant[iii] = false;
|
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
|
||||||
pTrueVariant[iii] += pVarInCluster[kkk] * clusterTruePositiveRate[kkk];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PrintStream outputFile = null;
|
PrintStream outputFile = null;
|
||||||
try {
|
try {
|
||||||
|
|
@ -316,10 +344,10 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
int numNovelTv = 0;
|
int numNovelTv = 0;
|
||||||
boolean foundDesiredNumVariants = false;
|
boolean foundDesiredNumVariants = false;
|
||||||
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
outputFile.println("pCut,numKnown,numNovel,knownTITV,novelTITV");
|
||||||
for( double pCut = 1.0; pCut >= 0.0; pCut -= 0.001 ) {
|
for( double pCut = 100.0; pCut >= 0.0; pCut -= 0.1 ) {
|
||||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||||
if( !markedVariant[iii] ) {
|
if( !markedVariant[iii] ) {
|
||||||
if( pTrueVariant[iii] >= pCut ) {
|
if( data[iii].qual >= pCut ) {
|
||||||
markedVariant[iii] = true;
|
markedVariant[iii] = true;
|
||||||
if( data[iii].isKnown ) { // known
|
if( data[iii].isKnown ) { // known
|
||||||
numKnown++;
|
numKnown++;
|
||||||
|
|
@ -340,7 +368,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
if( desiredNumVariants != 0 && !foundDesiredNumVariants && (numKnown + numNovel) >= desiredNumVariants ) {
|
||||||
System.out.println( "Keeping variants with p(true) >= " + String.format("%.3f",pCut) + " results in a filtered set with: " );
|
System.out.println( "Keeping variants with p(true) >= " + String.format("%.1f",pCut) + " results in a filtered set with: " );
|
||||||
System.out.println("\t" + numKnown + " known variants");
|
System.out.println("\t" + numKnown + " known variants");
|
||||||
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
System.out.println("\t" + numNovel + " novel variants, (dbSNP rate = " + String.format("%.2f",((double) numKnown * 100.0) / ((double) numKnown + numNovel) ) + "%)");
|
||||||
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
System.out.println("\t" + String.format("%.4f known Ti/Tv ratio", ((double)numKnownTi) / ((double)numKnownTv)));
|
||||||
|
|
@ -353,7 +381,6 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
|
( numNovelTi ==0 || numNovelTv == 0 ? "NaN" : ( ((double)numNovelTi) / ((double)numNovelTv) )));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
private void evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster ) {
|
||||||
|
|
@ -386,18 +413,20 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void evaluateGaussiansForSingleVariant( final VariantDatum datum, final double[] pVarInCluster ) {
|
private void evaluateGaussiansForSingleVariant( final double[] annotations, final double[] pVarInCluster ) {
|
||||||
|
|
||||||
final int numAnnotations = datum.annotations.length;
|
final int numAnnotations = annotations.length;
|
||||||
|
|
||||||
double sumProb = 0.0;
|
double sumProb = 0.0;
|
||||||
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
for( int kkk = 0; kkk < numGaussians; kkk++ ) {
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||||
sum += ( (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]) )
|
sum += ( (annotations[jjj] - mu[kkk][jjj]) * (annotations[jjj] - mu[kkk][jjj]) )
|
||||||
/ sigma[kkk][jjj];
|
/ sigma[kkk][jjj];
|
||||||
}
|
}
|
||||||
pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
//BUGBUG: removed pCluster[kkk]*, for the second pass
|
||||||
|
//pVarInCluster[kkk] = pCluster[kkk] * Math.exp( -0.5 * sum );
|
||||||
|
pVarInCluster[kkk] = Math.exp( -0.5 * sum );
|
||||||
|
|
||||||
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
|
if( pVarInCluster[kkk] < MIN_PROB) { // Very small numbers are a very big problem
|
||||||
pVarInCluster[kkk] = MIN_PROB;
|
pVarInCluster[kkk] = MIN_PROB;
|
||||||
|
|
|
||||||
|
|
@ -53,8 +53,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
@Argument(fullName="target_titv", shortName="titv", doc="The target Ti/Tv ratio towards which to optimize. (~~2.2 for whole genome experiments)", required=true)
|
||||||
private double TARGET_TITV = 2.1;
|
private double TARGET_TITV = 2.1;
|
||||||
//@Argument(fullName="desired_num_variants", shortName="dV", doc="The desired number of variants to keep in a theoretically filtered set", required=false)
|
|
||||||
//private int DESIRED_NUM_VARIANTS = 0;
|
|
||||||
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
@Argument(fullName="ignore_input_filters", shortName="ignoreFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||||
private boolean IGNORE_INPUT_FILTERS = false;
|
private boolean IGNORE_INPUT_FILTERS = false;
|
||||||
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
|
@Argument(fullName="exclude_annotation", shortName="exclude", doc="The names of the annotations which should be excluded from the calculations", required=false)
|
||||||
|
|
@ -74,10 +72,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
//private int NUM_KNN = 2000;
|
//private int NUM_KNN = 2000;
|
||||||
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
//@Argument(fullName = "optimization_model", shortName = "om", doc = "Optimization calculation model to employ -- GAUSSIAN_MIXTURE_MODEL is currently the default, while K_NEAREST_NEIGHBORS is also available for small callsets.", required = false)
|
||||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||||
//@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is probably /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
|
||||||
//private String PATH_TO_RSCRIPT = "/broad/tools/apps/R-2.6.0/bin/Rscript";
|
|
||||||
//@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
|
|
||||||
//private String PATH_TO_RESOURCES = "R/";
|
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Private Member Variables
|
// Private Member Variables
|
||||||
|
|
@ -203,21 +197,6 @@ public class VariantOptimizer extends RodWalker<ExpandingArrayList<VariantDatum>
|
||||||
}
|
}
|
||||||
|
|
||||||
theModel.run( CLUSTER_FILENAME );
|
theModel.run( CLUSTER_FILENAME );
|
||||||
|
|
||||||
|
|
||||||
// Functionality moved to different walker
|
|
||||||
|
|
||||||
// Execute Rscript command to plot the optimization curve
|
|
||||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
|
||||||
//final String rScriptCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_OptimizationCurve.R" + " " + OUTPUT_PREFIX + ".dat" + " " + TARGET_TITV;
|
|
||||||
//System.out.println( rScriptCommandLine );
|
|
||||||
|
|
||||||
// Execute the RScript command to plot the table of truth values
|
|
||||||
//try {
|
|
||||||
// Runtime.getRuntime().exec( rScriptCommandLine );
|
|
||||||
//} catch ( IOException e ) {
|
|
||||||
// throw new StingException( "Unable to execute RScript command: " + rScriptCommandLine );
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ public class QualityUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
* Convert a probability to a quality score. Note, this is capped at a quality score which is determined by _eps_.
|
||||||
*
|
*
|
||||||
* @param prob a probability (0.0-1.0)
|
* @param prob a probability (0.0-1.0)
|
||||||
* @param eps min probabilty allowed (0.0-1.0)
|
* @param eps min probabilty allowed (0.0-1.0)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue