Initial commit of VQSRv2 that passes the old integration tests. Not ready to be used yet unless your name rhymes with ... oh wait, that's me.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5419 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ccc773d175
commit
b3464a6031
|
|
@ -0,0 +1,202 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Jun 2, 2010
|
||||
*
|
||||
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*/
|
||||
|
||||
public class ApplyVariantCutsV2 extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output( doc="The output filtered VCF file", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
|
||||
private double FDR_FILTER_LEVEL = 0.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
for ( Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
|
||||
if ( t.fdr >= FDR_FILTER_LEVEL) {
|
||||
tranches.add(t);
|
||||
//statusMsg = "Keeping, above FDR threshold";
|
||||
}
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
|
||||
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
Tranche t = tranches.get(iii);
|
||||
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("FDR tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
|
||||
}
|
||||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at VQS Lod < " + tranches.get(0).minVQSLod)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( vc != null ) {
|
||||
String filterString = null;
|
||||
if( !vc.isFiltered() ) {
|
||||
try {
|
||||
final double lod = vc.getAttributeAsDouble(VariantRecalibratorV2.VQS_LOD_KEY);
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
if (i == tranches.size() - 1) {
|
||||
filterString = VCFConstants.PASSES_FILTERS_v4;
|
||||
} else {
|
||||
filterString = tranche.name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( filterString == null ) {
|
||||
filterString = tranches.get(0).name+"+";
|
||||
}
|
||||
|
||||
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
|
||||
Set<String> filters = new HashSet<String>();
|
||||
filters.add(filterString);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedFile(vc.getSource(), "Invalid value for VQS key " + VariantRecalibratorV2.VQS_LOD_KEY + " at variant " + vc, e);
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( vc, ref.getBase() );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer reduceSum ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,234 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import Jama.Matrix;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
public class GaussianMixtureModel {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(GaussianMixtureModel.class);
|
||||
|
||||
private final ArrayList<MultivariateGaussian> gaussians;
|
||||
private final double shrinkage;
|
||||
private final double dirichletParameter;
|
||||
private final double[] empiricalMu; // BUGBUG: move these to VariantData class
|
||||
private final Matrix empiricalSigma;
|
||||
public boolean isModelReadyForEvaluation;
|
||||
|
||||
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
|
||||
final double shrinkage, final double dirichletParameter ) {
|
||||
|
||||
gaussians = new ArrayList<MultivariateGaussian>( numGaussians );
|
||||
for( int iii = 0; iii < numGaussians; iii++ ) {
|
||||
final MultivariateGaussian gaussian = new MultivariateGaussian( numAnnotations );
|
||||
gaussians.add( gaussian );
|
||||
}
|
||||
this.shrinkage = shrinkage;
|
||||
this.dirichletParameter = dirichletParameter;
|
||||
empiricalMu = new double[numAnnotations];
|
||||
empiricalSigma = new Matrix(numAnnotations, numAnnotations);
|
||||
isModelReadyForEvaluation = false;
|
||||
}
|
||||
|
||||
public GaussianMixtureModel( final List<String> annotationLines, final List<String> gaussianLines ) { //BUGBUG: recreated here to match the integration tests
|
||||
|
||||
gaussians = new ArrayList<MultivariateGaussian>( gaussianLines.size() );
|
||||
for( final String line : gaussianLines ) {
|
||||
final MultivariateGaussian gaussian = new MultivariateGaussian( annotationLines.size() );
|
||||
final String[] vals = line.split(",");
|
||||
gaussian.pMixtureLog10 = Math.log10( Double.parseDouble(vals[1]) );
|
||||
for( int iii = 0; iii < annotationLines.size(); iii++ ) {
|
||||
gaussian.mu[iii] = Double.parseDouble(vals[2+iii]); //BUGBUG: recreated here to match the integration tests
|
||||
for( int jjj = 0; jjj < annotationLines.size(); jjj++ ) {
|
||||
gaussian.sigma.set(iii, jjj, Double.parseDouble(vals[2+annotationLines.size()+(iii*annotationLines.size())+jjj]) * 1.3); // BUGBUG: VRAC backOff, or get rid of this completely!?
|
||||
}
|
||||
}
|
||||
gaussians.add( gaussian );
|
||||
}
|
||||
|
||||
this.shrinkage = 0.0; // not used when evaluating data, BUGBUG: move this to VariantData class
|
||||
this.dirichletParameter = 0.0; // not used when evaluating data
|
||||
empiricalMu = null; // not used when evaluating data
|
||||
empiricalSigma = null; // not used when evaluating data
|
||||
isModelReadyForEvaluation = false;
|
||||
}
|
||||
|
||||
public void cacheEmpiricalStats( final List<VariantDatum> data ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
for( int jjj = 0; jjj < empiricalMu.length; jjj++ ) {
|
||||
empiricalMu[jjj] += datum.annotations[jjj] / ((double) data.size());
|
||||
}
|
||||
}
|
||||
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, Matrix.identity(empiricalMu.length, empiricalMu.length)); // BUGBUG: why does the identity matrix work best here?
|
||||
// is it because of a bug in the old implementation in which std>X variants were still counted in this calculation??
|
||||
}
|
||||
|
||||
public void initializeRandomModel( final List<VariantDatum> data, final Random rand ) {
|
||||
|
||||
// initialize random Gaussian means // BUGBUG: this is broken up this way to match the order of calls to rand.nextDouble() in the old code
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.initializeRandomMu( rand );
|
||||
}
|
||||
|
||||
// initialize means using K-means algorithm
|
||||
initializeMeansUsingKMeans( data, 60, rand ); // BUGBUG: a VRAC argument?
|
||||
|
||||
// initialize uniform mixture coefficients, random covariance matrices, and initial hyperparameters
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.pMixtureLog10 = Math.log10( 1.0 / ((double) gaussians.size()) );
|
||||
gaussian.initializeRandomSigma( rand );
|
||||
gaussian.hyperParameter_a = gaussian.mu.length;
|
||||
gaussian.hyperParameter_b = shrinkage;
|
||||
gaussian.hyperParameter_lambda = dirichletParameter;
|
||||
}
|
||||
}
|
||||
|
||||
private void initializeMeansUsingKMeans( final List<VariantDatum> data, final int numIterations, final Random rand ) {
|
||||
|
||||
int ttt = 0;
|
||||
while( ttt++ < numIterations ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
MultivariateGaussian minGaussian = null;
|
||||
datum.assignment = minGaussian;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
final double dist = gaussian.calculateDistanceFromMeanSquared( datum );
|
||||
if(dist < minDistance) {
|
||||
minDistance = dist;
|
||||
minGaussian = gaussian;
|
||||
}
|
||||
}
|
||||
datum.assignment = minGaussian;
|
||||
}
|
||||
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.zeroOutMu();
|
||||
int numAssigned = 0;
|
||||
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.assignment == gaussian ) {
|
||||
numAssigned++;
|
||||
gaussian.incrementMu( datum );
|
||||
}
|
||||
}
|
||||
if(numAssigned != 0) {
|
||||
gaussian.divideEqualsMu( ((double) numAssigned) );
|
||||
} else {
|
||||
gaussian.initializeRandomMu( rand );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public double expectationStep( final List<VariantDatum> data ) {
|
||||
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.precomputeDenominatorForVariationalBayes( getSumHyperParameterLambda() );
|
||||
}
|
||||
|
||||
double likelihood = 0.0;
|
||||
for( final VariantDatum datum : data ) {
|
||||
final ArrayList<Double> pVarInGaussianLog10 = new ArrayList<Double>( gaussians.size() );
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
final double pVarLog10 = gaussian.evaluateDatumLog10( datum );
|
||||
pVarInGaussianLog10.add( pVarLog10 );
|
||||
likelihood += Math.pow( 10.0, pVarLog10 ); //BUGBUG: output recreated here to match the integration tests
|
||||
}
|
||||
final double[] pVarInGaussianReals = MathUtils.normalizeFromLog10( pVarInGaussianLog10 );
|
||||
int iii = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.assignPVarInGaussian( pVarInGaussianReals[iii++] ); //BUGBUG: to clean up
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("explained likelihood = " + String.format( "%.5f", likelihood / ((double) data.size()) ));
|
||||
return likelihood / ((double) data.size());
|
||||
}
|
||||
|
||||
public void maximizationStep( final List<VariantDatum> data ) {
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.maximizeGaussian( data, empiricalMu, empiricalSigma, shrinkage, dirichletParameter );
|
||||
}
|
||||
}
|
||||
|
||||
private double getSumHyperParameterLambda() {
|
||||
double sum = 0.0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
sum += gaussian.hyperParameter_lambda;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void output( final PrintStream clusterFile ) { //BUGBUG: output recreated here to match the integration tests
|
||||
normalizePMixtureLog10(); //BUGBUG: output recreated here to match the integration tests
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
if( Math.pow(10.0, gaussian.pMixtureLog10) > 1E-4 ) {
|
||||
final double sigmaVals[][] = gaussian.sigma.getArray();
|
||||
clusterFile.print("@!CLUSTER");
|
||||
clusterFile.print(String.format(",%.8f", Math.pow(10.0, gaussian.pMixtureLog10)));
|
||||
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
|
||||
clusterFile.print(String.format(",%.8f", gaussian.mu[jjj]));
|
||||
}
|
||||
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
|
||||
for(int ppp = 0; ppp < gaussian.mu.length; ppp++ ) {
|
||||
clusterFile.print(String.format(",%.8f", (sigmaVals[jjj][ppp] / gaussian.hyperParameter_a) )); // BUGBUG: this is a bug which should be fixed after passing integration tests
|
||||
}
|
||||
}
|
||||
clusterFile.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void normalizePMixtureLog10() {
|
||||
double sumPK = 0.0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
sumPK += gaussian.pMixtureLog10;
|
||||
}
|
||||
|
||||
int gaussianIndex = 0;
|
||||
double[] pGaussianLog10 = new double[gaussians.size()];
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pGaussianLog10[gaussianIndex++] = Math.log10( gaussian.pMixtureLog10 / sumPK ); //BUGBUG: to clean up
|
||||
}
|
||||
pGaussianLog10 = MathUtils.normalizeFromLog10( pGaussianLog10, true );
|
||||
|
||||
gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.pMixtureLog10 = pGaussianLog10[gaussianIndex++];
|
||||
}
|
||||
}
|
||||
|
||||
public void precomputeDenominatorForEvaluation() {
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.precomputeDenominatorForEvaluation();
|
||||
}
|
||||
|
||||
isModelReadyForEvaluation = true;
|
||||
}
|
||||
|
||||
public double evaluateDatum( final VariantDatum datum ) {
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,250 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*/
|
||||
|
||||
public class GenerateVariantClustersV2 extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
|
||||
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private PrintStream CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
|
||||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
@Argument(fullName="weightNovel", shortName="weightNovel", doc="The weight for novel variants during clustering", required=false)
|
||||
private double WEIGHT_NOVELS = 0.0;
|
||||
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
|
||||
private double WEIGHT_DBSNP = 0.0;
|
||||
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for HapMap variants during clustering", required=false)
|
||||
private double WEIGHT_HAPMAP = 1.0;
|
||||
@Argument(fullName="weight1KG", shortName="weight1KG", doc="The weight for 1000 Genomes Project variants during clustering", required=false)
|
||||
private double WEIGHT_1KG = 1.0;
|
||||
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
|
||||
private boolean FORCE_INDEPENDENT = false;
|
||||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
|
||||
private double STD_THRESHOLD = 4.5;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
|
||||
private double QUAL_THRESHOLD = 100.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private ExpandingArrayList<String> annotationKeys;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
private VariantDataManager dataManager;
|
||||
private VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything at the moment
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||
}
|
||||
|
||||
if( !NO_HEADER_LINE ) {
|
||||
CLUSTER_FILE.print("##GenerateVariantClusters = ");
|
||||
CLUSTER_FILE.println("\"" + getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this) + "\"");
|
||||
}
|
||||
|
||||
boolean foundTruthSet = false;
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals("dbsnp") ) {
|
||||
logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP);
|
||||
if( WEIGHT_DBSNP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP);
|
||||
if( WEIGHT_HAPMAP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG);
|
||||
if( WEIGHT_1KG > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundTruthSet ) {
|
||||
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, or 1kg rod bindings. Clustering weights can be specified using -weightDBSNP, -weightHapMap, and -weight1KG");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
int iii = 0;
|
||||
final double annotationValues[] = new double[annotationKeys.size()];
|
||||
for( final String key : annotationKeys ) {
|
||||
annotationValues[iii++] = VariantDataManager.decodeAnnotation( getToolkit().getGenomeLocParser(), key, vc, true );
|
||||
}
|
||||
|
||||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
double weight = 0.0;
|
||||
weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
weight = WEIGHT_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
weight = WEIGHT_DBSNP;
|
||||
}
|
||||
|
||||
if( weight > 0.0 && vc.getPhredScaledQual() > QUAL_THRESHOLD ) {
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
logger.info( "There are " + reduceSum.size() + " variants with > 0 clustering weight and qual > threshold (--qualThreshold = " + QUAL_THRESHOLD + ")" );
|
||||
logger.info( "The annotations used for clustering are: " + annotationKeys );
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
dataManager.trimDataBySTD( STD_THRESHOLD );
|
||||
final GaussianMixtureModel theModel = engine.generateModel( dataManager.data );
|
||||
dataManager.printClusterFileHeader( CLUSTER_FILE );
|
||||
theModel.output( CLUSTER_FILE );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import Jama.Matrix;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
public class MultivariateGaussian {
|
||||
public double pMixtureLog10;
|
||||
final public double[] mu;
|
||||
final public Matrix sigma;
|
||||
private double cachedDeterminant;
|
||||
public double hyperParameter_a;
|
||||
public double hyperParameter_b;
|
||||
public double hyperParameter_lambda;
|
||||
private double cachedDenomLog10;
|
||||
private Matrix cachedSigmaInverse;
|
||||
final private ExpandingArrayList<Double> pVarInGaussian;
|
||||
|
||||
public MultivariateGaussian( final int numAnnotations ) {
|
||||
mu = new double[numAnnotations];
|
||||
sigma = new Matrix(numAnnotations, numAnnotations);
|
||||
pVarInGaussian = new ExpandingArrayList<Double>();
|
||||
}
|
||||
|
||||
public void zeroOutMu() {
|
||||
Arrays.fill( mu, 0.0 );
|
||||
}
|
||||
|
||||
public void zeroOutSigma() {
|
||||
final double[][] zeroSigma = new double[mu.length][mu.length];
|
||||
for( final double[] row : zeroSigma ) {
|
||||
Arrays.fill(row, 0);
|
||||
}
|
||||
final Matrix tmp = new Matrix(zeroSigma);
|
||||
sigma.setMatrix(0, mu.length - 1, 0, mu.length - 1, tmp);
|
||||
}
|
||||
|
||||
public void initializeRandomMu( final Random rand ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
mu[jjj] = -4.0 + 8.0 * rand.nextDouble();
|
||||
}
|
||||
}
|
||||
|
||||
public void initializeRandomSigma( final Random rand ) {
|
||||
final double[][] randSigma = new double[mu.length][mu.length];
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
for( int jjj = iii; jjj < mu.length; jjj++ ) {
|
||||
randSigma[jjj][iii] = 0.55 + 1.25 * rand.nextDouble();
|
||||
if(rand.nextBoolean()) {
|
||||
randSigma[jjj][iii] *= -1.0;
|
||||
}
|
||||
if(iii != jjj) { randSigma[iii][jjj] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose
|
||||
}
|
||||
}
|
||||
Matrix tmp = new Matrix( randSigma );
|
||||
tmp = tmp.times(tmp.transpose());
|
||||
sigma.setMatrix(0, mu.length - 1, 0, mu.length - 1, tmp);
|
||||
cachedDeterminant = sigma.det();
|
||||
}
|
||||
|
||||
public double calculateDistanceFromMeanSquared( final VariantDatum datum ) {
|
||||
return MathUtils.distanceSquared( datum.annotations, mu );
|
||||
}
|
||||
|
||||
public void incrementMu( final VariantDatum datum ) {
|
||||
incrementMu( datum, 1.0 );
|
||||
}
|
||||
|
||||
public void incrementMu( final VariantDatum datum, final double prob ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
mu[jjj] += prob * datum.annotations[jjj];
|
||||
}
|
||||
}
|
||||
|
||||
public void divideEqualsMu( final double x ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
mu[jjj] /= x;
|
||||
}
|
||||
}
|
||||
|
||||
public void precomputeDenominatorForEvaluation() {
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
cachedDenomLog10 = -1.0 * ( Math.log10(Math.pow(2.0 * Math.PI, ((double) mu.length) / 2.0)) + Math.log10(Math.pow(sigma.det(), 0.5)) );
|
||||
//BUGBUG: This should be determinant of sigma inverse?
|
||||
//BUGBUG: Denom --> constant factor log10
|
||||
}
|
||||
|
||||
public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
|
||||
cachedSigmaInverse = sigma.inverse();
|
||||
cachedSigmaInverse.timesEquals( hyperParameter_a );
|
||||
double sum = 0.0;
|
||||
for(int jjj = 1; jjj < mu.length; jjj++) {
|
||||
sum += MathUtils.diGamma( (hyperParameter_a + 1.0 - jjj) / 2.0 );
|
||||
}
|
||||
sum -= Math.log( cachedDeterminant );
|
||||
sum += Math.log(2.0) * mu.length;
|
||||
final double gamma = 0.5 * sum;
|
||||
final double pi = MathUtils.diGamma( hyperParameter_lambda ) - MathUtils.diGamma( sumHyperParameterLambda );
|
||||
final double beta = (-1.0 * mu.length) / (2.0 * hyperParameter_b);
|
||||
cachedDenomLog10 = (pi / Math.log(10.0)) + (gamma / Math.log(10.0)) + (beta / Math.log(10.0));
|
||||
}
|
||||
|
||||
public double evaluateDatumLog10( final VariantDatum datum ) {
|
||||
double sumKernel = 0.0;
|
||||
final double[] crossProdTmp = new double[mu.length];
|
||||
Arrays.fill(crossProdTmp, 0.0);
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
crossProdTmp[iii] += (datum.annotations[jjj] - mu[jjj]) * cachedSigmaInverse.get(jjj, iii);
|
||||
}
|
||||
}
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
sumKernel += crossProdTmp[iii] * (datum.annotations[iii] - mu[iii]);
|
||||
}
|
||||
|
||||
return (( -0.5 * sumKernel ) / Math.log(10.0)) + cachedDenomLog10; // This is the definition of a Gaussian PDF Log10
|
||||
}
|
||||
|
||||
public void assignPVarInGaussian( final double pVar ) {
|
||||
pVarInGaussian.add( pVar );
|
||||
}
|
||||
|
||||
public void resetPVarInGaussian() {
|
||||
pVarInGaussian.clear();
|
||||
}
|
||||
|
||||
public void maximizeGaussian( final List<VariantDatum> data, final double[] empiricalMu, final Matrix empiricalSigma,
|
||||
final double SHRINKAGE, final double DIRICHLET_PARAMETER ) {
|
||||
double sumProb = 0.0;
|
||||
Matrix wishart = new Matrix(mu.length, mu.length);
|
||||
zeroOutMu();
|
||||
zeroOutSigma();
|
||||
|
||||
int datumIndex = 0;
|
||||
for( final VariantDatum datum : data ) {
|
||||
final double prob = pVarInGaussian.get(datumIndex++);
|
||||
sumProb += prob;
|
||||
incrementMu( datum, prob );
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
mu[iii] = (mu[iii] + SHRINKAGE * empiricalMu[iii]) / (sumProb + SHRINKAGE);
|
||||
}
|
||||
|
||||
final double shrinkageFactor = (SHRINKAGE * sumProb) / (SHRINKAGE + sumProb);
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
wishart.set(iii, jjj, shrinkageFactor * (mu[iii] - empiricalMu[iii]) * (mu[jjj] - empiricalMu[jjj]));
|
||||
}
|
||||
}
|
||||
|
||||
datumIndex = 0;
|
||||
final Matrix pVarSigma = new Matrix(mu.length, mu.length);
|
||||
for( final VariantDatum datum : data ) {
|
||||
final double prob = pVarInGaussian.get(datumIndex++);
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]));
|
||||
}
|
||||
}
|
||||
sigma.plusEquals( pVarSigma );
|
||||
}
|
||||
|
||||
sigma.plusEquals( empiricalSigma );
|
||||
sigma.plusEquals( wishart );
|
||||
cachedDeterminant = sigma.det();
|
||||
|
||||
pMixtureLog10 = sumProb; // will be normalized later by GaussianMixtureModel so no need to do it every iteration
|
||||
|
||||
hyperParameter_a = sumProb + mu.length;
|
||||
hyperParameter_b = sumProb + SHRINKAGE;
|
||||
hyperParameter_lambda = sumProb + DIRICHLET_PARAMETER;
|
||||
|
||||
resetPVarInGaussian(); // clean up some memory
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 10, 2011
|
||||
*/
|
||||
|
||||
public class Tranche implements Comparable<Tranche> {
|
||||
private static final int CURRENT_VERSION = 3;
|
||||
|
||||
public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv;
|
||||
public int numKnown,numNovel;
|
||||
public String name;
|
||||
|
||||
int accessibleTruthSites = 0;
|
||||
int callsAtTruthSites = 0;
|
||||
|
||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) {
|
||||
this(fdr, targetTiTv, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous");
|
||||
}
|
||||
|
||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) {
|
||||
this.fdr = fdr;
|
||||
this.targetTiTv = targetTiTv;
|
||||
this.minVQSLod = minVQSLod;
|
||||
this.novelTiTv = novelTiTv;
|
||||
this.numNovel = numNovel;
|
||||
this.knownTiTv = knownTiTv;
|
||||
this.numKnown = numKnown;
|
||||
this.name = name;
|
||||
|
||||
this.accessibleTruthSites = accessibleTruthSites;
|
||||
this.callsAtTruthSites = callsAtTruthSites;
|
||||
|
||||
if ( fdr < 0.0 )
|
||||
throw new UserException("Target FDR is unreasonable " + fdr);
|
||||
|
||||
if ( targetTiTv < 0.5 || targetTiTv > 10 )
|
||||
throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv);
|
||||
|
||||
if ( numKnown < 0 || numNovel < 0)
|
||||
throw new ReviewedStingException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
|
||||
|
||||
if ( name == null )
|
||||
throw new ReviewedStingException("BUG -- name cannot be null");
|
||||
}
|
||||
|
||||
private double getTruthSensitivity() {
|
||||
return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0;
|
||||
}
|
||||
|
||||
public int compareTo(Tranche other) {
|
||||
return Double.compare(this.fdr, other.fdr);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("Tranche fdr=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]",
|
||||
fdr, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an appropriately formatted string representing the raw tranches file on disk.
|
||||
*
|
||||
* @param rawTranches
|
||||
* @return
|
||||
*/
|
||||
public static String tranchesString(List<Tranche> rawTranches) {
|
||||
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
|
||||
PrintStream stream = new PrintStream(bytes);
|
||||
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
tranches.addAll(rawTranches);
|
||||
Collections.sort(tranches);
|
||||
|
||||
stream.println("# Variant quality score tranches file");
|
||||
stream.println("# Version number " + CURRENT_VERSION);
|
||||
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity");
|
||||
|
||||
Tranche prev = null;
|
||||
for ( Tranche t : tranches ) {
|
||||
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f,%d,%d,%.4f%n",
|
||||
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod,
|
||||
(prev == null ? 0.0 : prev.fdr), t.fdr, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity());
|
||||
prev = t;
|
||||
}
|
||||
|
||||
return bytes.toString();
|
||||
}
|
||||
|
||||
private static double getDouble(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) ) {
|
||||
String val = bindings.get(key);
|
||||
return Double.valueOf(val);
|
||||
}
|
||||
else if ( required ) {
|
||||
throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int getInteger(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) )
|
||||
return Integer.valueOf(bindings.get(key));
|
||||
else if ( required ) {
|
||||
throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of tranches, sorted from most to least specific, read in from file f
|
||||
*
|
||||
* @param f
|
||||
* @return
|
||||
*/
|
||||
public static List<Tranche> readTranches(File f) {
|
||||
String[] header = null;
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
|
||||
try {
|
||||
for( final String line : new XReadLines(f) ) {
|
||||
if ( line.startsWith("#") )
|
||||
continue;
|
||||
|
||||
final String[] vals = line.split(",");
|
||||
if( header == null ) {
|
||||
header = vals;
|
||||
if ( header.length == 5 )
|
||||
// old style tranches file, throw an error
|
||||
throw new UserException.MalformedFile(f, "Unfortunately, your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator");
|
||||
if ( header.length != 8 && header.length != 11 )
|
||||
throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line);
|
||||
} else {
|
||||
if ( header.length != vals.length )
|
||||
throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line);
|
||||
|
||||
Map<String,String> bindings = new HashMap<String, String>();
|
||||
for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]);
|
||||
tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true),
|
||||
getDouble(bindings,"targetTiTv", false),
|
||||
getDouble(bindings,"minVQSLod", true),
|
||||
getInteger(bindings,"numKnown", false),
|
||||
getDouble(bindings,"knownTiTv", false),
|
||||
getInteger(bindings,"numNovel", true),
|
||||
getDouble(bindings,"novelTiTv", true),
|
||||
getInteger(bindings,"accessibleTruthSites", false),
|
||||
getInteger(bindings,"callsAtTruthSites", false),
|
||||
bindings.get("filterName")));
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(tranches); // sort this in the standard order
|
||||
return tranches;
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(f, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 10, 2011
|
||||
*/
|
||||
|
||||
public class TrancheManager {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(TrancheManager.class);
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to determine FDR tranches for VariantDatum[]
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
public static abstract class SelectionMetric {
|
||||
String name = null;
|
||||
|
||||
public SelectionMetric(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() { return name; }
|
||||
|
||||
public abstract double getThreshold(double tranche);
|
||||
public abstract double getTarget();
|
||||
public abstract void calculateRunningMetric(List<VariantDatum> data);
|
||||
public abstract double getRunningMetric(int i);
|
||||
public abstract int datumValue(VariantDatum d);
|
||||
}
|
||||
|
||||
public static class NovelTiTvMetric extends SelectionMetric {
|
||||
double[] runningTiTv;
|
||||
double targetTiTv = 0;
|
||||
|
||||
public NovelTiTvMetric(double target) {
|
||||
super("NovelTiTv");
|
||||
targetTiTv = target; // compute the desired TiTv
|
||||
}
|
||||
|
||||
public double getThreshold(double tranche) {
|
||||
return fdrToTiTv(tranche, targetTiTv);
|
||||
}
|
||||
|
||||
public double getTarget() { return targetTiTv; }
|
||||
|
||||
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||
int ti = 0, tv = 0;
|
||||
runningTiTv = new double[data.size()];
|
||||
|
||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
if ( ! datum.isKnown ) {
|
||||
if ( datum.isTransition ) { ti++; } else { tv++; }
|
||||
runningTiTv[i] = ti / Math.max(1.0 * tv, 1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public double getRunningMetric(int i) {
|
||||
return runningTiTv[i];
|
||||
}
|
||||
|
||||
public int datumValue(VariantDatum d) {
|
||||
return d.isTransition ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static class TruthSensitivityMetric extends SelectionMetric {
|
||||
double[] runningSensitivity;
|
||||
double targetTiTv = 0;
|
||||
int nTrueSites = 0;
|
||||
|
||||
public TruthSensitivityMetric(int nTrueSites) {
|
||||
super("TruthSensitivity");
|
||||
this.nTrueSites = nTrueSites;
|
||||
}
|
||||
|
||||
public double getThreshold(double tranche) {
|
||||
return tranche/100; // tranche of 1 => 99% sensitivity target
|
||||
}
|
||||
|
||||
public double getTarget() { return 1.0; }
|
||||
|
||||
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||
int nCalledAtTruth = 0;
|
||||
runningSensitivity = new double[data.size()];
|
||||
|
||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
nCalledAtTruth += datum.atTruthSite ? 1 : 0;
|
||||
runningSensitivity[i] = 1 - nCalledAtTruth / (1.0 * nTrueSites);
|
||||
}
|
||||
}
|
||||
|
||||
public double getRunningMetric(int i) {
|
||||
return runningSensitivity[i];
|
||||
}
|
||||
|
||||
public int datumValue(VariantDatum d) {
|
||||
return d.atTruthSite ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Tranche> findTranches( ArrayList<VariantDatum> data, final double[] tranches, SelectionMetric metric ) {
|
||||
return findTranches( data, tranches, metric, null );
|
||||
}
|
||||
|
||||
public static List<Tranche> findTranches( ArrayList<VariantDatum> data, final double[] trancheThresholds, SelectionMetric metric, File debugFile ) {
|
||||
logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size()));
|
||||
|
||||
List<VariantDatum> tranchesData = sortVariantsbyLod(data);
|
||||
metric.calculateRunningMetric(tranchesData);
|
||||
|
||||
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, metric); }
|
||||
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
for ( double trancheThreshold : trancheThresholds ) {
|
||||
Tranche t = findTranche(tranchesData, metric, trancheThreshold);
|
||||
|
||||
if ( t == null ) {
|
||||
if ( tranches.size() == 0 )
|
||||
throw new UserException(String.format("Couldn't find any tranche containing variants with a %s > %.2f", metric.getName(), metric.getThreshold(trancheThreshold)));
|
||||
break;
|
||||
}
|
||||
|
||||
tranches.add(t);
|
||||
}
|
||||
|
||||
return tranches;
|
||||
}
|
||||
|
||||
private static void writeTranchesDebuggingInfo(File f, List<VariantDatum> tranchesData, SelectionMetric metric ) {
|
||||
try {
|
||||
PrintStream out = new PrintStream(f);
|
||||
out.println("Qual metricValue runningValue");
|
||||
for ( int i = 0; i < tranchesData.size(); i++ ) {
|
||||
VariantDatum d = tranchesData.get(i);
|
||||
int score = metric.datumValue(d);
|
||||
double runningValue = metric.getRunningMetric(i);
|
||||
out.printf("%.4f %d %.4f%n", d.lod, score, runningValue);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(f, e);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<VariantDatum> sortVariantsbyLod(final ArrayList<VariantDatum> data) {
|
||||
Collections.sort(data);
|
||||
return data;
|
||||
}
|
||||
|
||||
public static Tranche findTranche( final List<VariantDatum> data, final SelectionMetric metric, final double trancheThreshold ) {
|
||||
logger.info(String.format(" Tranche threshold %.2f => selection metric threshold %.3f", trancheThreshold, metric.getThreshold(trancheThreshold)));
|
||||
|
||||
double metricThreshold = metric.getThreshold(trancheThreshold);
|
||||
int n = data.size();
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( metric.getRunningMetric(i) >= metricThreshold ) {
|
||||
// we've found the largest group of variants with Ti/Tv >= our target titv
|
||||
Tranche t = trancheOfVariants(data, i, trancheThreshold, metric.getTarget());
|
||||
logger.info(String.format(" Found tranche for %.3f: %.3f threshold starting with variant %d; running score is %.3f ",
|
||||
trancheThreshold, metricThreshold, i, metric.getRunningMetric(i)));
|
||||
logger.info(String.format(" Tranche is %s", t));
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double target ) {
|
||||
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
||||
|
||||
double minLod = data.get(minI).lod;
|
||||
for ( VariantDatum datum : data ) {
|
||||
if ( datum.lod >= minLod ) {
|
||||
//if( ! datum.isKnown ) System.out.println(datum.pos);
|
||||
if ( datum.isKnown ) {
|
||||
numKnown++;
|
||||
if ( datum.isTransition ) { knownTi++; } else { knownTv++; }
|
||||
} else {
|
||||
numNovel++;
|
||||
if ( datum.isTransition ) { novelTi++; } else { novelTv++; }
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
||||
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
||||
|
||||
int accessibleTruthSites = countCallsAtTruth(data, Double.NEGATIVE_INFINITY);
|
||||
int nCallsAtTruth = countCallsAtTruth(data, minLod);
|
||||
|
||||
return new Tranche(fdr, target, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth);
|
||||
}
|
||||
|
||||
public static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
||||
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
public static int countCallsAtTruth(final List<VariantDatum> data, double minLOD ) {
|
||||
int n = 0;
|
||||
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
|
||||
return n;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,142 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
public ExpandingArrayList<VariantDatum> data;
|
||||
public final double[] meanVector;
|
||||
public final double[] varianceVector; // This is really the standard deviation
|
||||
public final ArrayList<String> annotationKeys;
|
||||
|
||||
private final static long RANDOM_SEED = 83409701;
|
||||
private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> data, final List<String> annotationKeys ) {
|
||||
this.data = data;
|
||||
this.annotationKeys = new ArrayList<String>( annotationKeys );
|
||||
meanVector = new double[this.annotationKeys.size()];
|
||||
varianceVector = new double[this.annotationKeys.size()];
|
||||
}
|
||||
|
||||
public VariantDataManager( final List<String> annotationLines ) {
|
||||
data = null;
|
||||
annotationKeys = new ArrayList<String>( annotationLines.size() );
|
||||
meanVector = new double[annotationLines.size()];
|
||||
varianceVector = new double[annotationLines.size()];
|
||||
|
||||
int jjj = 0;
|
||||
for( final String line : annotationLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
annotationKeys.add(vals[1]);
|
||||
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||
jjj++;
|
||||
}
|
||||
}
|
||||
|
||||
public void add( final VariantDatum datum ) {
|
||||
data.add( datum );
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
boolean foundZeroVarianceAnnotation = false;
|
||||
for( int jjj = 0; jjj < meanVector.length; jjj++ ) { //BUGBUG: to clean up
|
||||
final double theMean = mean(jjj); //BUGBUG: to clean up
|
||||
final double theSTD = standardDeviation(theMean, jjj); //BUGBUG: to clean up
|
||||
logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
if( theSTD < 1E-8 ) { //BUGBUG: output recreated here to match the integration tests
|
||||
foundZeroVarianceAnnotation = true;
|
||||
} else if( theSTD < 1E-2 ) {
|
||||
logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj));
|
||||
}
|
||||
meanVector[jjj] = theMean;
|
||||
varianceVector[jjj] = theSTD;
|
||||
for(final VariantDatum datum : data ) {
|
||||
datum.annotations[jjj] = ( datum.annotations[jjj] - theMean ) / theSTD; // Each data point is now [ (x - mean) / standard deviation ]
|
||||
}
|
||||
}
|
||||
if( foundZeroVarianceAnnotation ) {
|
||||
throw new UserException.BadInput("Found annotations with zero variance. They must be excluded before proceeding.");
|
||||
}
|
||||
}
|
||||
|
||||
private double mean( final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.size();
|
||||
for( final VariantDatum datum : data ) {
|
||||
sum += (datum.annotations[index] / ((double) numVars));
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
private double standardDeviation( final double mean, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.size();
|
||||
for( final VariantDatum datum : data ) {
|
||||
sum += ( ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)) / ((double) numVars));
|
||||
}
|
||||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
public void trimDataBySTD( final double STD_THRESHOLD ) {
|
||||
final ExpandingArrayList<VariantDatum> dataToRemove = new ExpandingArrayList<VariantDatum>();
|
||||
for( final VariantDatum datum : data ) {
|
||||
boolean remove = false;
|
||||
for( double val : datum.annotations ) {
|
||||
if( Math.abs(val) > STD_THRESHOLD ) {
|
||||
remove = true;
|
||||
}
|
||||
}
|
||||
if( remove ) { dataToRemove.add( datum ); }
|
||||
}
|
||||
data.removeAll( dataToRemove );
|
||||
}
|
||||
|
||||
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
for( int jjj = 0; jjj < annotationKeys.size(); jjj++ ) {
|
||||
outputFile.println(String.format("@!ANNOTATION," + annotationKeys.get(jjj) + ",%.8f,%.8f", meanVector[jjj], varianceVector[jjj]));
|
||||
}
|
||||
}
|
||||
|
||||
public static double decodeAnnotation( final GenomeLocParser genomeLocParser, final String annotationKey, final VariantContext vc, final boolean jitter ) {
|
||||
double value;
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // HRun values must be jittered a bit to work in this GMM
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
value += -0.25 + 0.5 * rand.nextDouble(); //BUGBUG: recreated here to match the integration tests
|
||||
} else if( annotationKey.equals("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else {
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
} catch( Exception e ) {
|
||||
throw new UserException.MalformedFile(vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e );
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public double[] normalizeDatum( final double[] annotationValues ) {
|
||||
for(int iii = 0; iii < annotationValues.length; iii++) {
|
||||
annotationValues[iii] = (annotationValues[iii] - meanVector[iii]) / varianceVector[iii];
|
||||
}
|
||||
return annotationValues;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
public class VariantDatum implements Comparable<VariantDatum> {
|
||||
|
||||
public double[] annotations;
|
||||
public boolean isKnown;
|
||||
public double lod;
|
||||
public double pVarGivenModel;
|
||||
public boolean atTruthSite;
|
||||
public boolean isTransition;
|
||||
public MultivariateGaussian assignment; // used in K-means implementation
|
||||
|
||||
public int compareTo( final VariantDatum other ) {
|
||||
return Double.compare(this.lod, other.lod);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
/**
 * Holds the command line arguments shared by the VQSRv2 walkers. Currently
 * empty in this initial commit; fields will be added as arguments are wired
 * through to the engine.
 */
public class VariantRecalibratorArgumentCollection {

    /**
     * Returns a fresh copy of this argument collection. Implemented as a copy
     * factory (it never calls super.clone(), and the class does not implement
     * Cloneable) — remember to copy each new field here as it is added.
     */
    @Override
    public VariantRecalibratorArgumentCollection clone() {
        final VariantRecalibratorArgumentCollection vrac = new VariantRecalibratorArgumentCollection();
        return vrac;
    }
}
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 4, 2011
|
||||
*/
|
||||
|
||||
public class VariantRecalibratorEngine {
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantRecalibratorEngine.class);
|
||||
|
||||
// the unified argument collection
|
||||
final private VariantRecalibratorArgumentCollection VRAC;
|
||||
|
||||
private final static long RANDOM_SEED = 91801305;
|
||||
private final Random rand = new Random( RANDOM_SEED );
|
||||
private final static double MIN_PROB_CONVERGENCE = 1E-5;
|
||||
|
||||
/////////////////////////////
|
||||
// Public Methods to interface with the Engine
|
||||
/////////////////////////////
|
||||
|
||||
public VariantRecalibratorEngine( final VariantRecalibratorArgumentCollection VRAC ) {
|
||||
this.VRAC = VRAC;
|
||||
initialize( this.VRAC );
|
||||
}
|
||||
|
||||
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
|
||||
final GaussianMixtureModel model = new GaussianMixtureModel( 4, 3, 0.0001, 1000.0 ); //BUGBUG: VRAC.maxGaussians, VRAC.numAnnotations
|
||||
variationalBayesExpectationMaximization( model, data );
|
||||
return model;
|
||||
}
|
||||
|
||||
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model ) {
|
||||
if( !model.isModelReadyForEvaluation ) {
|
||||
model.precomputeDenominatorForEvaluation();
|
||||
}
|
||||
for( final VariantDatum datum : data ) {
|
||||
datum.pVarGivenModel = evaluateDatum( datum, model );
|
||||
}
|
||||
}
|
||||
|
||||
public void evaluateDataContrastively( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for initialization
|
||||
/////////////////////////////
|
||||
|
||||
private void initialize( final VariantRecalibratorArgumentCollection VRAC ) {
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for generating a GaussianMixtureModel
|
||||
/////////////////////////////
|
||||
|
||||
private void variationalBayesExpectationMaximization( final GaussianMixtureModel model, final List<VariantDatum> data ) {
|
||||
|
||||
model.cacheEmpiricalStats( data );
|
||||
model.initializeRandomModel( data, rand );
|
||||
|
||||
// The VBEM loop
|
||||
double previousLikelihood = -1E20;
|
||||
double currentLikelihood;
|
||||
int iteration = 1;
|
||||
while( iteration < 200 ) { //BUGBUG: VRAC.maxIterations
|
||||
currentLikelihood = model.expectationStep( data );
|
||||
model.maximizationStep( data );
|
||||
|
||||
logger.info("Finished iteration " + iteration );
|
||||
iteration++;
|
||||
if( Math.abs(currentLikelihood - previousLikelihood) < MIN_PROB_CONVERGENCE ) {
|
||||
logger.info("Convergence!");
|
||||
return; // Early return here because we have converged!
|
||||
}
|
||||
previousLikelihood = currentLikelihood;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for evaluating data given a GaussianMixtureModel
|
||||
/////////////////////////////
|
||||
|
||||
private double evaluateDatum( final VariantDatum datum, final GaussianMixtureModel model ) {
|
||||
return model.evaluateDatum( datum );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,406 @@
|
|||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 17, 2010
|
||||
*
|
||||
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*/
|
||||
|
||||
public class VariantRecalibratorV2 extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
    // INFO-field key under which the recalibrated log-odds score is emitted.
    public static final String VQS_LOD_KEY = "VQSLOD";

    /////////////////////////////
    // Inputs
    /////////////////////////////
    @Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
    private File CLUSTER_FILE;

    /////////////////////////////
    // Outputs
    /////////////////////////////
    @Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
    private File TRANCHES_FILE;
    @Output(doc="File to which recalibrated variants should be written", required=true)
    private VCFWriter vcfWriter = null;

    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    @Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
    private double TARGET_TITV = 2.07;
    @Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false)
    private double BACKOFF_FACTOR = 1.3;
    @Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
    private boolean IGNORE_ALL_INPUT_FILTERS = false;
    @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
    private String[] IGNORE_INPUT_FILTERS = null;
    @Argument(fullName="priorNovel", shortName="priorNovel", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
    private double PRIOR_NOVELS = 2.0;
    @Argument(fullName="priorDBSNP", shortName="priorDBSNP", doc="A prior on the quality of dbSNP variants, a phred scaled probability of being true.", required=false)
    private double PRIOR_DBSNP = 10.0;
    @Argument(fullName="priorHapMap", shortName="priorHapMap", doc="A prior on the quality of HapMap variants, a phred scaled probability of being true.", required=false)
    private double PRIOR_HAPMAP = 15.0;
    @Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
    private double PRIOR_1KG = 12.0;
    @Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
    private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
    @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
    private String PATH_TO_RSCRIPT = "Rscript";
    @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
    private String PATH_TO_RESOURCES = "R/";
    @Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
    private double SINGLETON_FP_RATE = 0.5;
    @Argument(fullName="max_ac_prior", shortName="maxACPrior", doc="Maximum value for the prior expectation based on allele count. Needed because (1 - 0.5^x) approaches 1.0 very quickly.", required=false)
    private double MAX_AC_PRIOR = 0.99;
    @Argument(fullName="dontTrustACField", shortName="dontTrustACField", doc="If specified the VR will not use the AC field and will instead always parse the genotypes to figure out how many variant chromosomes there are at a given site.", required=false)
    private boolean NEVER_TRUST_AC_FIELD = false;

    /////////////////////////////
    // Debug Arguments
    /////////////////////////////
    @Hidden
    @Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
    private double QUAL_THRESHOLD = 0.0;
    @Hidden
    @Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
    protected Boolean TRUST_ALL_POLYMORPHIC = false;
    @Hidden
    @Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
    private File DEBUG_FILE = null;
    @Hidden
    @Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
    private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;

    // Which metric is used to slice the data into tranches.
    public enum SelectionMetricType {
        NOVEL_TITV,
        TRUTH_SENSITIVITY
    }

    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private Set<String> ignoreInputFilterSet = null;      // filter names to treat as unfiltered; built from IGNORE_INPUT_FILTERS
    private Set<String> inputNames = new HashSet<String>();  // ROD bindings whose names start with "input"
    private Set<String> truthNames = new HashSet<String>();  // ROD bindings whose names start with "truth"
    private NestedHashMap priorCache = new NestedHashMap();  // memoizes the prior lod factor per (alleleCount, knownPrior) pair
    private boolean trustACField = false;                    // flips to true once one record's AC field is verified correct
    private PrintStream tranchesStream = null;
    private int nTruthSites = 0;                             // loci covered by at least one usable truth record
    private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( new VariantRecalibratorArgumentCollection() );
    private GaussianMixtureModel model;                      // reconstructed from the cluster file in initialize()
    private VariantDataManager dataManager;

    // Round to 4 decimal places, so the datum's lod matches the "%.4f" VCF output.
    private static double round4(double num) {
        double result = num * 10000.0;
        result = Math.round(result);
        result = result / 10000.0;
        return result;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // initialize
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * Parse the cluster file into annotation stats + Gaussians, classify the ROD
     * bindings (input/truth/dbsnp/hapmap/1kg), and open the output VCF and
     * tranches streams. Fails fast if dbSNP or an "input" track is missing.
     */
    public void initialize() {
        if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }

        if( IGNORE_INPUT_FILTERS != null ) {
            ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
        }

        // Cluster-file line types: "##" comments, "@!ANNOTATION" stats lines, "@!CLUSTER" Gaussian lines.
        final Pattern COMMENT_PATTERN = Pattern.compile("^##.*");
        final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
        final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");

        final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
        final ExpandingArrayList<String> gaussianLines = new ExpandingArrayList<String>();

        try {
            for ( final String line : new XReadLines( CLUSTER_FILE ) ) {
                if( ANNOTATION_PATTERN.matcher(line).matches() ) {
                    annotationLines.add(line);
                } else if( CLUSTER_PATTERN.matcher(line).matches() ) {
                    gaussianLines.add(line);
                } else if( !COMMENT_PATTERN.matcher(line).matches() ) {
                    throw new UserException.MalformedFile(CLUSTER_FILE, "Could not parse line: " + line);
                }
            }
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotReadInputFile(CLUSTER_FILE, e);
        }

        dataManager = new VariantDataManager( annotationLines );
        model = new GaussianMixtureModel( annotationLines, gaussianLines );

        // Classify each ROD binding by naming convention.
        boolean foundDBSNP = false;
        for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
            if( d.getName().startsWith("input") ) {
                inputNames.add(d.getName());
                logger.info("Found input variant track with name " + d.getName());
            } else if( d.getName().startsWith("truth") ) {
                truthNames.add(d.getName());
                logger.info("Found truth variant track with name " + d.getName());
            } else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
                foundDBSNP = true;
            } else if ( d.getName().equals("hapmap") ) {
                logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
            } else if ( d.getName().equals("1kg") ) {
                logger.info("Found 1KG track for with prior probability = Q" + PRIOR_1KG);
            } else {
                logger.info("Not evaluating ROD binding " + d.getName());
            }
        }

        if(!foundDBSNP) {
            throw new UserException.CommandLineException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
        }

        if( inputNames.size() == 0 ) {
            throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
        }

        // setup the header fields
        final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
        final TreeSet<String> samples = new TreeSet<String>();
        hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
        hInfo.add(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "log10-scaled probability of variant being true under the trained gaussian mixture model"));
        samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));

        final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
        vcfWriter.writeHeader(vcfHeader);

        try {
            tranchesStream = new PrintStream(TRANCHES_FILE);
        } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
        }
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // map
    //
    //---------------------------------------------------------------------------------------------------------------

    /**
     * For each usable input record at this locus: decode and normalize its
     * annotations, compute a prior from allele count + known-site status,
     * score it against the mixture model, write the VQSLOD-annotated record to
     * the output VCF, and collect a VariantDatum for tranche finding.
     */
    public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
        final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();

        if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
            return mapList;
        }

        // Is this locus covered by an unfiltered (and, if genotyped, polymorphic) record in any truth track?
        final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, truthNames, null, context.getLocation(), true, true);
        boolean isAtTruthSite = false;
        for( final VariantContext vcTruth : vcsTruth ) {
            if( vcTruth != null && vcTruth.isVariant() && !vcTruth.isFiltered() && (!vcTruth.hasGenotypes() || vcTruth.isPolymorphic()) ) {
                isAtTruthSite = true;
                break;
            }
        }
        nTruthSites += isAtTruthSite ? 1 : 0;

        for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
            if( vc != null ) {
                if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                    if( vc.getPhredScaledQual() >= QUAL_THRESHOLD ) {
                        // Decode annotations (no jitter at apply time) and standardize with the cluster file's stats.
                        int iii = 0;
                        final double annotationValues[] = new double[dataManager.annotationKeys.size()];
                        for( final String key : dataManager.annotationKeys ) {
                            annotationValues[iii++] = VariantDataManager.decodeAnnotation( getToolkit().getGenomeLocParser(), key, vc, false );
                        }
                        final VariantDatum variantDatum = new VariantDatum();
                        variantDatum.annotations = dataManager.normalizeDatum( annotationValues );

                        if (vc.isSNP())
                            variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;

                        // Look up this locus in the known-site tracks; only the first overlapping record of each is used.
                        final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
                        final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
                        final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
                        final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
                        final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
                        final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );

                        variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
                        // Strongest applicable prior wins: HapMap > 1KG > dbSNP > novel.
                        double knownPrior_qScore = PRIOR_NOVELS;
                        if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
                            knownPrior_qScore = PRIOR_HAPMAP;
                        } else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
                            knownPrior_qScore = PRIOR_1KG;
                        } else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
                            knownPrior_qScore = PRIOR_DBSNP;
                        }

                        // If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up.
                        final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0)));
                        if( !trustACField && !NEVER_TRUST_AC_FIELD ) {
                            // Still undecided: verify AC against the genotype-derived count at this record.
                            if( !vc.getAttributes().containsKey("AC") ) {
                                NEVER_TRUST_AC_FIELD = true;
                            } else {
                                if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) {
                                    // The AC field is correct at this record so we trust it forever
                                    trustACField = true;
                                } else { // We found a record in which the AC field wasn't correct but we are trying to trust it
                                    throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
                                            "First observed at " + vc.getChr() + ":" + vc.getStart());
                                }
                            }
                        }
                        if( trustACField && alleleCount == -1 ) { // -1 is the getAttribute default when AC is absent
                            throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
                                    "First observed at " + vc.getChr() + ":" + vc.getStart());
                        }

                        final Object[] priorKey = new Object[2];
                        priorKey[0] = alleleCount;
                        priorKey[1] = knownPrior_qScore;

                        Double priorLodFactor = (Double)priorCache.get( priorKey );

                        // If this prior factor hasn't been calculated before, do so now
                        if(priorLodFactor == null) {
                            final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
                            final double acPrior = Math.min( MAX_AC_PRIOR, 1.0 - Math.pow(SINGLETON_FP_RATE, alleleCount) ); //theModel.getAlleleCountPrior( alleleCount, MAX_AC_PRIOR );
                            // Combine the two priors as independent sources of evidence: P = 1 - (1-a)(1-k).
                            final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));

                            if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
                                throw new UserException.CommandLineException("Something is wrong with the priors that were entered by the user: Prior = " + totalPrior); // TODO - fix this up later
                            }

                            // log10 odds of the prior; the trailing -log10(1.0) term is zero.
                            priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);

                            // NOTE(review): verify NestedHashMap.put's parameter order (value, allowOverwrite?, keys...)
                            // matches the get(priorKey) lookup above.
                            priorCache.put( priorLodFactor, false, priorKey );
                        }

                        // Score this single datum against the model; pVarGivenModel is filled in by the engine.
                        final ArrayList<VariantDatum> datumList = new ArrayList<VariantDatum>(1);
                        datumList.add( variantDatum );
                        engine.evaluateData(datumList, model);
                        final double lod = priorLodFactor + datumList.get(0).pVarGivenModel;

                        if ( lod == Double.NEGATIVE_INFINITY ) {
                            throw new ReviewedStingException("Negative infinity detected during summation at " + vc);
                        }

                        //variantDatum.pos = vc.getStart();
                        variantDatum.lod = round4(lod);

                        // deal with the truth calculation
                        variantDatum.atTruthSite = isAtTruthSite;

                        mapList.add( variantDatum );
                        // Re-emit the record with VQSLOD added and the FILTER field cleared.
                        final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
                        attrs.put(VariantRecalibratorV2.VQS_LOD_KEY, String.format("%.4f", lod));
                        VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), new HashSet<String>(), attrs);

                        vcfWriter.add( newVC, ref.getBase() );
                    }
                    // NOTE(review): records that pass the filter check but fail QUAL_THRESHOLD are
                    // silently dropped from the output VCF — confirm this is intended for the debug-only -qual arg.

                } else { // record is filtered (and its filters aren't being ignored) so just dump it out to the VCF file
                    vcfWriter.add( vc, ref.getBase() );
                }
            }

        }

        return mapList;
    }

    //---------------------------------------------------------------------------------------------------------------
    //
    // reduce
    //
    //---------------------------------------------------------------------------------------------------------------

    public ExpandingArrayList<VariantDatum> reduceInit() {
        return new ExpandingArrayList<VariantDatum>();
    }

    // Simple accumulation: append each shard's data onto the running list.
    public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
        reduceSum.addAll( mapValue );
        return reduceSum;
    }

    /**
     * Find the FDR tranches over all collected data, write the tranches file,
     * and (best-effort) invoke Rscript to plot the tranche curve.
     */
    public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
        dataManager.data = reduceSum;

        // deal with truth information
        int nCallsAtTruth = TrancheManager.countCallsAtTruth(dataManager.data, Double.NEGATIVE_INFINITY);
        logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
                nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));

        // NOTE(review): SELECTION_METRIC_TYPE is ignored here — the truth-sensitivity metric is always used.
        TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric(nCallsAtTruth);
        List<Tranche> tranches = TrancheManager.findTranches( dataManager.data, FDR_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests
        tranchesStream.print(Tranche.tranchesString(tranches));

        // Execute Rscript command to plot the optimization curve
        // Print out the command line to make it clear to the user what is being executed and how one might modify it
        final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
        logger.info( rScriptTranchesCommandLine );

        // Execute the RScript command to plot the table of truth values
        try {
            Process p;
            p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
            p.waitFor();
        } catch ( Exception e ) {
            // Plotting is best-effort: a missing Rscript must not fail the recalibration run.
            Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
        }
    }
}
|
||||
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
|
||||
/**
 * MD5-based integration tests for the VQSRv2 walker pipeline:
 * GenerateVariantClustersV2 -> VariantRecalibratorV2 -> ApplyVariantCutsV2.
 * Each stage's output file is matched against a recorded MD5, and later stages
 * consume the earlier stage's output via getFileForMD5.
 */
public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest {
    // NOTE(review): these three maps are never referenced in this class — leftovers to remove?
    static HashMap<String, String> clusterFiles = new HashMap<String, String>();
    static HashMap<String, String> tranchesFiles = new HashMap<String, String>();
    static HashMap<String, String> inputVCFFiles = new HashMap<String, String>();

    // One test case: an input VCF plus the expected MD5 of each pipeline stage's output.
    private static class VRTest {
        String inVCF;
        String clusterMD5;
        String tranchesMD5;
        String recalVCFMD5;
        String cutVCFMD5;
        public VRTest(String inVCF, String clusterMD5, String tranchesMD5, String recalVCFMD5, String cutVCFMD5) {
            this.inVCF = validationDataLocation + inVCF;
            this.clusterMD5 = clusterMD5;
            this.tranchesMD5 = tranchesMD5;
            this.recalVCFMD5 = recalVCFMD5;
            this.cutVCFMD5 = cutVCFMD5;
        }
    }

    VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
            "f65c27ee40053adc72dd0bfbb628e4d7", // cluster file
            "dce581b880ffb6ea39cbada1ecc95915", // tranches
            "c3e8a2f43656eab7d847dbf850f844a6", // recalVCF
            "50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF

    VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
            "bda8f17cfc19d23e7e51f99e547f4b3d", // cluster file
            "66edae83c50f4e8601fef7fafba774af", // tranches
            "0123537e373657386068a534c0f5c91b", // recalVCF
            "2172368e8585841e5ad96c95d0827c4b"); // cut VCF

    // Both test cases are run through every pipeline stage.
    @DataProvider(name = "VRTest")
    public Object[][] createData1() {
        return new Object[][]{ {yriTrio}, {lowPass} };
    }

    // Stage 1: build the cluster file and check its MD5.
    @Test(dataProvider = "VRTest", enabled = true)
    public void testGenerateVariantClusters(VRTest params) {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b36KGReference +
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
                        " -weightDBSNP 1.0 -weightHapMap 1.0" +
                        " -T GenerateVariantClustersV2" +
                        " -B:input,VCF " + params.inVCF +
                        " -L 1:50,000,000-200,000,000" +
                        " -qual 50.0" +
                        " --ignore_filter GATK_STANDARD" +
                        " -an QD -an MQ -an SB" +
                        " -clusterFile %s",
                Arrays.asList(params.clusterMD5));
        executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
    }

    // Stage 2: recalibrate using stage 1's cluster file; checks recal VCF and tranches MD5s.
    @Test(dataProvider = "VRTest",dependsOnMethods="testGenerateVariantClusters")
    public void testVariantRecalibrator(VRTest params) {
        //System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b36KGReference +
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
                        " -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
                        " -T VariantRecalibratorV2" +
                        " -B:input,VCF " + params.inVCF +
                        " -L 1:20,000,000-100,000,000" +
                        " --ignore_filter GATK_STANDARD" +
                        " --ignore_filter HARD_TO_VALIDATE" +
                        " -clusterFile " + getFileForMD5(params.clusterMD5) +
                        " -sm TRUTH_SENSITIVITY" +
                        " -o %s" +
                        " -tranchesFile %s",
                Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
        executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst();
    }

    // Stage 3: apply the tranche cuts to stage 2's recalibrated VCF.
    @Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator")
    public void testApplyVariantCuts(VRTest params) {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b36KGReference +
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
                        " -T ApplyVariantCutsV2" +
                        " -L 1:20,000,000-100,000,000" +
                        " -B:input,VCF " + getFileForMD5(params.recalVCFMD5) +
                        " -o %s" +
                        " -tranchesFile " + getFileForMD5(params.tranchesMD5),
                Arrays.asList(params.cutVCFMD5));
        executeTest("testApplyVariantCuts-"+params.inVCF, spec);
    }

    // Negative test: an unknown annotation name must raise UserException.MalformedFile.
    @Test()
    public void testFailWithBadAnnotation() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                "-R " + b36KGReference +
                        " -NO_HEADER" +
                        " --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
                        " -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
                        " -weightDBSNP 0.2 -weightHapMap 1.0" +
                        " -T GenerateVariantClustersV2" +
                        " -B:input,VCF " + lowPass.inVCF +
                        " -L 1:50,000,000-200,000,000" +
                        " -qual 50.0" +
                        " --ignore_filter GATK_STANDARD" +
                        " -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here
                        " -clusterFile %s",
                1, // just one output file
                UserException.MalformedFile.class);
        executeTest("testFailWithBadAnnotation", spec);
    }
}
|
||||
|
||||
Loading…
Reference in New Issue