A version of VQSRv2 that does contrastive clustering in two passes. The walkers will be renamed when they are moved to core.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5443 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fcc347bb05
commit
2a2538136d
|
|
@ -1,202 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Jun 2, 2010
|
||||
*
|
||||
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*/
|
||||
|
||||
public class ApplyVariantCutsV2 extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output( doc="The output filtered VCF file", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
|
||||
private double FDR_FILTER_LEVEL = 0.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
for ( Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
|
||||
if ( t.fdr >= FDR_FILTER_LEVEL) {
|
||||
tranches.add(t);
|
||||
//statusMsg = "Keeping, above FDR threshold";
|
||||
}
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
|
||||
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
Tranche t = tranches.get(iii);
|
||||
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("FDR tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
|
||||
}
|
||||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at VQS Lod < " + tranches.get(0).minVQSLod)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( vc != null ) {
|
||||
String filterString = null;
|
||||
if( !vc.isFiltered() ) {
|
||||
try {
|
||||
final double lod = vc.getAttributeAsDouble(VariantRecalibratorV2.VQS_LOD_KEY);
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
if (i == tranches.size() - 1) {
|
||||
filterString = VCFConstants.PASSES_FILTERS_v4;
|
||||
} else {
|
||||
filterString = tranche.name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( filterString == null ) {
|
||||
filterString = tranches.get(0).name+"+";
|
||||
}
|
||||
|
||||
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
|
||||
Set<String> filters = new HashSet<String>();
|
||||
filters.add(filterString);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedFile(vc.getSource(), "Invalid value for VQS key " + VariantRecalibratorV2.VQS_LOD_KEY + " at variant " + vc, e);
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( vc, ref.getBase() );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer reduceSum ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,222 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score.
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 3/12/11
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variants
|
||||
*/
|
||||
|
||||
public class ContrastiveRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
public static final String VQS_LOD_KEY = "VQSLOD";
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
|
||||
private PrintStream RECAL_FILE;
|
||||
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true)
|
||||
private PrintStream TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
//BUGBUG: use VariantRecalibrationArgumentCollection
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
||||
private File DEBUG_FILE = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private VariantDataManager dataManager;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
private VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything with the args at the moment
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
dataManager = new VariantDataManager( new ArrayList<String>(Arrays.asList(USE_ANNOTATIONS)) );
|
||||
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().toLowerCase().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals("dbsnp") ) { //BUGBUG: values are hard coded for now until command line tagging up for Rod bindings
|
||||
logger.info("Found dbSNP track: \tKnown = true \tTraining = false \tTruth = false \tPrior = Q10.0");
|
||||
dataManager.addTrainingSet(new TrainingSet("dbsnp", true, false, false, 10.0));
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q12.0");
|
||||
dataManager.addTrainingSet(new TrainingSet("hapmap", false, true, true, 12.0));
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track: \tKnown = false \tTraining = true \tTruth = false \tPrior = Q6.0");
|
||||
dataManager.addTrainingSet(new TrainingSet("1kg", false, true, false, 6.0));
|
||||
} else if ( d.getName().equals("omni") ) {
|
||||
logger.info("Found Omni track: \tKnown = false \tTraining = true \tTruth = true \tPrior = Q15.0");
|
||||
dataManager.addTrainingSet( new TrainingSet("omni", false, true, true, 15.0) );
|
||||
} else {
|
||||
logger.info("WARNING! Not evaluating ROD binding: " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( !dataManager.checkHasTrainingSet() ) {
|
||||
throw new UserException.CommandLineException("No training set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
|
||||
}
|
||||
if( !dataManager.checkHasTruthSet() ) {
|
||||
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
|
||||
}
|
||||
if( !dataManager.checkHasKnownSet() ) {
|
||||
throw new UserException.CommandLineException("No known set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, 1kg, or omni rod bindings.");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
VariantContext thisVC = null;
|
||||
int maxAC = -1;
|
||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( vc != null ) {
|
||||
final int ac = vc.getAttributeAsInt("AC");
|
||||
if( ac > maxAC ) {
|
||||
thisVC = vc;
|
||||
maxAC = ac;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( thisVC != null ) { // BUGBUG: filtered?
|
||||
final VariantDatum datum = new VariantDatum();
|
||||
datum.annotations = dataManager.decodeAnnotations( ref.getGenomeLocParser(), thisVC, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
|
||||
datum.pos = context.getLocation();
|
||||
datum.originalQual = thisVC.getPhredScaledQual();
|
||||
datum.isTransition = thisVC.isSNP() && ( VariantContextUtils.getSNPSubstitutionType(thisVC).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0 );
|
||||
dataManager.parseTrainingSets( tracker, ref, context, thisVC, datum );
|
||||
final double priorFactor = QualityUtils.qualToProb( datum.prior );
|
||||
datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );
|
||||
mapList.add( datum );
|
||||
}
|
||||
//}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> treeReduce( final ExpandingArrayList<VariantDatum> lhs, final ExpandingArrayList<VariantDatum> rhs ) {
|
||||
rhs.addAll( lhs );
|
||||
return rhs;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// on traversal done
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void onTraversalDone( final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
dataManager.setData( reduceSum );
|
||||
dataManager.normalizeData();
|
||||
engine.evaluateData( dataManager.getData(), engine.generateModel( dataManager.getTrainingData() ), false );
|
||||
engine.evaluateData( dataManager.getData(), engine.generateModel( dataManager.selectWorstVariants( 0.07f ) ), true );
|
||||
|
||||
// old tranches stuff
|
||||
int nCallsAtTruth = TrancheManager.countCallsAtTruth( dataManager.getData(), Double.NEGATIVE_INFINITY );
|
||||
//logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
|
||||
// nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));
|
||||
TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric( nCallsAtTruth );
|
||||
List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), FDR_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests
|
||||
TRANCHES_FILE.print(Tranche.tranchesString( tranches ));
|
||||
|
||||
logger.info( "Writing out recalibration table..." );
|
||||
dataManager.writeOutRecalibrationTable( RECAL_FILE );
|
||||
}
|
||||
}
|
||||
|
|
@ -3,17 +3,10 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
|||
import Jama.Matrix;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -29,8 +22,8 @@ public class GaussianMixtureModel {
|
|||
private final double shrinkage;
|
||||
private final double dirichletParameter;
|
||||
private final double degreesOfFreedom;
|
||||
private final double[] empiricalMu; // BUGBUG: move these to VariantData class
|
||||
private final Matrix empiricalSigma; // BUGBUG: move these to VariantData class
|
||||
private final double[] empiricalMu;
|
||||
private final Matrix empiricalSigma;
|
||||
public boolean isModelReadyForEvaluation;
|
||||
|
||||
public GaussianMixtureModel( final int numGaussians, final int numAnnotations,
|
||||
|
|
@ -43,43 +36,19 @@ public class GaussianMixtureModel {
|
|||
}
|
||||
this.shrinkage = shrinkage;
|
||||
this.dirichletParameter = dirichletParameter;
|
||||
degreesOfFreedom = numAnnotations;
|
||||
degreesOfFreedom = numAnnotations + 2;
|
||||
empiricalMu = new double[numAnnotations];
|
||||
empiricalSigma = new Matrix(numAnnotations, numAnnotations);
|
||||
isModelReadyForEvaluation = false;
|
||||
}
|
||||
|
||||
public GaussianMixtureModel( final List<String> annotationLines, final List<String> gaussianLines ) { //BUGBUG: recreated here to match the integration tests
|
||||
|
||||
gaussians = new ArrayList<MultivariateGaussian>( gaussianLines.size() );
|
||||
for( final String line : gaussianLines ) {
|
||||
final MultivariateGaussian gaussian = new MultivariateGaussian( annotationLines.size() );
|
||||
final String[] vals = line.split(",");
|
||||
gaussian.pMixtureLog10 = Math.log10( Double.parseDouble(vals[1]) );
|
||||
for( int iii = 0; iii < annotationLines.size(); iii++ ) {
|
||||
gaussian.mu[iii] = Double.parseDouble(vals[2+iii]); //BUGBUG: recreated here to match the integration tests
|
||||
for( int jjj = 0; jjj < annotationLines.size(); jjj++ ) {
|
||||
gaussian.sigma.set(iii, jjj, 1.3*Double.parseDouble(vals[2+annotationLines.size()+(iii*annotationLines.size())+jjj])); //BUGBUG: VRAC backoff
|
||||
}
|
||||
}
|
||||
gaussians.add( gaussian );
|
||||
}
|
||||
|
||||
shrinkage = 0.0; // not used when evaluating data, BUGBUG: move this to VariantData class
|
||||
dirichletParameter = 0.0; // not used when evaluating data
|
||||
degreesOfFreedom = 0.0; // not used when evaluating data
|
||||
empiricalMu = null; // not used when evaluating data
|
||||
empiricalSigma = null; // not used when evaluating data
|
||||
isModelReadyForEvaluation = false;
|
||||
}
|
||||
|
||||
public void cacheEmpiricalStats( final List<VariantDatum> data ) {
|
||||
//final double[][] tmpSigmaVals = new double[empiricalMu.length][empiricalMu.length];
|
||||
final double[][] tmpSigmaVals = new double[empiricalMu.length][empiricalMu.length];
|
||||
for( int iii = 0; iii < empiricalMu.length; iii++ ) {
|
||||
empiricalMu[iii] = 0.0;
|
||||
//for( int jjj = iii; jjj < empiricalMu.length; jjj++ ) {
|
||||
// tmpSigmaVals[iii][jjj] = 0.0;
|
||||
//}
|
||||
for( int jjj = iii; jjj < empiricalMu.length; jjj++ ) {
|
||||
tmpSigmaVals[iii][jjj] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
for( final VariantDatum datum : data ) {
|
||||
|
|
@ -88,19 +57,17 @@ public class GaussianMixtureModel {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
for( final VariantDatum datum : data ) {
|
||||
for( int iii = 0; iii < empiricalMu.length; iii++ ) {
|
||||
for( int jjj = 0; jjj < empiricalMu.length; jjj++ ) {
|
||||
tmpSigmaVals[iii][jjj] += (datum.annotations[iii]-empiricalMu[iii]) * (datum.annotations[jjj]-empiricalMu[jjj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
//for( final VariantDatum datum : data ) {
|
||||
// for( int iii = 0; iii < empiricalMu.length; iii++ ) {
|
||||
// for( int jjj = 0; jjj < empiricalMu.length; jjj++ ) {
|
||||
// tmpSigmaVals[iii][jjj] += (datum.annotations[iii]-empiricalMu[iii]) * (datum.annotations[jjj]-empiricalMu[jjj]);
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
|
||||
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, new Matrix(tmpSigmaVals));
|
||||
empiricalSigma.timesEquals( 1.0 / ((double) data.size()) );
|
||||
empiricalSigma.timesEquals( 1.0 / (Math.pow(gaussians.size(), 2.0 / ((double) empiricalMu.length))) );
|
||||
*/
|
||||
//empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, new Matrix(tmpSigmaVals));
|
||||
//empiricalSigma.timesEquals( 1.0 / ((double) data.size()) );
|
||||
//empiricalSigma.timesEquals( 1.0 / (Math.pow(gaussians.size(), 2.0 / ((double) empiricalMu.length))) );
|
||||
empiricalSigma.setMatrix(0, empiricalMu.length - 1, 0, empiricalMu.length - 1, Matrix.identity(empiricalMu.length, empiricalMu.length));
|
||||
}
|
||||
|
||||
|
|
@ -112,7 +79,9 @@ public class GaussianMixtureModel {
|
|||
}
|
||||
|
||||
// initialize means using K-means algorithm
|
||||
initializeMeansUsingKMeans( data, 60, rand ); // BUGBUG: a VRAC argument?
|
||||
final int numKMeansIterations = 10; // BUGBUG: VRAC argument
|
||||
logger.info( "Initializing model with " + numKMeansIterations + " k-means iterations..." );
|
||||
initializeMeansUsingKMeans( data, numKMeansIterations, rand );
|
||||
|
||||
// initialize uniform mixture coefficients, random covariance matrices, and initial hyperparameters
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
|
|
@ -134,7 +103,7 @@ public class GaussianMixtureModel {
|
|||
datum.assignment = minGaussian;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
final double dist = gaussian.calculateDistanceFromMeanSquared( datum );
|
||||
if(dist < minDistance) {
|
||||
if( dist < minDistance ) {
|
||||
minDistance = dist;
|
||||
minGaussian = gaussian;
|
||||
}
|
||||
|
|
@ -152,7 +121,7 @@ public class GaussianMixtureModel {
|
|||
gaussian.incrementMu( datum );
|
||||
}
|
||||
}
|
||||
if(numAssigned != 0) {
|
||||
if( numAssigned != 0 ) {
|
||||
gaussian.divideEqualsMu( ((double) numAssigned) );
|
||||
} else {
|
||||
gaussian.initializeRandomMu( rand );
|
||||
|
|
@ -173,17 +142,18 @@ public class GaussianMixtureModel {
|
|||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
final double pVarLog10 = gaussian.evaluateDatumLog10( datum );
|
||||
pVarInGaussianLog10.add( pVarLog10 );
|
||||
likelihood += Math.pow( 10.0, pVarLog10 ); //BUGBUG: output recreated here to match the integration tests
|
||||
likelihood += pVarLog10;
|
||||
}
|
||||
final double[] pVarInGaussianReals = MathUtils.normalizeFromLog10( pVarInGaussianLog10 );
|
||||
final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10 );
|
||||
int iii = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
gaussian.assignPVarInGaussian( pVarInGaussianReals[iii++] ); //BUGBUG: to clean up
|
||||
gaussian.assignPVarInGaussian( pVarInGaussianNormalized[iii++] ); //BUGBUG: to clean up
|
||||
}
|
||||
}
|
||||
|
||||
logger.info("explained likelihood = " + String.format( "%.5f", likelihood / ((double) data.size()) ));
|
||||
return likelihood / ((double) data.size());
|
||||
final double scaledTotalLikelihoodLog10 = likelihood / ((double) data.size());
|
||||
logger.info( "sum Log10 likelihood = " + String.format("%.5f", scaledTotalLikelihoodLog10) );
|
||||
return scaledTotalLikelihoodLog10;
|
||||
}
|
||||
|
||||
public void maximizationStep( final List<VariantDatum> data ) {
|
||||
|
|
@ -200,24 +170,11 @@ public class GaussianMixtureModel {
|
|||
return sum;
|
||||
}
|
||||
|
||||
public void output( final PrintStream clusterFile ) { //BUGBUG: output recreated here to match the integration tests
|
||||
normalizePMixtureLog10(); //BUGBUG: output recreated here to match the integration tests
|
||||
public void evaluateFinalModelParameters( final List<VariantDatum> data ) {
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
if( Math.pow(10.0, gaussian.pMixtureLog10) > 1E-4 ) {
|
||||
final double sigmaVals[][] = gaussian.sigma.getArray();
|
||||
clusterFile.print("@!CLUSTER");
|
||||
clusterFile.print(String.format(",%.8f", Math.pow(10.0, gaussian.pMixtureLog10)));
|
||||
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
|
||||
clusterFile.print(String.format(",%.8f", gaussian.mu[jjj]));
|
||||
}
|
||||
for(int iii = 0; iii < gaussian.mu.length; iii++ ) {
|
||||
for(int jjj = 0; jjj < gaussian.mu.length; jjj++ ) {
|
||||
clusterFile.print(String.format(",%.8f", (sigmaVals[iii][jjj] / gaussian.hyperParameter_a)));
|
||||
}
|
||||
}
|
||||
clusterFile.println();
|
||||
}
|
||||
gaussian.evaluateFinalModelParameters( data );
|
||||
}
|
||||
normalizePMixtureLog10();
|
||||
}
|
||||
|
||||
private void normalizePMixtureLog10() {
|
||||
|
|
|
|||
|
|
@ -1,250 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*/
|
||||
|
||||
public class GenerateVariantClustersV2 extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
|
||||
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private PrintStream CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
|
||||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
@Argument(fullName="weightNovel", shortName="weightNovel", doc="The weight for novel variants during clustering", required=false)
|
||||
private double WEIGHT_NOVELS = 0.0;
|
||||
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
|
||||
private double WEIGHT_DBSNP = 0.0;
|
||||
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for HapMap variants during clustering", required=false)
|
||||
private double WEIGHT_HAPMAP = 1.0;
|
||||
@Argument(fullName="weight1KG", shortName="weight1KG", doc="The weight for 1000 Genomes Project variants during clustering", required=false)
|
||||
private double WEIGHT_1KG = 1.0;
|
||||
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
|
||||
private boolean FORCE_INDEPENDENT = false;
|
||||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
|
||||
private double STD_THRESHOLD = 4.5;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
|
||||
private double QUAL_THRESHOLD = 100.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private ExpandingArrayList<String> annotationKeys;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
private VariantDataManager dataManager;
|
||||
private VariantRecalibratorEngine engine = new VariantRecalibratorEngine(new VariantRecalibratorArgumentCollection()); //BUGBUG: doesn't do anything at the moment
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||
}
|
||||
|
||||
if( !NO_HEADER_LINE ) {
|
||||
CLUSTER_FILE.print("##GenerateVariantClusters = ");
|
||||
CLUSTER_FILE.println("\"" + getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this) + "\"");
|
||||
}
|
||||
|
||||
boolean foundTruthSet = false;
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals("dbsnp") ) {
|
||||
logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP);
|
||||
if( WEIGHT_DBSNP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP);
|
||||
if( WEIGHT_HAPMAP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG);
|
||||
if( WEIGHT_1KG > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundTruthSet ) {
|
||||
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, or 1kg rod bindings. Clustering weights can be specified using -weightDBSNP, -weightHapMap, and -weight1KG");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
int iii = 0;
|
||||
final double annotationValues[] = new double[annotationKeys.size()];
|
||||
for( final String key : annotationKeys ) {
|
||||
annotationValues[iii++] = VariantDataManager.decodeAnnotation( getToolkit().getGenomeLocParser(), key, vc, true );
|
||||
}
|
||||
|
||||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
double weight = 0.0;
|
||||
weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
weight = WEIGHT_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
weight = WEIGHT_DBSNP;
|
||||
}
|
||||
|
||||
if( weight > 0.0 && vc.getPhredScaledQual() > QUAL_THRESHOLD ) {
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
logger.info( "There are " + reduceSum.size() + " variants with > 0 clustering weight and qual > threshold (--qualThreshold = " + QUAL_THRESHOLD + ")" );
|
||||
logger.info( "The annotations used for clustering are: " + annotationKeys );
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
dataManager.trimDataBySTD( STD_THRESHOLD );
|
||||
final GaussianMixtureModel theModel = engine.generateModel( dataManager.data );
|
||||
dataManager.printClusterFileHeader( CLUSTER_FILE );
|
||||
theModel.output( CLUSTER_FILE );
|
||||
}
|
||||
}
|
||||
|
|
@ -133,7 +133,7 @@ public class MultivariateGaussian {
|
|||
public void maximizeGaussian( final List<VariantDatum> data, final double[] empiricalMu, final Matrix empiricalSigma,
|
||||
final double SHRINKAGE, final double DIRICHLET_PARAMETER, final double DEGREES_OF_FREEDOM ) {
|
||||
double sumProb = 0.0;
|
||||
Matrix wishart = new Matrix(mu.length, mu.length);
|
||||
final Matrix wishart = new Matrix(mu.length, mu.length);
|
||||
zeroOutMu();
|
||||
zeroOutSigma();
|
||||
|
||||
|
|
@ -143,10 +143,7 @@ public class MultivariateGaussian {
|
|||
sumProb += prob;
|
||||
incrementMu( datum, prob );
|
||||
}
|
||||
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
mu[iii] = (mu[iii] + SHRINKAGE * empiricalMu[iii]) / (sumProb + SHRINKAGE);
|
||||
}
|
||||
divideEqualsMu( sumProb );
|
||||
|
||||
final double shrinkageFactor = (SHRINKAGE * sumProb) / (SHRINKAGE + sumProb);
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
|
|
@ -170,6 +167,10 @@ public class MultivariateGaussian {
|
|||
sigma.plusEquals( empiricalSigma );
|
||||
sigma.plusEquals( wishart );
|
||||
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
mu[iii] = (sumProb * mu[iii] + SHRINKAGE * empiricalMu[iii]) / (sumProb + SHRINKAGE);
|
||||
}
|
||||
|
||||
pMixtureLog10 = sumProb; // will be normalized later by GaussianMixtureModel so no need to do it every iteration
|
||||
|
||||
hyperParameter_a = sumProb + DEGREES_OF_FREEDOM;
|
||||
|
|
@ -178,4 +179,34 @@ public class MultivariateGaussian {
|
|||
|
||||
resetPVarInGaussian(); // clean up some memory
|
||||
}
|
||||
}
|
||||
|
||||
public void evaluateFinalModelParameters( final List<VariantDatum> data ) {
|
||||
double sumProb = 0.0;
|
||||
zeroOutMu();
|
||||
zeroOutSigma();
|
||||
|
||||
int datumIndex = 0;
|
||||
for( final VariantDatum datum : data ) {
|
||||
final double prob = pVarInGaussian.get(datumIndex++);
|
||||
sumProb += prob;
|
||||
incrementMu( datum, prob );
|
||||
}
|
||||
divideEqualsMu( sumProb );
|
||||
|
||||
datumIndex = 0;
|
||||
final Matrix pVarSigma = new Matrix(mu.length, mu.length);
|
||||
for( final VariantDatum datum : data ) {
|
||||
final double prob = pVarInGaussian.get(datumIndex++);
|
||||
for( int iii = 0; iii < mu.length; iii++ ) {
|
||||
for( int jjj = 0; jjj < mu.length; jjj++ ) {
|
||||
pVarSigma.set(iii, jjj, prob * (datum.annotations[iii]-mu[iii]) * (datum.annotations[jjj]-mu[jjj]));
|
||||
}
|
||||
}
|
||||
sigma.plusEquals( pVarSigma );
|
||||
}
|
||||
sigma.timesEquals( 1.0 / sumProb );
|
||||
|
||||
pMixtureLog10 = sumProb; // will be normalized later by GaussianMixtureModel so no need to do it here
|
||||
resetPVarInGaussian(); // clean up some memory
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/12/11
|
||||
*/
|
||||
|
||||
public class TrainingSet implements Comparable<TrainingSet> {
|
||||
|
||||
public String name;
|
||||
public boolean isKnown;
|
||||
public boolean isTraining;
|
||||
public boolean isTruth;
|
||||
public double prior;
|
||||
|
||||
public TrainingSet( final String name, final boolean isKnown, final boolean isTraining, final boolean isTruth, final double prior ) {
|
||||
this.name = name;
|
||||
this.isKnown = isKnown;
|
||||
this.isTraining = isTraining;
|
||||
this.isTruth = isTruth;
|
||||
this.prior = prior;
|
||||
}
|
||||
|
||||
public int compareTo( final TrainingSet other ) {
|
||||
return Double.compare(this.prior, other.prior);
|
||||
}
|
||||
}
|
||||
|
|
@ -2,15 +2,16 @@ package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
|||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -19,41 +20,31 @@ import java.util.Random;
|
|||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
public ExpandingArrayList<VariantDatum> data;
|
||||
public final double[] meanVector;
|
||||
public final double[] varianceVector; // This is really the standard deviation
|
||||
private ExpandingArrayList<VariantDatum> data;
|
||||
private final double[] meanVector;
|
||||
private final double[] varianceVector; // this is really the standard deviation
|
||||
public final ArrayList<String> annotationKeys;
|
||||
private final ExpandingArrayList<TrainingSet> trainingSets;
|
||||
|
||||
private final static long RANDOM_SEED = 83409701;
|
||||
private final static Random rand = new Random( RANDOM_SEED ); // this is going to cause problems if it is ever used in an integration test
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> data, final List<String> annotationKeys ) {
|
||||
this.data = data;
|
||||
public VariantDataManager( final List<String> annotationKeys ) {
|
||||
this.data = null;
|
||||
this.annotationKeys = new ArrayList<String>( annotationKeys );
|
||||
meanVector = new double[this.annotationKeys.size()];
|
||||
varianceVector = new double[this.annotationKeys.size()];
|
||||
trainingSets = new ExpandingArrayList<TrainingSet>();
|
||||
}
|
||||
|
||||
public VariantDataManager( final List<String> annotationLines ) {
|
||||
data = null;
|
||||
annotationKeys = new ArrayList<String>( annotationLines.size() );
|
||||
meanVector = new double[annotationLines.size()];
|
||||
varianceVector = new double[annotationLines.size()];
|
||||
|
||||
int jjj = 0;
|
||||
for( final String line : annotationLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
annotationKeys.add(vals[1]);
|
||||
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||
jjj++;
|
||||
}
|
||||
public void setData( final ExpandingArrayList<VariantDatum> data ) {
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public void add( final VariantDatum datum ) {
|
||||
data.add( datum );
|
||||
public ExpandingArrayList<VariantDatum> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
|
|
@ -69,7 +60,7 @@ public class VariantDataManager {
|
|||
}
|
||||
meanVector[jjj] = theMean;
|
||||
varianceVector[jjj] = theSTD;
|
||||
for(final VariantDatum datum : data ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
datum.annotations[jjj] = ( datum.annotations[jjj] - theMean ) / theSTD; // Each data point is now [ (x - mean) / standard deviation ]
|
||||
}
|
||||
}
|
||||
|
|
@ -78,6 +69,51 @@ public class VariantDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
public void addTrainingSet( final TrainingSet trainingSet ) {
|
||||
trainingSets.add( trainingSet );
|
||||
}
|
||||
|
||||
public boolean checkHasTrainingSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isTraining ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean checkHasTruthSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isTruth ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean checkHasKnownSet() {
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
if( trainingSet.isKnown ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> getTrainingData() {
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.atTrainingSite && datum.originalQual > 80.0 ) { //BUGBUG: VRAC argument
|
||||
trainingData.add( datum );
|
||||
}
|
||||
}
|
||||
trimDataBySTD(trainingData, 4.5); //BUGBUG: VRAC argument
|
||||
logger.info( "Training with " + trainingData.size() + " variants found in the training set(s)." );
|
||||
return trainingData;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> selectWorstVariants( final float bottomPercentage ) {
|
||||
Collections.sort( data );
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<VariantDatum>();
|
||||
trainingData.addAll( data.subList(0, Math.round(bottomPercentage * data.size())) );
|
||||
logger.info( "Training with worst " + bottomPercentage * 100.0f + "% of data --> " + trainingData.size() + " variants with LOD <= " + String.format("%.4f", data.get(Math.round(bottomPercentage * data.size())).lod) + "." );
|
||||
return trainingData;
|
||||
}
|
||||
|
||||
private double mean( final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.size();
|
||||
|
|
@ -96,31 +132,32 @@ public class VariantDataManager {
|
|||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
public void trimDataBySTD( final double STD_THRESHOLD ) {
|
||||
public static void trimDataBySTD( final ExpandingArrayList<VariantDatum> listData, final double STD_THRESHOLD ) {
|
||||
final ExpandingArrayList<VariantDatum> dataToRemove = new ExpandingArrayList<VariantDatum>();
|
||||
for( final VariantDatum datum : data ) {
|
||||
for( final VariantDatum datum : listData ) {
|
||||
boolean remove = false;
|
||||
for( double val : datum.annotations ) {
|
||||
if( Math.abs(val) > STD_THRESHOLD ) {
|
||||
remove = true;
|
||||
}
|
||||
remove = remove || (Math.abs(val) > STD_THRESHOLD);
|
||||
}
|
||||
if( remove ) { dataToRemove.add( datum ); }
|
||||
}
|
||||
data.removeAll( dataToRemove );
|
||||
listData.removeAll( dataToRemove );
|
||||
}
|
||||
|
||||
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
for( int jjj = 0; jjj < annotationKeys.size(); jjj++ ) {
|
||||
outputFile.println(String.format("@!ANNOTATION," + annotationKeys.get(jjj) + ",%.8f,%.8f", meanVector[jjj], varianceVector[jjj]));
|
||||
public double[] decodeAnnotations( final GenomeLocParser genomeLocParser, final VariantContext vc, final boolean jitter ) {
|
||||
final double[] annotations = new double[annotationKeys.size()];
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
annotations[iii++] = decodeAnnotation( genomeLocParser, key, vc, jitter );
|
||||
}
|
||||
return annotations;
|
||||
}
|
||||
|
||||
public static double decodeAnnotation( final GenomeLocParser genomeLocParser, final String annotationKey, final VariantContext vc, final boolean jitter ) {
|
||||
private static double decodeAnnotation( final GenomeLocParser genomeLocParser, final String annotationKey, final VariantContext vc, final boolean jitter ) {
|
||||
double value;
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // HRun values must be jittered a bit to work in this GMM
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
value += -0.25 + 0.5 * rand.nextDouble(); //BUGBUG: recreated here to match the integration tests
|
||||
value += -0.25 + 0.5 * rand.nextDouble();
|
||||
} else if( annotationKey.equals("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else {
|
||||
|
|
@ -133,10 +170,26 @@ public class VariantDataManager {
|
|||
return value;
|
||||
}
|
||||
|
||||
public double[] normalizeDatum( final double[] annotationValues ) {
|
||||
for(int iii = 0; iii < annotationValues.length; iii++) {
|
||||
annotationValues[iii] = (annotationValues[iii] - meanVector[iii]) / varianceVector[iii];
|
||||
public void parseTrainingSets( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context, final VariantContext evalVC, final VariantDatum datum ) {
|
||||
datum.isKnown = false;
|
||||
datum.atTruthSite = false;
|
||||
datum.atTrainingSite = false;
|
||||
datum.prior = 3.0;
|
||||
for( final TrainingSet trainingSet : trainingSets ) {
|
||||
final Collection<VariantContext> vcs = tracker.getVariantContexts( ref, trainingSet.name, null, context.getLocation(), false, true );
|
||||
final VariantContext trainVC = ( vcs.size() != 0 ? vcs.iterator().next() : null );
|
||||
if( trainVC != null && trainVC.isVariant() && !trainVC.isFiltered() && ((evalVC.isSNP() && trainVC.isSNP()) || (evalVC.isIndel() && trainVC.isIndel())) && (!trainVC.hasGenotypes() || trainVC.isPolymorphic()) ) {
|
||||
datum.isKnown = datum.isKnown || trainingSet.isKnown;
|
||||
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
|
||||
datum.atTrainingSite = datum.atTrainingSite || trainingSet.isTraining;
|
||||
datum.prior = Math.max( datum.prior, trainingSet.prior );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
|
||||
for( final VariantDatum datum : data ) { //BUGBUG: quick and dirty for now, only works for SNPs.
|
||||
RECAL_FILE.println(String.format("%s,%d,%.4f", datum.pos.getContig(), datum.pos.getStart(), datum.lod));
|
||||
}
|
||||
return annotationValues;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -11,9 +13,12 @@ public class VariantDatum implements Comparable<VariantDatum> {
|
|||
public double[] annotations;
|
||||
public boolean isKnown;
|
||||
public double lod;
|
||||
public double pVarGivenModel;
|
||||
public boolean atTruthSite;
|
||||
public boolean atTrainingSite;
|
||||
public boolean isTransition;
|
||||
public double originalQual;
|
||||
public double prior;
|
||||
public GenomeLoc pos;
|
||||
public MultivariateGaussian assignment; // used in K-means implementation
|
||||
|
||||
public int compareTo( final VariantDatum other ) {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,6 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
|
@ -28,7 +24,7 @@ public class VariantRecalibratorEngine {
|
|||
|
||||
private final static long RANDOM_SEED = 91801305;
|
||||
private final Random rand = new Random( RANDOM_SEED );
|
||||
private final static double MIN_PROB_CONVERGENCE = 1E-5;
|
||||
private final static double MIN_PROB_CONVERGENCE_LOG10 = 1.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Public Methods to interface with the Engine
|
||||
|
|
@ -40,23 +36,23 @@ public class VariantRecalibratorEngine {
|
|||
}
|
||||
|
||||
public GaussianMixtureModel generateModel( final List<VariantDatum> data ) {
|
||||
final GaussianMixtureModel model = new GaussianMixtureModel( 4, 3, 0.0001, 1000.0 ); //BUGBUG: VRAC arguments
|
||||
final GaussianMixtureModel model = new GaussianMixtureModel( 32, 4, 0.0001, 0.0001 ); //BUGBUG: VRAC arguments
|
||||
variationalBayesExpectationMaximization( model, data );
|
||||
return model;
|
||||
}
|
||||
|
||||
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model ) {
|
||||
public void evaluateData( final List<VariantDatum> data, final GaussianMixtureModel model, final boolean evaluateContrastively ) {
|
||||
if( !model.isModelReadyForEvaluation ) {
|
||||
model.precomputeDenominatorForEvaluation();
|
||||
}
|
||||
|
||||
logger.info("Evaluating full set of " + data.size() + " variants...");
|
||||
for( final VariantDatum datum : data ) {
|
||||
datum.pVarGivenModel = evaluateDatum( datum, model );
|
||||
final double thisLod = evaluateDatum( datum, model );
|
||||
datum.lod = ( evaluateContrastively ? (datum.prior + datum.lod - thisLod) : thisLod );
|
||||
}
|
||||
}
|
||||
|
||||
public void evaluateDataContrastively( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for initialization
|
||||
/////////////////////////////
|
||||
|
|
@ -74,21 +70,24 @@ public class VariantRecalibratorEngine {
|
|||
model.initializeRandomModel( data, rand );
|
||||
|
||||
// The VBEM loop
|
||||
double previousLikelihood = -1E20;
|
||||
double previousLikelihood = model.expectationStep( data );
|
||||
double currentLikelihood;
|
||||
int iteration = 1;
|
||||
while( iteration < 200 ) { //BUGBUG: VRAC.maxIterations
|
||||
currentLikelihood = model.expectationStep( data );
|
||||
int iteration = 0;
|
||||
logger.info("Finished iteration " + iteration );
|
||||
while( iteration < 100 ) { //BUGBUG: VRAC.maxIterations
|
||||
iteration++;
|
||||
model.maximizationStep( data );
|
||||
currentLikelihood = model.expectationStep( data );
|
||||
|
||||
logger.info("Finished iteration " + iteration );
|
||||
iteration++;
|
||||
if( Math.abs(currentLikelihood - previousLikelihood) < MIN_PROB_CONVERGENCE ) {
|
||||
if( Math.abs(currentLikelihood - previousLikelihood) < MIN_PROB_CONVERGENCE_LOG10 ) {
|
||||
logger.info("Convergence!");
|
||||
return; // Early return here because we have converged!
|
||||
break;
|
||||
}
|
||||
previousLikelihood = currentLikelihood;
|
||||
}
|
||||
|
||||
model.evaluateFinalModelParameters( data );
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
|
|
|
|||
|
|
@ -1,406 +0,0 @@
|
|||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.playground.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 17, 2010
|
||||
*
|
||||
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*/
|
||||
|
||||
public class VariantRecalibratorV2 extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
public static final String VQS_LOD_KEY = "VQSLOD";
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
|
||||
private File CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
@Output(doc="File to which recalibrated variants should be written", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
|
||||
private double TARGET_TITV = 2.07;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.3;
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="priorNovel", shortName="priorNovel", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
|
||||
private double PRIOR_NOVELS = 2.0;
|
||||
@Argument(fullName="priorDBSNP", shortName="priorDBSNP", doc="A prior on the quality of dbSNP variants, a phred scaled probability of being true.", required=false)
|
||||
private double PRIOR_DBSNP = 10.0;
|
||||
@Argument(fullName="priorHapMap", shortName="priorHapMap", doc="A prior on the quality of HapMap variants, a phred scaled probability of being true.", required=false)
|
||||
private double PRIOR_HAPMAP = 15.0;
|
||||
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
|
||||
private double PRIOR_1KG = 12.0;
|
||||
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
|
||||
private String PATH_TO_RSCRIPT = "Rscript";
|
||||
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
|
||||
private String PATH_TO_RESOURCES = "R/";
|
||||
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
|
||||
private double SINGLETON_FP_RATE = 0.5;
|
||||
@Argument(fullName="max_ac_prior", shortName="maxACPrior", doc="Maximum value for the prior expectation based on allele count. Needed because (1 - 0.5^x) approaches 1.0 very quickly.", required=false)
|
||||
private double MAX_AC_PRIOR = 0.99;
|
||||
@Argument(fullName="dontTrustACField", shortName="dontTrustACField", doc="If specified the VR will not use the AC field and will instead always parse the genotypes to figure out how many variant chromosomes there are at a given site.", required=false)
|
||||
private boolean NEVER_TRUST_AC_FIELD = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private double QUAL_THRESHOLD = 0.0;
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
||||
private File DEBUG_FILE = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
|
||||
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
|
||||
|
||||
public enum SelectionMetricType {
|
||||
NOVEL_TITV,
|
||||
TRUTH_SENSITIVITY
|
||||
}
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
private Set<String> truthNames = new HashSet<String>();
|
||||
private NestedHashMap priorCache = new NestedHashMap();
|
||||
private boolean trustACField = false;
|
||||
private PrintStream tranchesStream = null;
|
||||
private int nTruthSites = 0;
|
||||
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( new VariantRecalibratorArgumentCollection() );
|
||||
private GaussianMixtureModel model;
|
||||
private VariantDataManager dataManager;
|
||||
|
||||
private static double round4(double num) {
|
||||
double result = num * 10000.0;
|
||||
result = Math.round(result);
|
||||
result = result / 10000.0;
|
||||
return result;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||
}
|
||||
|
||||
final Pattern COMMENT_PATTERN = Pattern.compile("^##.*");
|
||||
final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> gaussianLines = new ExpandingArrayList<String>();
|
||||
|
||||
try {
|
||||
for ( final String line : new XReadLines( CLUSTER_FILE ) ) {
|
||||
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||
annotationLines.add(line);
|
||||
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||
gaussianLines.add(line);
|
||||
} else if( !COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
throw new UserException.MalformedFile(CLUSTER_FILE, "Could not parse line: " + line);
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(CLUSTER_FILE, e);
|
||||
}
|
||||
|
||||
dataManager = new VariantDataManager( annotationLines );
|
||||
model = new GaussianMixtureModel( annotationLines, gaussianLines );
|
||||
|
||||
boolean foundDBSNP = false;
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if( d.getName().startsWith("truth") ) {
|
||||
truthNames.add(d.getName());
|
||||
logger.info("Found truth variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
|
||||
logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
|
||||
foundDBSNP = true;
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for with prior probability = Q" + PRIOR_1KG);
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if(!foundDBSNP) {
|
||||
throw new UserException.CommandLineException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
hInfo.add(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "log10-scaled probability of variant being true under the trained gaussian mixture model"));
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
||||
try {
|
||||
tranchesStream = new PrintStream(TRANCHES_FILE);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, truthNames, null, context.getLocation(), true, true);
|
||||
boolean isAtTruthSite = false;
|
||||
for( final VariantContext vcTruth : vcsTruth ) {
|
||||
if( vcTruth != null && vcTruth.isVariant() && !vcTruth.isFiltered() && (!vcTruth.hasGenotypes() || vcTruth.isPolymorphic()) ) {
|
||||
isAtTruthSite = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
nTruthSites += isAtTruthSite ? 1 : 0;
|
||||
|
||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( vc != null ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
if( vc.getPhredScaledQual() >= QUAL_THRESHOLD ) {
|
||||
int iii = 0;
|
||||
final double annotationValues[] = new double[dataManager.annotationKeys.size()];
|
||||
for( final String key : dataManager.annotationKeys ) {
|
||||
annotationValues[iii++] = VariantDataManager.decodeAnnotation( getToolkit().getGenomeLocParser(), key, vc, false );
|
||||
}
|
||||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = dataManager.normalizeDatum( annotationValues );
|
||||
|
||||
if (vc.isSNP())
|
||||
variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;
|
||||
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
double knownPrior_qScore = PRIOR_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
knownPrior_qScore = PRIOR_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
knownPrior_qScore = PRIOR_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
knownPrior_qScore = PRIOR_DBSNP;
|
||||
}
|
||||
|
||||
// If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up.
|
||||
final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0)));
|
||||
if( !trustACField && !NEVER_TRUST_AC_FIELD ) {
|
||||
if( !vc.getAttributes().containsKey("AC") ) {
|
||||
NEVER_TRUST_AC_FIELD = true;
|
||||
} else {
|
||||
if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) {
|
||||
// The AC field is correct at this record so we trust it forever
|
||||
trustACField = true;
|
||||
} else { // We found a record in which the AC field wasn't correct but we are trying to trust it
|
||||
throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
|
||||
"First observed at " + vc.getChr() + ":" + vc.getStart());
|
||||
}
|
||||
}
|
||||
}
|
||||
if( trustACField && alleleCount == -1 ) {
|
||||
throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
|
||||
"First observed at " + vc.getChr() + ":" + vc.getStart());
|
||||
}
|
||||
|
||||
final Object[] priorKey = new Object[2];
|
||||
priorKey[0] = alleleCount;
|
||||
priorKey[1] = knownPrior_qScore;
|
||||
|
||||
Double priorLodFactor = (Double)priorCache.get( priorKey );
|
||||
|
||||
// If this prior factor hasn't been calculated before, do so now
|
||||
if(priorLodFactor == null) {
|
||||
final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
|
||||
final double acPrior = Math.min( MAX_AC_PRIOR, 1.0 - Math.pow(SINGLETON_FP_RATE, alleleCount) ); //theModel.getAlleleCountPrior( alleleCount, MAX_AC_PRIOR );
|
||||
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
|
||||
|
||||
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
|
||||
throw new UserException.CommandLineException("Something is wrong with the priors that were entered by the user: Prior = " + totalPrior); // TODO - fix this up later
|
||||
}
|
||||
|
||||
priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);
|
||||
|
||||
priorCache.put( priorLodFactor, false, priorKey );
|
||||
}
|
||||
|
||||
final ArrayList<VariantDatum> datumList = new ArrayList<VariantDatum>(1);
|
||||
datumList.add( variantDatum );
|
||||
engine.evaluateData(datumList, model);
|
||||
final double lod = priorLodFactor + datumList.get(0).pVarGivenModel;
|
||||
|
||||
if ( lod == Double.NEGATIVE_INFINITY ) {
|
||||
throw new ReviewedStingException("Negative infinity detected during summation at " + vc);
|
||||
}
|
||||
|
||||
//variantDatum.pos = vc.getStart();
|
||||
variantDatum.lod = round4(lod);
|
||||
|
||||
// deal with the truth calculation
|
||||
variantDatum.atTruthSite = isAtTruthSite;
|
||||
|
||||
mapList.add( variantDatum );
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
attrs.put(VariantRecalibratorV2.VQS_LOD_KEY, String.format("%.4f", lod));
|
||||
VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), new HashSet<String>(), attrs);
|
||||
|
||||
vcfWriter.add( newVC, ref.getBase() );
|
||||
}
|
||||
|
||||
} else { // not a SNP or is filtered so just dump it out to the VCF file
|
||||
vcfWriter.add( vc, ref.getBase() );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
dataManager.data = reduceSum;
|
||||
|
||||
// deal with truth information
|
||||
int nCallsAtTruth = TrancheManager.countCallsAtTruth(dataManager.data, Double.NEGATIVE_INFINITY);
|
||||
logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
|
||||
nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));
|
||||
|
||||
TrancheManager.SelectionMetric metric = new TrancheManager.TruthSensitivityMetric(nCallsAtTruth);
|
||||
List<Tranche> tranches = TrancheManager.findTranches( dataManager.data, FDR_TRANCHES, metric, DEBUG_FILE ); //BUGBUG: recreated here to match the integration tests
|
||||
tranchesStream.print(Tranche.tranchesString(tranches));
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
|
||||
logger.info( rScriptTranchesCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Process p;
|
||||
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
|
||||
p.waitFor();
|
||||
} catch ( Exception e ) {
|
||||
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -52,7 +52,7 @@ public class NestedHashMap{
|
|||
return null;
|
||||
}
|
||||
|
||||
public synchronized void put( final Object value, final Object... keys ) {
|
||||
public synchronized void put( final Object value, final Object... keys ) { // WARNING! value comes before the keys!
|
||||
this.put(value, false, keys );
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -15,110 +15,62 @@ public class VariantRecalibrationWalkersV2IntegrationTest extends WalkerTest {
|
|||
|
||||
private static class VRTest {
|
||||
String inVCF;
|
||||
String clusterMD5;
|
||||
String tranchesMD5;
|
||||
String recalVCFMD5;
|
||||
String recalMD5;
|
||||
String cutVCFMD5;
|
||||
public VRTest(String inVCF, String clusterMD5, String tranchesMD5, String recalVCFMD5, String cutVCFMD5) {
|
||||
public VRTest(String inVCF, String tranchesMD5, String recalMD5, String cutVCFMD5) {
|
||||
this.inVCF = validationDataLocation + inVCF;
|
||||
this.clusterMD5 = clusterMD5;
|
||||
this.tranchesMD5 = tranchesMD5;
|
||||
this.recalVCFMD5 = recalVCFMD5;
|
||||
this.recalMD5 = recalMD5;
|
||||
this.cutVCFMD5 = cutVCFMD5;
|
||||
}
|
||||
}
|
||||
|
||||
VRTest yriTrio = new VRTest("yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf",
|
||||
"f65c27ee40053adc72dd0bfbb628e4d7", // cluster file
|
||||
"dce581b880ffb6ea39cbada1ecc95915", // tranches
|
||||
"c3e8a2f43656eab7d847dbf850f844a6", // recalVCF
|
||||
"50f752a72643db9ad0aa94b3fc4e23d6"); // cut VCF
|
||||
"aa211561e6e9b1e323dec4eeaa318088", // tranches
|
||||
"226adf939e90ad20599108e77ad25dae", // recal file
|
||||
"345a159fd54f5c38500bb20f2de13737"); // cut VCF
|
||||
|
||||
VRTest lowPass = new VRTest("lowpass.N3.chr1.raw.vcf",
|
||||
"bda8f17cfc19d23e7e51f99e547f4b3d", // cluster file
|
||||
"66edae83c50f4e8601fef7fafba774af", // tranches
|
||||
"0123537e373657386068a534c0f5c91b", // recalVCF
|
||||
"2172368e8585841e5ad96c95d0827c4b"); // cut VCF
|
||||
"f4f2057a8c010206f0f56deff0602452", // tranches
|
||||
"dd36d252a6dc6e3207addddae731dd88", // recal file
|
||||
"f1ffd3bb1da3863ccb298a3373d6590a"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
return new Object[][]{ {yriTrio}, {lowPass} };
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VRTest", enabled = true)
|
||||
public void testGenerateVariantClusters(VRTest params) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 1.0 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClustersV2" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an MQ -an SB" +
|
||||
" -clusterFile %s",
|
||||
Arrays.asList(params.clusterMD5));
|
||||
executeTest("testGenerateVariantClusters-"+params.inVCF, spec).getFirst();
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VRTest",dependsOnMethods="testGenerateVariantClusters")
|
||||
@Test(dataProvider = "VRTest")
|
||||
public void testVariantRecalibrator(VRTest params) {
|
||||
//System.out.printf("PARAMS FOR %s is %s%n", vcf, clusterFile);
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -B:truthHapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -T VariantRecalibratorV2" +
|
||||
" -B:omni,VCF " + comparisonDataLocation + "Validated/Omni2.5_chip/1212samples.b36.vcf" +
|
||||
" -T ContrastiveRecalibrator" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" --ignore_filter HARD_TO_VALIDATE" +
|
||||
" -clusterFile " + getFileForMD5(params.clusterMD5) +
|
||||
" -sm TRUTH_SENSITIVITY" +
|
||||
" -o %s" +
|
||||
" -L 1:50,000,000-120,000,000" +
|
||||
" -an QD -an MQ -an SB -an AF" +
|
||||
" -recalFile %s" +
|
||||
" -tranchesFile %s",
|
||||
Arrays.asList(params.recalVCFMD5, params.tranchesMD5));
|
||||
Arrays.asList(params.recalMD5, params.tranchesMD5));
|
||||
executeTest("testVariantRecalibrator-"+params.inVCF, spec).getFirst();
|
||||
}
|
||||
|
||||
@Test(dataProvider = "VRTest",dependsOnMethods="testVariantRecalibrator")
|
||||
public void testApplyVariantCuts(VRTest params) {
|
||||
public void testApplyRecalibration(VRTest params) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -T ApplyVariantCutsV2" +
|
||||
" -L 1:20,000,000-100,000,000" +
|
||||
" -B:input,VCF " + getFileForMD5(params.recalVCFMD5) +
|
||||
" -T ApplyRecalibration" +
|
||||
" -L 1:60,000,000-115,000,000" +
|
||||
" -B:input,VCF " + params.inVCF +
|
||||
" -o %s" +
|
||||
" -tranchesFile " + getFileForMD5(params.tranchesMD5),
|
||||
" -tranchesFile " + getFileForMD5(params.tranchesMD5) +
|
||||
" -recalFile " + getFileForMD5(params.recalMD5),
|
||||
Arrays.asList(params.cutVCFMD5));
|
||||
executeTest("testApplyVariantCuts-"+params.inVCF, spec);
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void testFailWithBadAnnotation() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-R " + b36KGReference +
|
||||
" -NO_HEADER" +
|
||||
" --DBSNP " + GATKDataLocation + "dbsnp_129_b36.rod" +
|
||||
" -B:hapmap,VCF " + comparisonDataLocation + "Validated/HapMap/3.2/sites_r27_nr.b36_fwd.vcf" +
|
||||
" -weightDBSNP 0.2 -weightHapMap 1.0" +
|
||||
" -T GenerateVariantClustersV2" +
|
||||
" -B:input,VCF " + lowPass.inVCF +
|
||||
" -L 1:50,000,000-200,000,000" +
|
||||
" -qual 50.0" +
|
||||
" --ignore_filter GATK_STANDARD" +
|
||||
" -an QD -an HRun -an ThisAnnotationIsBAD" + // There is a bad annotation here
|
||||
" -clusterFile %s",
|
||||
1, // just one output file
|
||||
UserException.MalformedFile.class);
|
||||
executeTest("testFailWithBadAnnotation", spec);
|
||||
executeTest("testApplyRecalibration-"+params.inVCF, spec);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue