Good bye VQSR v1. This commit will break the build.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5739 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2dacf1b2b2
commit
44a717f63a
|
|
@ -1,202 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Jun 2, 2010
|
||||
*
|
||||
* @help.summary Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
|
||||
*/
|
||||
|
||||
public class ApplyVariantCuts extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output( doc="The output filtered VCF file", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="fdr_filter_level", shortName="fdr_filter_level", doc="The FDR level at which to start filtering.", required=false)
|
||||
private double FDR_FILTER_LEVEL = 0.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
for ( Tranche t : Tranche.readTraches(TRANCHES_FILE) ) {
|
||||
if ( t.fdr >= FDR_FILTER_LEVEL) {
|
||||
tranches.add(t);
|
||||
//statusMsg = "Keeping, above FDR threshold";
|
||||
}
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from worst to best
|
||||
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
|
||||
// setup the header fields
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
Tranche t = tranches.get(iii);
|
||||
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("FDR tranche level at VSQ Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
|
||||
}
|
||||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("FDR tranche level at VQS Lod < " + tranches.get(0).minVQSLod)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the FDR Filter level " + FDR_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return 1;
|
||||
}
|
||||
|
||||
for( VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
|
||||
if( vc != null ) {
|
||||
String filterString = null;
|
||||
if( !vc.isFiltered() ) {
|
||||
try {
|
||||
final double lod = vc.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY);
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
if (i == tranches.size() - 1) {
|
||||
filterString = VCFConstants.PASSES_FILTERS_v4;
|
||||
} else {
|
||||
filterString = tranche.name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( filterString == null ) {
|
||||
filterString = tranches.get(0).name+"+";
|
||||
}
|
||||
|
||||
if ( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
|
||||
Set<String> filters = new HashSet<String>();
|
||||
filters.add(filterString);
|
||||
vc = VariantContext.modifyFilters(vc, filters);
|
||||
}
|
||||
} catch ( ClassCastException e ) {
|
||||
throw new UserException.MalformedFile(vc.getSource(), "Invalid value for VQS key " + VariantRecalibrator.VQS_LOD_KEY + " at variant " + vc, e);
|
||||
}
|
||||
}
|
||||
|
||||
vcfWriter.add( vc, ref.getBase() );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public Integer reduce( final Integer mapValue, final Integer reduceSum ) {
|
||||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
public void onTraversalDone( Integer reduceSum ) {
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,252 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Feb 11, 2010
|
||||
*
|
||||
* @help.summary Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations producing calibrated variant cluster parameters which can be applied to other datasets
|
||||
*/
|
||||
|
||||
public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="cluster_file", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private PrintStream CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
|
||||
private String[] USE_ANNOTATIONS = null;
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
|
||||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
@Argument(fullName="weightNovel", shortName="weightNovel", doc="The weight for novel variants during clustering", required=false)
|
||||
private double WEIGHT_NOVELS = 0.0;
|
||||
@Argument(fullName="weightDBSNP", shortName="weightDBSNP", doc="The weight for dbSNP variants during clustering", required=false)
|
||||
private double WEIGHT_DBSNP = 0.0;
|
||||
@Argument(fullName="weightHapMap", shortName="weightHapMap", doc="The weight for HapMap variants during clustering", required=false)
|
||||
private double WEIGHT_HAPMAP = 1.0;
|
||||
@Argument(fullName="weight1KG", shortName="weight1KG", doc="The weight for 1000 Genomes Project variants during clustering", required=false)
|
||||
private double WEIGHT_1KG = 1.0;
|
||||
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
|
||||
private boolean FORCE_INDEPENDENT = false;
|
||||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
|
||||
private double STD_THRESHOLD = 4.5;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
|
||||
private double QUAL_THRESHOLD = 100.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
|
||||
private double DIRICHLET_PARAMETER = 1000.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private ExpandingArrayList<String> annotationKeys;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private Set<String> inputNames = new HashSet<String>();
|
||||
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
|
||||
private VariantGaussianMixtureModel theModel = new VariantGaussianMixtureModel();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
//if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }
|
||||
|
||||
annotationKeys = new ExpandingArrayList<String>(Arrays.asList(USE_ANNOTATIONS));
|
||||
|
||||
if( IGNORE_INPUT_FILTERS != null ) {
|
||||
ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
|
||||
}
|
||||
|
||||
if( !NO_HEADER_LINE ) {
|
||||
CLUSTER_FILE.print("##GenerateVariantClusters = ");
|
||||
CLUSTER_FILE.println("\"" + getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this) + "\"");
|
||||
}
|
||||
|
||||
boolean foundTruthSet = false;
|
||||
for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
|
||||
if( d.getName().startsWith("input") ) {
|
||||
inputNames.add(d.getName());
|
||||
logger.info("Found input variant track with name " + d.getName());
|
||||
} else if ( d.getName().equals("dbsnp") ) {
|
||||
logger.info("Found dbSNP track for use in training with weight = " + WEIGHT_DBSNP);
|
||||
if( WEIGHT_DBSNP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
} else if ( d.getName().equals("hapmap") ) {
|
||||
logger.info("Found HapMap track for use in training with weight = " + WEIGHT_HAPMAP);
|
||||
if( WEIGHT_HAPMAP > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else if ( d.getName().equals("1kg") ) {
|
||||
logger.info("Found 1KG track for use in training with weight = " + WEIGHT_1KG);
|
||||
if( WEIGHT_1KG > 0.0 ) {
|
||||
foundTruthSet = true;
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.info("Not evaluating ROD binding " + d.getName());
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundTruthSet ) {
|
||||
throw new UserException.CommandLineException("No truth set found! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, or 1kg rod bindings. Clustering weights can be specified using -weightDBSNP, -weightHapMap, and -weight1KG");
|
||||
}
|
||||
|
||||
if( inputNames.size() == 0 ) {
|
||||
throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
}
|
||||
|
||||
final double annotationValues[] = new double[annotationKeys.size()];
|
||||
|
||||
for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null ) {
|
||||
if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
|
||||
int iii = 0;
|
||||
for( final String key : annotationKeys ) {
|
||||
annotationValues[iii++] = theModel.decodeAnnotation( getToolkit().getGenomeLocParser(), key, vc, true );
|
||||
}
|
||||
|
||||
final VariantDatum variantDatum = new VariantDatum();
|
||||
variantDatum.annotations = annotationValues;
|
||||
|
||||
final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
|
||||
final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
|
||||
final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
|
||||
final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
|
||||
final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );
|
||||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
variantDatum.weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
variantDatum.weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
variantDatum.weight = WEIGHT_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
variantDatum.weight = WEIGHT_DBSNP;
|
||||
}
|
||||
|
||||
if( variantDatum.weight > 0.0 && vc.getPhredScaledQual() > QUAL_THRESHOLD ) {
|
||||
mapList.add( variantDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mapList;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
logger.info( "There are " + reduceSum.size() + " variants with > 0 clustering weight and qual > threshold (--qualThreshold = " + QUAL_THRESHOLD + ")" );
|
||||
logger.info( "The annotations used for clustering are: " + annotationKeys );
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
dataManager.normalizeData(); // Each data point is now [ (x - mean) / standard deviation ]
|
||||
|
||||
// Create the Gaussian Mixture Model model and run it
|
||||
theModel = new VariantGaussianMixtureModel( dataManager, MAX_GAUSSIANS, MAX_ITERATIONS, FORCE_INDEPENDENT, STD_THRESHOLD, SHRINKAGE, DIRICHLET_PARAMETER );
|
||||
|
||||
theModel.run( CLUSTER_FILE );
|
||||
}
|
||||
}
|
||||
|
|
@ -1,201 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*/
|
||||
|
||||
public class Tranche implements Comparable<Tranche> {
|
||||
private static final int CURRENT_VERSION = 3;
|
||||
|
||||
public double fdr, minVQSLod, targetTiTv, knownTiTv, novelTiTv;
|
||||
public int numKnown,numNovel;
|
||||
public String name;
|
||||
|
||||
int accessibleTruthSites = 0;
|
||||
int callsAtTruthSites = 0;
|
||||
|
||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites) {
|
||||
this(fdr, targetTiTv, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, "anonymous");
|
||||
}
|
||||
|
||||
public Tranche(double fdr, double targetTiTv, double minVQSLod, int numKnown, double knownTiTv, int numNovel, double novelTiTv, int accessibleTruthSites, int callsAtTruthSites, String name ) {
|
||||
this.fdr = fdr;
|
||||
this.targetTiTv = targetTiTv;
|
||||
this.minVQSLod = minVQSLod;
|
||||
this.novelTiTv = novelTiTv;
|
||||
this.numNovel = numNovel;
|
||||
this.knownTiTv = knownTiTv;
|
||||
this.numKnown = numKnown;
|
||||
this.name = name;
|
||||
|
||||
this.accessibleTruthSites = accessibleTruthSites;
|
||||
this.callsAtTruthSites = callsAtTruthSites;
|
||||
|
||||
if ( fdr < 0.0 )
|
||||
throw new UserException("Target FDR is unreasonable " + fdr);
|
||||
|
||||
if ( targetTiTv < 0.5 || targetTiTv > 10 )
|
||||
throw new UserException("Target Ti/Tv ratio is unreasonable " + targetTiTv);
|
||||
|
||||
if ( numKnown < 0 || numNovel < 0)
|
||||
throw new ReviewedStingException("Invalid tranch - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
|
||||
|
||||
if ( name == null )
|
||||
throw new ReviewedStingException("BUG -- name cannot be null");
|
||||
}
|
||||
|
||||
private double getTruthSensitivity() {
|
||||
return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0;
|
||||
}
|
||||
|
||||
public int compareTo(Tranche other) {
|
||||
return Double.compare(this.fdr, other.fdr);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("Tranche fdr=%.2f minVQSLod=%.4f known=(%d @ %.2f) novel=(%d @ %.2f) truthSites(%d accessible, %d called), name=%s]",
|
||||
fdr, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an appropriately formated string representing the raw tranches file on disk.
|
||||
*
|
||||
* @param rawTranches
|
||||
* @return
|
||||
*/
|
||||
public static String tranchesString(List<Tranche> rawTranches) {
|
||||
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
|
||||
PrintStream stream = new PrintStream(bytes);
|
||||
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
tranches.addAll(rawTranches);
|
||||
Collections.sort(tranches);
|
||||
|
||||
stream.println("# Variant quality score tranches file");
|
||||
stream.println("# Version number " + CURRENT_VERSION);
|
||||
stream.println("FDRtranche,targetTiTv,numKnown,numNovel,knownTiTv,novelTiTv,minVQSLod,filterName,accessibleTruthSites,callsAtTruthSites,truthSensitivity");
|
||||
|
||||
Tranche prev = null;
|
||||
for ( Tranche t : tranches ) {
|
||||
stream.printf("%.2f,%.2f,%d,%d,%.4f,%.4f,%.4f,FDRtranche%.2fto%.2f,%d,%d,%.4f%n",
|
||||
t.fdr,t.targetTiTv,t.numKnown,t.numNovel,t.knownTiTv,t.novelTiTv, t.minVQSLod,
|
||||
(prev == null ? 0.0 : prev.fdr), t.fdr, t.accessibleTruthSites, t.callsAtTruthSites, t.getTruthSensitivity());
|
||||
prev = t;
|
||||
}
|
||||
|
||||
return bytes.toString();
|
||||
}
|
||||
|
||||
private static double getDouble(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) ) {
|
||||
String val = bindings.get(key);
|
||||
return Double.valueOf(val);
|
||||
}
|
||||
else if ( required ) {
|
||||
throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int getInteger(Map<String,String> bindings, String key, boolean required) {
|
||||
if ( bindings.containsKey(key) )
|
||||
return Integer.valueOf(bindings.get(key));
|
||||
else if ( required ) {
|
||||
throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key);
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a list of tranches, sorted from most to least specific, read in from file f
|
||||
*
|
||||
* @param f
|
||||
* @return
|
||||
*/
|
||||
public static List<Tranche> readTraches(File f) {
|
||||
String[] header = null;
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
|
||||
try {
|
||||
for( final String line : new XReadLines(f) ) {
|
||||
if ( line.startsWith("#") )
|
||||
continue;
|
||||
|
||||
final String[] vals = line.split(",");
|
||||
if( header == null ) {
|
||||
header = vals;
|
||||
if ( header.length == 5 )
|
||||
// old style tranches file, throw an error
|
||||
throw new UserException.MalformedFile(f, "Unfortunately, your tranches file is from a previous version of this tool and cannot be used with the latest code. Please rerun VariantRecalibrator");
|
||||
if ( header.length != 8 && header.length != 11 )
|
||||
throw new UserException.MalformedFile(f, "Expected 8 elements in header line " + line);
|
||||
} else {
|
||||
if ( header.length != vals.length )
|
||||
throw new UserException.MalformedFile(f, "Line had too few/many fields. Header = " + header.length + " vals " + vals.length + " line " + line);
|
||||
|
||||
Map<String,String> bindings = new HashMap<String, String>();
|
||||
for ( int i = 0; i < vals.length; i++ ) bindings.put(header[i], vals[i]);
|
||||
tranches.add(new Tranche(getDouble(bindings,"FDRtranche", true),
|
||||
getDouble(bindings,"targetTiTv", false),
|
||||
getDouble(bindings,"minVQSLod", true),
|
||||
getInteger(bindings,"numKnown", false),
|
||||
getDouble(bindings,"knownTiTv", false),
|
||||
getInteger(bindings,"numNovel", true),
|
||||
getDouble(bindings,"novelTiTv", true),
|
||||
getInteger(bindings,"accessibleTruthSites", false),
|
||||
getInteger(bindings,"callsAtTruthSites", false),
|
||||
bindings.get("filterName")));
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(tranches); // sort this in the standard order
|
||||
return tranches;
|
||||
} catch( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(f, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Mar 2, 2010
|
||||
*/
|
||||
|
||||
public interface VariantClusteringModel extends VariantOptimizationInterface {
|
||||
public void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final String clusterFilename );
|
||||
//public void applyClusters( final VariantDatum[] data, final String outputPrefix );
|
||||
}
|
||||
|
|
@ -1,169 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
public enum TailType {
|
||||
TWO_TAILED,
|
||||
SMALL_IS_GOOD,
|
||||
BIG_IS_GOOD
|
||||
}
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
|
||||
|
||||
public final VariantDatum[] data;
|
||||
public final int numVariants;
|
||||
public final int numAnnotations;
|
||||
public final double[] meanVector;
|
||||
public final double[] varianceVector; // This is really the standard deviation
|
||||
public final TailType[] tailTypes;
|
||||
public boolean isNormalized;
|
||||
public final ExpandingArrayList<String> annotationKeys;
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<VariantDatum> dataList, final ExpandingArrayList<String> _annotationKeys ) {
|
||||
numVariants = dataList.size();
|
||||
data = dataList.toArray( new VariantDatum[numVariants] );
|
||||
if( numVariants <= 0 ) {
|
||||
throw new UserException.BadInput("There are zero variants with > 0 clustering weight! Please provide sets of known polymorphic loci to be used as training data using the dbsnp, hapmap, or 1kg rod bindings. Clustering weights can be specified using -weightDBSNP, -weightHapMap, and -weight1KG" );
|
||||
}
|
||||
if( _annotationKeys == null ) {
|
||||
numAnnotations = 0;
|
||||
meanVector = null;
|
||||
varianceVector = null;
|
||||
tailTypes = null;
|
||||
} else {
|
||||
numAnnotations = _annotationKeys.size();
|
||||
if( numAnnotations <= 0 ) {
|
||||
throw new UserException.BadInput("There are zero annotations. At least one annotation must be provided to use this walker!" );
|
||||
}
|
||||
meanVector = new double[numAnnotations];
|
||||
varianceVector = new double[numAnnotations];
|
||||
tailTypes = new TailType[numAnnotations];
|
||||
Arrays.fill(tailTypes, TailType.TWO_TAILED);
|
||||
isNormalized = false;
|
||||
}
|
||||
annotationKeys = _annotationKeys;
|
||||
}
|
||||
|
||||
public VariantDataManager( final ExpandingArrayList<String> annotationLines ) {
|
||||
data = null;
|
||||
numVariants = 0;
|
||||
numAnnotations = annotationLines.size();
|
||||
meanVector = new double[numAnnotations];
|
||||
varianceVector = new double[numAnnotations];
|
||||
isNormalized = true;
|
||||
annotationKeys = new ExpandingArrayList<String>();
|
||||
tailTypes = new TailType[numAnnotations];
|
||||
Arrays.fill(tailTypes, TailType.TWO_TAILED);
|
||||
|
||||
int jjj = 0;
|
||||
for( final String line : annotationLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
annotationKeys.add(vals[1]);
|
||||
meanVector[jjj] = Double.parseDouble(vals[2]);
|
||||
varianceVector[jjj] = Double.parseDouble(vals[3]);
|
||||
jjj++;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// code for dealing with TailTypes
|
||||
//
|
||||
|
||||
public void setAnnotationType(String annotation, TailType t ) {
|
||||
int i = annotationKeys.indexOf(annotation);
|
||||
if ( i == -1 ) throw new UserException.BadInput("Attempt to modify unknown annotation " + annotation + ": cluster file has " + Utils.join(",", annotationKeys));
|
||||
tailTypes[i] = t;
|
||||
logger.info("Updating annotation " + annotation + " to have tail type " + t);
|
||||
}
|
||||
|
||||
public TailType getAnnotationType(int i) {
|
||||
return tailTypes[i];
|
||||
}
|
||||
|
||||
|
||||
public void normalizeData() {
|
||||
boolean foundZeroVarianceAnnotation = false;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
final double theMean = mean(data, jjj);
|
||||
final double theSTD = standardDeviation(data, theMean, jjj);
|
||||
logger.info( annotationKeys.get(jjj) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
if( theSTD < 1E-8 ) {
|
||||
foundZeroVarianceAnnotation = true;
|
||||
logger.warn("Zero variance is a problem: standard deviation = " + theSTD + " User must -exclude annotations with zero variance. Annotation = " + (jjj == numAnnotations-1 ? "QUAL" : annotationKeys.get(jjj)));
|
||||
} else if( theSTD < 1E-2 ) {
|
||||
logger.warn("Warning! Tiny variance. It is strongly recommended that you -exclude " + annotationKeys.get(jjj));
|
||||
}
|
||||
meanVector[jjj] = theMean;
|
||||
varianceVector[jjj] = theSTD;
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
data[iii].annotations[jjj] = ( data[iii].annotations[jjj] - theMean ) / theSTD;
|
||||
}
|
||||
}
|
||||
isNormalized = true; // Each data point is now [ (x - mean) / standard deviation ]
|
||||
if( foundZeroVarianceAnnotation ) {
|
||||
throw new UserException.BadInput("Found annotations with zero variance. They must be excluded before proceeding.");
|
||||
}
|
||||
}
|
||||
|
||||
private static double mean( final VariantDatum[] data, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += (data[iii].annotations[index] / ((double) numVars));
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
private static double standardDeviation( final VariantDatum[] data, final double mean, final int index ) {
|
||||
double sum = 0.0;
|
||||
final int numVars = data.length;
|
||||
for( int iii = 0; iii < numVars; iii++ ) {
|
||||
sum += ( ((data[iii].annotations[index] - mean)*(data[iii].annotations[index] - mean)) / ((double) numVars));
|
||||
}
|
||||
return Math.sqrt( sum );
|
||||
}
|
||||
|
||||
public void printClusterFileHeader( PrintStream outputFile ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
outputFile.println(String.format("@!ANNOTATION," + annotationKeys.get(jjj) + ",%.8f,%.8f", meanVector[jjj], varianceVector[jjj]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 24, 2010
|
||||
*/
|
||||
|
||||
public class VariantDatum implements Comparable<VariantDatum> {
|
||||
public int pos;
|
||||
public double[] annotations;
|
||||
public boolean isTransition;
|
||||
public boolean isKnown;
|
||||
public double lod;
|
||||
public double weight;
|
||||
public boolean atTruthSite;
|
||||
|
||||
public int compareTo(VariantDatum other) {
|
||||
return Double.compare(this.lod, other.lod);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,937 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import Jama.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public final class VariantGaussianMixtureModel extends VariantOptimizationModel {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(VariantGaussianMixtureModel.class);
|
||||
|
||||
public final VariantDataManager dataManager;
|
||||
private final int maxGaussians;
|
||||
private final int maxIterations;
|
||||
private final static long RANDOM_SEED = 91801305;
|
||||
private final Random rand = new Random( RANDOM_SEED );
|
||||
private final double MIN_PROB_CONVERGENCE = 1E-5;
|
||||
|
||||
private final double SHRINKAGE;
|
||||
private final double DIRICHLET_PARAMETER;
|
||||
private final boolean FORCE_INDEPENDENT_ANNOTATIONS;
|
||||
|
||||
private final double[][] mu; // The means for each cluster
|
||||
private final Matrix[] sigma; // The covariance matrix for each cluster
|
||||
private final double[][][] sigmaInverse;
|
||||
private double[] pClusterLog10;
|
||||
private final double[] determinant;
|
||||
private final double[] sqrtDeterminantLog10;
|
||||
private final double stdThreshold;
|
||||
private double singletonFPRate = -1; // Estimated FP rate for singleton calls. Used to estimate FP rate as a function of AC
|
||||
|
||||
private double[] empiricalMu;
|
||||
private Matrix empiricalSigma;
|
||||
|
||||
private final double[] hyperParameter_a;
|
||||
private final double[] hyperParameter_b;
|
||||
private final double[] hyperParameter_lambda;
|
||||
|
||||
private final double CONSTANT_GAUSSIAN_DENOM_LOG10;
|
||||
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^##.*");
|
||||
private static final Pattern ANNOTATION_PATTERN = Pattern.compile("^@!ANNOTATION.*");
|
||||
private static final Pattern CLUSTER_PATTERN = Pattern.compile("^@!CLUSTER.*");
|
||||
|
||||
private static final double MAX_SIGMA_TO_BE_IN_GAUSSIAN = 4.0;
|
||||
|
||||
public VariantGaussianMixtureModel() {
|
||||
dataManager = null;
|
||||
maxGaussians = 0;
|
||||
maxIterations = 0;
|
||||
SHRINKAGE = 0;
|
||||
DIRICHLET_PARAMETER = 0;
|
||||
FORCE_INDEPENDENT_ANNOTATIONS = false;
|
||||
mu = null;
|
||||
sigma = null;
|
||||
sigmaInverse = null;
|
||||
determinant = null;
|
||||
sqrtDeterminantLog10 = null;
|
||||
stdThreshold = 0;
|
||||
hyperParameter_a = null;
|
||||
hyperParameter_b = null;
|
||||
hyperParameter_lambda = null;
|
||||
CONSTANT_GAUSSIAN_DENOM_LOG10 = 0.0;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final VariantDataManager _dataManager, final int _maxGaussians, final int _maxIterations,
|
||||
final boolean _forceIndependent, final double _stdThreshold, final double _shrinkage, final double _dirichlet) {
|
||||
dataManager = _dataManager;
|
||||
maxGaussians = _maxGaussians;
|
||||
maxIterations = _maxIterations;
|
||||
|
||||
mu = new double[maxGaussians][];
|
||||
sigma = new Matrix[maxGaussians];
|
||||
determinant = new double[maxGaussians];
|
||||
sqrtDeterminantLog10 = null;
|
||||
pClusterLog10 = new double[maxGaussians];
|
||||
stdThreshold = _stdThreshold;
|
||||
FORCE_INDEPENDENT_ANNOTATIONS = _forceIndependent;
|
||||
hyperParameter_a = new double[maxGaussians];
|
||||
hyperParameter_b = new double[maxGaussians];
|
||||
hyperParameter_lambda = new double[maxGaussians];
|
||||
sigmaInverse = null; // This field isn't used during GenerateVariantClusters pass
|
||||
|
||||
CONSTANT_GAUSSIAN_DENOM_LOG10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)dataManager.numAnnotations) / 2.0));
|
||||
SHRINKAGE = _shrinkage;
|
||||
DIRICHLET_PARAMETER = _dirichlet;
|
||||
}
|
||||
|
||||
public VariantGaussianMixtureModel( final double _targetTITV, final File clusterFile, final double backOffGaussianFactor ) {
|
||||
super( _targetTITV );
|
||||
final ExpandingArrayList<String> annotationLines = new ExpandingArrayList<String>();
|
||||
final ExpandingArrayList<String> clusterLines = new ExpandingArrayList<String>();
|
||||
|
||||
try {
|
||||
for ( final String line : new XReadLines( clusterFile ) ) {
|
||||
if( ANNOTATION_PATTERN.matcher(line).matches() ) {
|
||||
annotationLines.add(line);
|
||||
} else if( CLUSTER_PATTERN.matcher(line).matches() ) {
|
||||
clusterLines.add(line);
|
||||
} else if( !COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
throw new UserException.MalformedFile(clusterFile, "Could not parse line: " + line);
|
||||
}
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(clusterFile, e);
|
||||
}
|
||||
|
||||
dataManager = new VariantDataManager( annotationLines );
|
||||
// Several of the clustering parameters aren't used the second time around in VariantRecalibrator.java
|
||||
SHRINKAGE = 0;
|
||||
DIRICHLET_PARAMETER = 0;
|
||||
maxIterations = 0;
|
||||
stdThreshold = 0.0;
|
||||
FORCE_INDEPENDENT_ANNOTATIONS = false;
|
||||
hyperParameter_a = null;
|
||||
hyperParameter_b = null;
|
||||
hyperParameter_lambda = null;
|
||||
determinant = null;
|
||||
|
||||
// BUGBUG: move this parsing out of the constructor
|
||||
CONSTANT_GAUSSIAN_DENOM_LOG10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)dataManager.numAnnotations) / 2.0));
|
||||
maxGaussians = clusterLines.size();
|
||||
mu = new double[maxGaussians][dataManager.numAnnotations];
|
||||
final double sigmaVals[][][] = new double[maxGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
|
||||
sigma = new Matrix[maxGaussians];
|
||||
sigmaInverse = new double[maxGaussians][dataManager.numAnnotations][dataManager.numAnnotations];
|
||||
pClusterLog10 = new double[maxGaussians];
|
||||
sqrtDeterminantLog10 = new double[maxGaussians];
|
||||
|
||||
int kkk = 0;
|
||||
for( final String line : clusterLines ) {
|
||||
final String[] vals = line.split(",");
|
||||
pClusterLog10[kkk] = Math.log10( Double.parseDouble(vals[1]) ); // BUGBUG: #define these magic index numbers, very easy to make a mistake here
|
||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = Double.parseDouble(vals[2+jjj]);
|
||||
for( int ppp = 0; ppp < dataManager.numAnnotations; ppp++ ) {
|
||||
sigmaVals[kkk][jjj][ppp] = Double.parseDouble(vals[2+dataManager.numAnnotations+(jjj*dataManager.numAnnotations)+ppp]) * backOffGaussianFactor;
|
||||
}
|
||||
}
|
||||
|
||||
sigma[kkk] = new Matrix(sigmaVals[kkk]);
|
||||
sigmaInverse[kkk] = sigma[kkk].inverse().getArray(); // Precompute all the inverses and determinants for use later
|
||||
sqrtDeterminantLog10[kkk] = Math.log10(Math.pow(sigma[kkk].det(), 0.5)); // Precompute for use later
|
||||
kkk++;
|
||||
}
|
||||
|
||||
logger.info("Found " + maxGaussians + " clusters using " + dataManager.numAnnotations + " annotations: " + dataManager.annotationKeys);
|
||||
}
|
||||
|
||||
public final void run( final PrintStream clusterFile ) {
|
||||
|
||||
// Only cluster with a good set of knowns. Filter based on being too many std's away from the mean annotation value
|
||||
// Filtering based on known status and qual threshold happens in GenerateVariantClusters
|
||||
long numAboveSTD = 0L;
|
||||
for( int iii = 0; iii < dataManager.data.length; iii++ ) {
|
||||
final VariantDatum datum = dataManager.data[iii];
|
||||
for( final double val : datum.annotations ) {
|
||||
if( Math.abs(val) > stdThreshold ) {
|
||||
datum.weight = 0.0;
|
||||
numAboveSTD++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( numAboveSTD + " variants were rejected from the training set for having annotation values more than X standard deviations away from the mean. (--stdThreshold = " + stdThreshold + ")" );
|
||||
|
||||
generateEmpricalStats( dataManager.data );
|
||||
|
||||
logger.info("Initializing using k-means...");
|
||||
initializeUsingKMeans( dataManager.data );
|
||||
logger.info("... done!");
|
||||
createClusters( dataManager.data, 0, maxGaussians, clusterFile );
|
||||
}
|
||||
|
||||
private void generateEmpricalStats( final VariantDatum[] data ) {
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
empiricalMu = new double[numAnnotations];
|
||||
final double[][] sigmaVals = new double[numAnnotations][numAnnotations];
|
||||
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
empiricalMu[jjj] = 0.0;
|
||||
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
|
||||
sigmaVals[jjj][ppp] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double sumWeight = 0.0;
|
||||
for(int iii = 0; iii < numVariants; iii++) {
|
||||
sumWeight += data[iii].weight;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
empiricalMu[jjj] += data[iii].annotations[jjj] * data[iii].weight;
|
||||
}
|
||||
}
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
empiricalMu[jjj] /= sumWeight;
|
||||
}
|
||||
|
||||
for(int iii = 0; iii < numVariants; iii++) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
if( jjj == ppp ) {
|
||||
sigmaVals[jjj][ppp] = 1.0; //0.01 * numAnnotations;
|
||||
} else {
|
||||
sigmaVals[jjj][ppp] = 0.0;
|
||||
}
|
||||
//sigmaVals[jjj][ppp] += (data[iii].annotations[jjj]-empiricalMu[jjj]) * (data[iii].annotations[ppp]-empiricalMu[ppp]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
empiricalSigma = new Matrix(sigmaVals);
|
||||
//empiricalSigma.timesEquals(1.0 / (Math.pow(maxGaussians, 2.0 / ((double) numAnnotations))));
|
||||
}
|
||||
|
||||
public void setSingletonFPRate( final double rate ) {
|
||||
this.singletonFPRate = rate;
|
||||
}
|
||||
|
||||
public final double getAlleleCountPrior( final int alleleCount, final double maxProbability ) {
|
||||
return Math.min( maxProbability, 1.0 - Math.pow(singletonFPRate, alleleCount) );
|
||||
}
|
||||
|
||||
private void initializeUsingKMeans( final VariantDatum[] data ) {
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
//mu[kkk] = data[rand.nextInt(numVariants)].annotations;
|
||||
mu[kkk] = new double[numAnnotations];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
|
||||
}
|
||||
}
|
||||
|
||||
for( int ttt = 0; ttt < 60; ttt++ ) {
|
||||
performKMeansIteration( data );
|
||||
}
|
||||
}
|
||||
|
||||
private void performKMeansIteration( final VariantDatum[] data ) {
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
final int[] assignment = new int[numVariants];
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final VariantDatum datum = data[iii];
|
||||
if(datum.weight > 0.0) {
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
int minCluster = -1;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
double dist = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
dist += (datum.annotations[jjj] - mu[kkk][jjj]) * (datum.annotations[jjj] - mu[kkk][jjj]);
|
||||
}
|
||||
if(dist < minDistance) {
|
||||
minDistance = dist;
|
||||
minCluster = kkk;
|
||||
}
|
||||
}
|
||||
assignment[iii] = minCluster;
|
||||
}
|
||||
}
|
||||
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = 0.0;
|
||||
}
|
||||
int numAssigned = 0;
|
||||
|
||||
for( int iii = 0; iii < numVariants; iii++ ) {
|
||||
final VariantDatum datum = data[iii];
|
||||
if(datum.weight > 0.0) {
|
||||
if(assignment[iii] == kkk) {
|
||||
numAssigned++;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] += data[iii].annotations[jjj];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if(numAssigned != 0) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] /= (double) numAssigned;
|
||||
}
|
||||
} else {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mu[kkk][jjj] = -4.0 + 8.0 * rand.nextDouble();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public final void createClusters( final VariantDatum[] data, final int startCluster, final int stopCluster, final PrintStream clusterFile ) {
|
||||
|
||||
final int numVariants = data.length;
|
||||
final int numAnnotations = data[0].annotations.length;
|
||||
|
||||
final double[][] pVarInCluster = new double[maxGaussians][numVariants]; // Probability that the variant is in that cluster = simply evaluate the multivariate Gaussian
|
||||
|
||||
// loop control variables:
|
||||
// iii - loop over data points
|
||||
// jjj - loop over annotations (features)
|
||||
// ppp - loop over annotations again (full rank covariance matrix)
|
||||
// kkk - loop over clusters
|
||||
// ttt - loop over EM iterations
|
||||
|
||||
// Set up the initial random Gaussians
|
||||
for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
|
||||
hyperParameter_a[kkk] = numAnnotations;
|
||||
hyperParameter_b[kkk] = SHRINKAGE;
|
||||
hyperParameter_lambda[kkk] = DIRICHLET_PARAMETER;
|
||||
pClusterLog10[kkk] = Math.log10(1.0 / ((double) (stopCluster - startCluster)));
|
||||
final double[][] randSigma = new double[numAnnotations][numAnnotations];
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for( int ppp = jjj; ppp < numAnnotations; ppp++ ) {
|
||||
randSigma[ppp][jjj] = 0.55 + 1.25 * rand.nextDouble();
|
||||
if(rand.nextBoolean()) {
|
||||
randSigma[ppp][jjj] *= -1.0;
|
||||
}
|
||||
if(jjj != ppp) { randSigma[jjj][ppp] = 0.0; } // Sigma is a symmetric, positive-definite matrix created by taking a lower diagonal matrix and multiplying it by its transpose
|
||||
}
|
||||
}
|
||||
if( FORCE_INDEPENDENT_ANNOTATIONS ) {
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
if(jjj!=ppp) {
|
||||
randSigma[jjj][ppp] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Matrix tmp = new Matrix(randSigma);
|
||||
tmp = tmp.times(tmp.transpose());
|
||||
sigma[kkk] = tmp;
|
||||
determinant[kkk] = sigma[kkk].det();
|
||||
}
|
||||
|
||||
// The EM loop
|
||||
double previousLikelihood = -1E20;
|
||||
double currentLikelihood;
|
||||
int ttt = 1;
|
||||
while( ttt < maxIterations ) {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Expectation Step (calculate the probability that each data point is in each cluster)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
currentLikelihood = evaluateGaussians( data, pVarInCluster, startCluster, stopCluster );
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Maximization Step (move the clusters to maximize the sum probability of each data point)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
maximizeGaussians( data, pVarInCluster, startCluster, stopCluster );
|
||||
|
||||
logger.info("Finished iteration " + ttt );
|
||||
ttt++;
|
||||
if( Math.abs(currentLikelihood - previousLikelihood) < MIN_PROB_CONVERGENCE ) {
|
||||
logger.info("Convergence!");
|
||||
break;
|
||||
}
|
||||
previousLikelihood = currentLikelihood;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Output the final cluster parameters
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
printClusterParameters( clusterFile );
|
||||
}
|
||||
|
||||
private void printClusterParameters( final PrintStream clusterFile ) {
|
||||
dataManager.printClusterFileHeader( clusterFile );
|
||||
|
||||
final int numAnnotations = mu[0].length;
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
if( Math.pow(10.0, pClusterLog10[kkk]) > 1E-4 ) { // BUGBUG: make this a command line argument
|
||||
final double sigmaVals[][] = sigma[kkk].getArray();
|
||||
clusterFile.print("@!CLUSTER");
|
||||
clusterFile.print(String.format(",%.8f", Math.pow(10.0, pClusterLog10[kkk])));
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
clusterFile.print(String.format(",%.8f", mu[kkk][jjj]));
|
||||
}
|
||||
for(int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
for(int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
clusterFile.print(String.format(",%.8f", (sigmaVals[jjj][ppp] / hyperParameter_a[kkk]) ));
|
||||
}
|
||||
}
|
||||
clusterFile.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public double decodeAnnotation(GenomeLocParser genomeLocParser, final String annotationKey, final VariantContext vc, final boolean jitter ) {
|
||||
double value;
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // HRun values must be jittered a bit to work in this GMM
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
value += -0.25 + 0.5 * rand.nextDouble();
|
||||
} else if( annotationKey.equals("QUAL") ) {
|
||||
value = vc.getPhredScaledQual();
|
||||
} else {
|
||||
try {
|
||||
value = Double.parseDouble( (String)vc.getAttribute( annotationKey ) );
|
||||
} catch( Exception e ) {
|
||||
throw new UserException.MalformedFile(vc.getSource(), "No double value detected for annotation = " + annotationKey + " in variant at " + VariantContextUtils.getLocation(genomeLocParser,vc) + ", reported annotation value = " + vc.getAttribute( annotationKey ), e );
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to determine FDR tranches for VariantDatum[]
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
public static abstract class SelectionMetric {
|
||||
String name = null;
|
||||
|
||||
public SelectionMetric(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() { return name; }
|
||||
|
||||
public abstract double getThreshold(double tranche);
|
||||
public abstract double getTarget();
|
||||
public abstract void calculateRunningMetric(List<VariantDatum> data);
|
||||
public abstract double getRunningMetric(int i);
|
||||
public abstract int datumValue(VariantDatum d);
|
||||
}
|
||||
|
||||
public static class NovelTiTvMetric extends SelectionMetric {
|
||||
double[] runningTiTv;
|
||||
double targetTiTv = 0;
|
||||
|
||||
public NovelTiTvMetric(double target) {
|
||||
super("NovelTiTv");
|
||||
targetTiTv = target; // compute the desired TiTv
|
||||
}
|
||||
|
||||
public double getThreshold(double tranche) {
|
||||
return fdrToTiTv(tranche, targetTiTv);
|
||||
}
|
||||
|
||||
public double getTarget() { return targetTiTv; }
|
||||
|
||||
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||
int ti = 0, tv = 0;
|
||||
runningTiTv = new double[data.size()];
|
||||
|
||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
if ( ! datum.isKnown ) {
|
||||
if ( datum.isTransition ) { ti++; } else { tv++; }
|
||||
runningTiTv[i] = ti / Math.max(1.0 * tv, 1.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public double getRunningMetric(int i) {
|
||||
return runningTiTv[i];
|
||||
}
|
||||
|
||||
public int datumValue(VariantDatum d) {
|
||||
return d.isTransition ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static class TruthSensitivityMetric extends SelectionMetric {
|
||||
double[] runningSensitivity;
|
||||
double targetTiTv = 0;
|
||||
int nTrueSites = 0;
|
||||
|
||||
public TruthSensitivityMetric(int nTrueSites) {
|
||||
super("TruthSensitivity");
|
||||
this.nTrueSites = nTrueSites;
|
||||
}
|
||||
|
||||
public double getThreshold(double tranche) {
|
||||
return tranche/100; // tranche of 1 => 99% sensivity target
|
||||
}
|
||||
|
||||
public double getTarget() { return 1.0; }
|
||||
|
||||
public void calculateRunningMetric(List<VariantDatum> data) {
|
||||
int nCalledAtTruth = 0;
|
||||
runningSensitivity = new double[data.size()];
|
||||
|
||||
for ( int i = data.size() - 1; i >= 0; i-- ) {
|
||||
VariantDatum datum = data.get(i);
|
||||
nCalledAtTruth += datum.atTruthSite ? 1 : 0;
|
||||
runningSensitivity[i] = 1 - nCalledAtTruth / (1.0 * nTrueSites);
|
||||
}
|
||||
}
|
||||
|
||||
public double getRunningMetric(int i) {
|
||||
return runningSensitivity[i];
|
||||
}
|
||||
|
||||
public int datumValue(VariantDatum d) {
|
||||
return d.atTruthSite ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] tranches, SelectionMetric metric ) {
|
||||
return findTranches( data, tranches, metric, null );
|
||||
}
|
||||
|
||||
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] trancheThresholds, SelectionMetric metric, File debugFile ) {
|
||||
logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.length));
|
||||
|
||||
List<VariantDatum> tranchesData = sortVariantsbyLod(data);
|
||||
metric.calculateRunningMetric(tranchesData);
|
||||
|
||||
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, metric); }
|
||||
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
for ( double trancheThreshold : trancheThresholds ) {
|
||||
Tranche t = findTranche(tranchesData, metric, trancheThreshold);
|
||||
|
||||
if ( t == null ) {
|
||||
if ( tranches.size() == 0 )
|
||||
throw new UserException(String.format("Couldn't find any tranche containing variants with a %s > %.2f", metric.getName(), metric.getThreshold(trancheThreshold)));
|
||||
break;
|
||||
}
|
||||
|
||||
tranches.add(t);
|
||||
}
|
||||
|
||||
return tranches;
|
||||
}
|
||||
|
||||
private static void writeTranchesDebuggingInfo(File f, List<VariantDatum> tranchesData, SelectionMetric metric ) {
|
||||
try {
|
||||
PrintStream out = new PrintStream(f);
|
||||
out.println("Qual metricValue runningValue");
|
||||
for ( int i = 0; i < tranchesData.size(); i++ ) {
|
||||
VariantDatum d = tranchesData.get(i);
|
||||
int score = metric.datumValue(d);
|
||||
double runningValue = metric.getRunningMetric(i);
|
||||
out.printf("%.4f %d %.4f%n", d.lod, score, runningValue);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(f, e);
|
||||
}
|
||||
}
|
||||
|
||||
private final static List<VariantDatum> sortVariantsbyLod(final VariantDatum[] data) {
|
||||
List<VariantDatum> sorted = new ArrayList<VariantDatum>(Arrays.asList(data));
|
||||
Collections.sort(sorted);
|
||||
return sorted;
|
||||
}
|
||||
|
||||
public final static Tranche findTranche( final List<VariantDatum> data, final SelectionMetric metric, final double trancheThreshold ) {
|
||||
logger.info(String.format(" Tranche threshold %.2f => selection metric threshold %.3f", trancheThreshold, metric.getThreshold(trancheThreshold)));
|
||||
|
||||
double metricThreshold = metric.getThreshold(trancheThreshold);
|
||||
int n = data.size();
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( metric.getRunningMetric(i) >= metricThreshold ) {
|
||||
// we've found the largest group of variants with Ti/Tv >= our target titv
|
||||
Tranche t = trancheOfVariants(data, i, trancheThreshold, metric.getTarget());
|
||||
logger.info(String.format(" Found tranche for %.3f: %.3f threshold starting with variant %d; running score is %.3f ",
|
||||
trancheThreshold, metricThreshold, i, metric.getRunningMetric(i)));
|
||||
logger.info(String.format(" Tranche is %s", t));
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
// we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
||||
//logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
|
||||
// desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
|
||||
|
||||
|
||||
// for ( int i = 0; i < runningTiTv.length; i++ ) {
|
||||
// if ( runningTiTv[i] >= titvThreshold ) {
|
||||
// // we've found the largest group of variants with Ti/Tv >= our target titv
|
||||
// Tranche t = trancheOfVariants(data, i, desiredFDR, targetTiTv);
|
||||
// logger.info(String.format(" Found tranche for %.3f FDR = %.3f TiTv threshold starting with variant %d; running TiTv is %.2f ",
|
||||
// desiredFDR, titvThreshold, i, metric.getRunningMetric(i)));
|
||||
// logger.info(String.format(" Tranche is %s", t));
|
||||
// return t;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
|
||||
// logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
|
||||
// desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
|
||||
//
|
||||
return null;
|
||||
}
|
||||
|
||||
public final static Tranche trancheOfVariants( final List<VariantDatum> data, int minI, double fdr, double target ) {
|
||||
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
|
||||
|
||||
double minLod = data.get(minI).lod;
|
||||
VariantDatum last = null;
|
||||
for ( VariantDatum datum : data ) {
|
||||
if ( datum.lod >= minLod ) {
|
||||
//if( ! datum.isKnown ) System.out.println(datum.pos);
|
||||
if ( datum.isKnown ) {
|
||||
numKnown++;
|
||||
if ( datum.isTransition ) { knownTi++; } else { knownTv++; }
|
||||
} else {
|
||||
numNovel++;
|
||||
if ( datum.isTransition ) { novelTi++; } else { novelTv++; }
|
||||
|
||||
}
|
||||
}
|
||||
last = datum;
|
||||
}
|
||||
|
||||
double knownTiTv = knownTi / Math.max(1.0 * knownTv, 1.0);
|
||||
double novelTiTv = novelTi / Math.max(1.0 * novelTv, 1.0);
|
||||
|
||||
int accessibleTruthSites = countCallsAtTruth(data, Double.NEGATIVE_INFINITY);
|
||||
int nCallsAtTruth = countCallsAtTruth(data, minLod);
|
||||
|
||||
|
||||
return new Tranche(fdr, target, minLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, nCallsAtTruth);
|
||||
}
|
||||
|
||||
public final static double fdrToTiTv(double desiredFDR, double targetTiTv) {
|
||||
return (1.0 - desiredFDR / 100.0) * (targetTiTv - 0.5) + 0.5;
|
||||
}
|
||||
|
||||
public static int countCallsAtTruth(final VariantDatum[] data, double minLOD ) {
|
||||
int n = 0;
|
||||
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
|
||||
return n;
|
||||
}
|
||||
|
||||
public static int countCallsAtTruth(final List<VariantDatum> data, double minLOD ) {
|
||||
int n = 0;
|
||||
for ( VariantDatum d : data) { n += d.atTruthSite && d.lod >= minLOD ? 1 : 0; }
|
||||
return n;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to evaluate vectors given an already trained model
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
public final double evaluateVariant( GenomeLocParser genomeLocParser, final VariantContext vc ) {
|
||||
final double[] log10pVarInCluster = new double[maxGaussians];
|
||||
final double[] annotations = new double[dataManager.numAnnotations];
|
||||
|
||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||
final double value = decodeAnnotation( genomeLocParser, dataManager.annotationKeys.get(jjj), vc, false );
|
||||
|
||||
double z = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
||||
// switch ( dataManager.getAnnotationType(jjj) ) {
|
||||
// case TWO_TAILED: // z is already fine
|
||||
// break;
|
||||
// case SMALL_IS_GOOD:
|
||||
// if ( z < 0.0 ) {
|
||||
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
|
||||
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
|
||||
// z = 0.0;
|
||||
// }
|
||||
// break;
|
||||
// case BIG_IS_GOOD:
|
||||
// if ( z > 0.0 ) {
|
||||
//// logger.info(String.format("One-tailed test correction: %s %.2f mu=%.2f sigma=%.2f z=%.2f, newZ = 0",
|
||||
//// dataManager.annotationKeys.get(jjj), value, dataManager.meanVector[jjj], dataManager.varianceVector[jjj], z));
|
||||
// z = 0.0;
|
||||
// }
|
||||
// break;
|
||||
// default:
|
||||
// throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
|
||||
// }
|
||||
|
||||
annotations[jjj] = z;
|
||||
}
|
||||
|
||||
return evaluateGaussiansForSingleVariant( annotations, log10pVarInCluster );
|
||||
}
|
||||
|
||||
/**
 * Computes the log10 probability of one standardized annotation vector under the
 * mixture of Gaussians, filling log10pVarInCluster with the per-cluster log10
 * (mixture weight x Gaussian density) terms and returning their log10-sum.
 *
 * @param annotations        the standardized annotation vector for one variant
 * @param log10pVarInCluster output scratch array, one slot per Gaussian
 * @return log10 of the summed per-cluster probabilities
 */
private double evaluateGaussiansForSingleVariant( final double[] annotations, final double[] log10pVarInCluster ) {

    final int numAnnotations = annotations.length;

    for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
        final double sigmaVals[][] = sigmaInverse[kkk]; // precomputed inverse covariance of cluster kkk
        double sum = 0.0; // accumulates the quadratic form (x-mu)^T Sigma^-1 (x-mu)
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            double value = 0.0; // the jjj-th component of (x-mu)^T Sigma^-1
            for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                final double myMu = mu[kkk][ppp];
                double myAnn = annotations[ppp];
                final double mySigma = sigmaVals[ppp][jjj];
                double z = (myAnn - myMu) * mySigma;

                // One-tailed annotation handling: for tails where deviation in one
                // direction is "good", zero out moderate good-side deviations so
                // they don't penalize the variant.
                switch ( dataManager.getAnnotationType(jjj) ) {
                    case TWO_TAILED: break;
                    case SMALL_IS_GOOD: if( myAnn < myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
                    case BIG_IS_GOOD: if( myAnn > myMu && Math.abs(z) < MAX_SIGMA_TO_BE_IN_GAUSSIAN ) { z = 0.0; } break;
                    default: throw new ReviewedStingException("Unexpected annotation tail type: " + dataManager.getAnnotationType(jjj));
                }

                value += z;
            }
            final double jNorm = annotations[jjj] - mu[kkk][jjj];
            final double prod = value * jNorm;
            sum += prod;
        }

        // log10 Gaussian density: -(1/2) * quadratic form, converted from natural log,
        // minus the normalization (constant term + log10 sqrt determinant)
        final double denomLog10 = CONSTANT_GAUSSIAN_DENOM_LOG10 + sqrtDeterminantLog10[kkk];
        final double evalGaussianPDFLog10 = (( -0.5 * sum ) / Math.log(10.0)) - denomLog10;
        // stay in log10 space: add the log10 mixture weight rather than exponentiating
        final double pVar1 = pClusterLog10[kkk] + evalGaussianPDFLog10;
        log10pVarInCluster[kkk] = pVar1;
    }

    return MathUtils.log10sumLog10(log10pVarInCluster);
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to implement the steps of the VBEM algorithm
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
/**
 * VBEM E-step: computes per-variant cluster responsibilities into pVarInCluster
 * for clusters [startCluster, stopCluster) and returns the weight-averaged
 * explained likelihood. Responsibilities are computed in log10 space using the
 * variational posterior expectations (diGamma terms) and then normalized.
 *
 * @param data          the variant data (annotations assumed standardized upstream)
 * @param pVarInCluster output: [cluster][variant] responsibility, weight-scaled
 * @param startCluster  first cluster index to process (inclusive)
 * @param stopCluster   last cluster index to process (exclusive)
 * @return sum over variants of weight * p(variant) divided by total weight
 * @throws ReviewedStingException on NaN/Inf, i.e. numerical instability
 */
private double evaluateGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {

    final int numAnnotations = data[0].annotations.length;
    double likelihood = 0.0;
    final double sigmaVals[][][] = new double[maxGaussians][][]; // scaled inverse covariances per cluster
    final double denomLog10[] = new double[maxGaussians];        // per-cluster log10 normalization terms
    final double pVarInClusterLog10[] = new double[maxGaussians];
    double pVarInClusterReals[];

    // Per-cluster setup: invert and scale the covariance, and accumulate the
    // variational expectation terms (gamma, pi, beta) into denomLog10.
    for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
        sigmaVals[kkk] = sigma[kkk].inverse().getArray();
        // scale the inverse covariance by the Wishart degrees-of-freedom hyperparameter
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                sigmaVals[kkk][jjj][ppp] *= hyperParameter_a[kkk];
            }
        }
        // expected log determinant term (sum of diGamma over dimensions)
        double sum = 0.0;
        for(int jjj = 1; jjj < numAnnotations; jjj++) {
            sum += diGamma((hyperParameter_a[kkk] + 1.0 - jjj) / 2.0);
        }
        sum -= Math.log(determinant[kkk]);
        sum += Math.log(2.0) * numAnnotations;
        final double gamma = 0.5 * sum;
        // expected log mixture weight: diGamma(lambda_k) - diGamma(sum of lambdas)
        sum = 0.0;
        for(int ccc = 0; ccc < maxGaussians; ccc++) {
            sum += hyperParameter_lambda[ccc];
        }
        final double pi = diGamma(hyperParameter_lambda[kkk]) - diGamma(sum);
        // mean-uncertainty correction term
        final double beta = (-1.0 * numAnnotations) / (2.0 * hyperParameter_b[kkk]);
        // combine all three expectations, converted from natural log to log10
        denomLog10[kkk] = (pi / Math.log(10.0)) + (gamma / Math.log(10.0)) + (beta / Math.log(10.0));
    }

    final double mult[] = new double[numAnnotations]; // scratch row for the quadratic form
    double sumWeight = 0.0;
    for( int iii = 0; iii < data.length; iii++ ) {
        sumWeight += data[iii].weight;
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            // quadratic form (x-mu)^T (a * Sigma^-1) (x-mu), computed in two passes
            double sum = 0.0;
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                mult[jjj] = 0.0;
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    mult[jjj] += (data[iii].annotations[ppp] - mu[kkk][ppp]) * sigmaVals[kkk][ppp][jjj];
                }
            }
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                sum += mult[jjj] * (data[iii].annotations[jjj] - mu[kkk][jjj]);
            }

            // unnormalized log10 responsibility of cluster kkk for variant iii
            pVarInClusterLog10[kkk] = (( -0.5 * sum )/Math.log(10.0)) + denomLog10[kkk];
            final double pVar = Math.pow(10.0, pVarInClusterLog10[kkk]);
            likelihood += pVar * data[iii].weight;

            // fail loudly on numerical blow-up, dumping diagnostics first
            if( Double.isNaN(pVarInClusterLog10[kkk]) || Double.isInfinite(pVarInClusterLog10[kkk]) ) {
                logger.warn("det = " + sigma[kkk].det());
                logger.warn("denom = " + denomLog10[kkk]);
                logger.warn("sumExp = " + sum);
                logger.warn("mixtureLog10 = " + pClusterLog10[kkk]);
                logger.warn("pVar = " + pVar);
                for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                    for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                        logger.warn(sigmaVals[kkk][ppp][jjj]);
                    }
                }

                logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
                        "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
                throw new ReviewedStingException("Numerical Instability! Found NaN after performing log10: " + pVarInClusterLog10[kkk] + ", cluster = " + kkk + ", variant index = " + iii);
            }
        }

        // normalize responsibilities across clusters and scale by the variant's weight
        pVarInClusterReals = MathUtils.normalizeFromLog10( pVarInClusterLog10 );
        for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
            pVarInCluster[kkk][iii] = pVarInClusterReals[kkk] * data[iii].weight;
            if( Double.isNaN(pVarInCluster[kkk][iii]) ) {
                logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
                        "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
                throw new ReviewedStingException("Numerical Instability! Found NaN after rescaling log10 values: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
            }
        }
    }

    logger.info("explained likelihood = " + String.format("%.5f",likelihood / sumWeight));
    return likelihood / sumWeight;
}
|
||||
|
||||
/**
 * VBEM M-step: given the responsibilities in pVarInCluster, updates each cluster's
 * mean (mu), covariance (sigma, with shrinkage toward the empirical prior),
 * log10 mixture weights (pClusterLog10), and the variational hyperparameters
 * (hyperParameter_a/b/lambda) for clusters [startCluster, stopCluster).
 *
 * @param data          the variant data
 * @param pVarInCluster [cluster][variant] responsibilities from the E-step
 * @param startCluster  first cluster index to update (inclusive)
 * @param stopCluster   last cluster index to update (exclusive)
 * @throws ReviewedStingException if a NaN responsibility is encountered
 */
private void maximizeGaussians( final VariantDatum[] data, final double[][] pVarInCluster, final int startCluster, final int stopCluster ) {

    final int numVariants = data.length;
    final int numAnnotations = data[0].annotations.length;
    final double sigmaVals[][][] = new double[maxGaussians][numAnnotations][numAnnotations]; // scatter matrices
    final double wishartVals[][] = new double[numAnnotations][numAnnotations];               // shrinkage (prior) term
    final double meanVals[][] = new double[maxGaussians][numAnnotations];                    // new cluster means

    // zero the accumulators for the clusters being updated
    for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            meanVals[kkk][jjj] = 0.0;
            for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                sigmaVals[kkk][jjj][ppp] = 0.0;
            }
        }
    }

    double sumPK = 0.0; // total responsibility mass across the updated clusters
    for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
        // responsibility-weighted sum of annotations -> unnormalized mean
        double sumProb = 0.0;
        for( int iii = 0; iii < numVariants; iii++ ) {
            final double prob = pVarInCluster[kkk][iii];
            if( Double.isNaN(prob) ) {
                logger.warn("About to throw exception due to numerical instability. Try running with fewer annotations and then with fewer Gaussians. " +
                        "It is best to only use the annotations which appear to be Gaussianly distributed for this Gaussian mixture model.");
                throw new ReviewedStingException("Numerical Instability! Found NaN in M-step: " + pVarInCluster[kkk][iii] + ", cluster = " + kkk + ", variant index = " + iii);
            }
            sumProb += prob;
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                meanVals[kkk][jjj] += prob * data[iii].annotations[jjj];
            }
        }

        // normalize the mean, shrunk toward the empirical mean by SHRINKAGE
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            meanVals[kkk][jjj] = (meanVals[kkk][jjj] + SHRINKAGE * empiricalMu[jjj]) / (sumProb + SHRINKAGE);
        }

        // rank-one Wishart-prior term pulling the covariance toward the empirical mean
        final double shrinkageFactor = (SHRINKAGE * sumProb) / (SHRINKAGE + sumProb);
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                wishartVals[jjj][ppp] = shrinkageFactor * (meanVals[kkk][jjj] - empiricalMu[jjj]) * (meanVals[kkk][ppp] - empiricalMu[ppp]);
            }
        }

        // responsibility-weighted scatter matrix about the new mean
        for( int iii = 0; iii < numVariants; iii++ ) {
            final double prob = pVarInCluster[kkk][iii];
            for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
                for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
                    sigmaVals[kkk][jjj][ppp] += prob * (data[iii].annotations[jjj]-meanVals[kkk][jjj]) * (data[iii].annotations[ppp]-meanVals[kkk][ppp]);
                }
            }
        }

        // new covariance = empirical prior + shrinkage term + scatter
        final Matrix tmpMatrix = empiricalSigma.plus(new Matrix(wishartVals).plus(new Matrix(sigmaVals[kkk])));

        sigma[kkk] = (Matrix)tmpMatrix.clone();
        determinant[kkk] = sigma[kkk].det();

        // commit the new mean
        for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
            mu[kkk][jjj] = meanVals[kkk][jjj];
        }

        // pClusterLog10 temporarily holds raw responsibility mass; converted to log10 below
        pClusterLog10[kkk] = sumProb;
        sumPK += sumProb;

        // variational hyperparameter updates
        hyperParameter_a[kkk] = sumProb + numAnnotations;
        hyperParameter_b[kkk] = sumProb + SHRINKAGE;
        hyperParameter_lambda[kkk] = sumProb + DIRICHLET_PARAMETER;
    }

    // convert responsibility masses into log10 mixture weights
    for( int kkk = startCluster; kkk < stopCluster; kkk++ ) {
        pClusterLog10[kkk] = Math.log10( pClusterLog10[kkk] / sumPK );
    }

    pClusterLog10 = MathUtils.normalizeFromLog10( pClusterLog10, true );
}
|
||||
|
||||
// from http://en.wikipedia.org/wiki/Digamma_function
|
||||
// According to J.M. Bernardo AS 103 algorithm the digamma function for x, a real number, can be approximated by:
|
||||
private static double diGamma(final double x) {
|
||||
return Math.log(x) - ( 1.0 / (2.0 * x) )
|
||||
- ( 1.0 / (12.0 * Math.pow(x, 2.0)) )
|
||||
+ ( 1.0 / (120.0 * Math.pow(x, 4.0)) )
|
||||
- ( 1.0 / (252.0 * Math.pow(x, 6.0)) );
|
||||
}
|
||||
}
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
/**
 * Common entry point implemented by variant optimization models
 * (see VariantOptimizationModel in this package).
 */
public interface VariantOptimizationInterface {
    // NOTE(review): the parameter is named outputPrefix but is typed PrintStream —
    // presumably the destination stream for the model's output; confirm with implementors.
    public void run( PrintStream outputPrefix );
}
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Feb 26, 2010
|
||||
*/
|
||||
|
||||
public abstract class VariantOptimizationModel implements VariantOptimizationInterface {
|
||||
|
||||
public enum Model {
|
||||
GAUSSIAN_MIXTURE_MODEL
|
||||
}
|
||||
|
||||
protected final double targetTITV;
|
||||
|
||||
public VariantOptimizationModel() {
|
||||
targetTITV = 0.0;
|
||||
}
|
||||
|
||||
public VariantOptimizationModel( final double _targetTITV ) {
|
||||
targetTITV = _targetTITV;
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromTITV( final double _titv ) {
|
||||
double titv = _titv;
|
||||
if( titv > targetTITV ) { titv -= 2.0f*(titv-targetTITV); }
|
||||
if( titv < 0.5 ) { titv = 0.5; }
|
||||
return ( (titv - 0.5) / (targetTITV - 0.5) );
|
||||
}
|
||||
|
||||
public final double calcTruePositiveRateFromKnownTITV( final double knownTITV, final double _novelTITV, final double overallTITV, final double knownAlphaFactor ) {
|
||||
|
||||
final double tprTarget = calcTruePositiveRateFromTITV( overallTITV );
|
||||
double novelTITV = _novelTITV;
|
||||
if( novelTITV > knownTITV ) { novelTITV -= 2.0f*(novelTITV-knownTITV); }
|
||||
if( novelTITV < 0.5 ) { novelTITV = 0.5; }
|
||||
final double tprKnown = ( (novelTITV - 0.5) / (knownTITV - 0.5) );
|
||||
|
||||
return ( knownAlphaFactor * tprKnown ) + ( (1.0 - knownAlphaFactor) * tprTarget );
|
||||
}
|
||||
}
|
||||
|
|
@ -1,407 +0,0 @@
|
|||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Mar 17, 2010
|
||||
*
|
||||
* @help.summary Applies calibrated variant cluster parameters to variant calls to produce an accurate and informative variant quality score
|
||||
*/
|
||||
|
||||
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> {
|
||||
public static final String VQS_LOD_KEY = "VQSLOD";
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
@Input(fullName="cluster_file", shortName="clusterFile", doc="The input cluster file generated by GenerateVariantClusters", required=true)
|
||||
private File CLUSTER_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
// Outputs
|
||||
/////////////////////////////
|
||||
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyVariantCuts", required=true)
|
||||
private File TRANCHES_FILE;
|
||||
@Output(doc="File to which recalibrated variants should be written", required=true)
|
||||
private VCFWriter vcfWriter = null;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (~~2.07 for whole genome experiments)", required=false)
|
||||
private double TARGET_TITV = 2.07;
|
||||
@Argument(fullName="backOff", shortName="backOff", doc="The Gaussian back off factor, used to prevent overfitting by enlarging the Gaussians.", required=false)
|
||||
private double BACKOFF_FACTOR = 1.3;
|
||||
@Argument(fullName="ignore_all_input_filters", shortName="ignoreAllFilters", doc="If specified the optimizer will use variants even if the FILTER column is marked in the VCF file", required=false)
|
||||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
// Prior probabilities (phred-scaled Q scores) assigned to a call depending on which
// external resource (dbSNP / HapMap / 1KG) it overlaps; novel calls get the weakest prior.
@Argument(fullName="priorNovel", shortName="priorNovel", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private double PRIOR_NOVELS = 2.0;            // Q2: novel sites are assumed least reliable
@Argument(fullName="priorDBSNP", shortName="priorDBSNP", doc="A prior on the quality of dbSNP variants, a phred scaled probability of being true.", required=false)
private double PRIOR_DBSNP = 10.0;            // Q10 prior for dbSNP-known sites
@Argument(fullName="priorHapMap", shortName="priorHapMap", doc="A prior on the quality of HapMap variants, a phred scaled probability of being true.", required=false)
private double PRIOR_HAPMAP = 15.0;           // Q15: HapMap sites get the strongest prior
@Argument(fullName="prior1KG", shortName="prior1KG", doc="A prior on the quality of 1000 Genomes Project variants, a phred scaled probability of being true.", required=false)
private double PRIOR_1KG = 12.0;              // Q12 prior for 1000 Genomes sites
// Novel-FDR levels (in percent) at which the tranche table is sliced.
@Argument(fullName="FDRtranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] FDR_TRANCHES = new double[]{0.1, 1.0, 10.0, 100.0};
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
private String PATH_TO_RSCRIPT = "Rscript";   // assumed to be on the PATH by default
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "R/";      // normalized to end with '/' in initialize()
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
private double SINGLETON_FP_RATE = 0.5;       // -1 disables forwarding the rate to the model (see initialize())
@Argument(fullName="max_ac_prior", shortName="maxACPrior", doc="Maximum value for the prior expectation based on allele count. Needed because (1 - 0.5^x) approaches 1.0 very quickly.", required=false)
private double MAX_AC_PRIOR = 0.99;
@Argument(fullName="dontTrustACField", shortName="dontTrustACField", doc="If specified the VR will not use the AC field and will instead always parse the genotypes to figure out how many variant chromosomes there are at a given site.", required=false)
private boolean NEVER_TRUST_AC_FIELD = false; // may also be flipped on at runtime when a record lacks AC (see map())
|
||||
|
||||
/////////////////////////////
// Debug Arguments
/////////////////////////////
@Hidden
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
private double QUAL_THRESHOLD = 0.0;          // 0.0 means no filtering by original QUAL
@Hidden
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
protected Boolean TRUST_ALL_POLYMORPHIC = false;
@Hidden
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
private File DEBUG_FILE = null;               // null disables the debug dump passed to findTranches()
@Hidden
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;

// arguments for handling 1 sided annotations
@Hidden
@Argument(fullName="use_annotation", shortName="an", doc="Allows us to provide information about the sidedness of annotations. Can be either -AN QD [two-sided], -AN QD- [values less than mean are good] and -AN QD+ [values greater than the mean are good]", required=false)
private List<String> USE_ANNOTATIONS = null;  // trailing '-' / '+' on a name encodes its tail type (parsed in initialize())
|
||||
|
||||
// How the model chooses where to draw the tranche cutoffs (see onTraversalDone()).
public enum SelectionMetricType {
    NOVEL_TITV,        // slice by the FDR implied by the novel ti/tv ratio
    TRUTH_SENSITIVITY  // slice by sensitivity to the supplied truth sites
}
|
||||
|
||||
/////////////////////////////
// Private Member Variables
/////////////////////////////
// Fixed to the Gaussian mixture model in this walker.
private VariantOptimizationModel.Model OPTIMIZATION_MODEL = VariantOptimizationModel.Model.GAUSSIAN_MIXTURE_MODEL;
private VariantGaussianMixtureModel theModel = null;    // trained model, constructed in initialize()
private Set<String> ignoreInputFilterSet = null;        // filter names to treat as unfiltered; null when none supplied
private Set<String> inputNames = new HashSet<String>(); // ROD bindings whose names start with "input"
private Set<String> truthNames = new HashSet<String>(); // ROD bindings whose names start with "truth"
private NestedHashMap priorCache = new NestedHashMap(); // memoizes prior LOD factors keyed by (alleleCount, prior Q score)
private boolean trustACField = false;                   // set true once one record's AC field is verified correct
private PrintStream tranchesStream = null;              // output stream for the tranches table, opened in initialize()
private int nTruthSites = 0;                            // number of truth sites seen during traversal
|
||||
|
||||
private static double round4(double num) {
|
||||
double result = num * 10000.0;
|
||||
result = Math.round(result);
|
||||
result = result / 10000.0;
|
||||
return result;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Walker setup: normalizes the resources path, wires up the Gaussian mixture
 * model, parses the annotation sidedness flags, classifies the bound ROD
 * tracks (input / truth / dbSNP / hapmap / 1kg), writes the output VCF header,
 * and opens the tranches output file.
 *
 * @throws UserException.CommandLineException if no dbSNP track is bound
 * @throws UserException.BadInput if no "input"-prefixed variant track is bound
 * @throws UserException.CouldNotCreateOutputFile if the tranches file cannot be opened
 */
public void initialize() {
    // Resource path must end with '/' so script filenames can be appended directly.
    if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; }

    if( IGNORE_INPUT_FILTERS != null ) {
        ignoreInputFilterSet = new TreeSet<String>(Arrays.asList(IGNORE_INPUT_FILTERS));
    }

    theModel = new VariantGaussianMixtureModel( TARGET_TITV, CLUSTER_FILE, BACKOFF_FACTOR );
    if ( SINGLETON_FP_RATE != -1 ) { // -1 is the "disabled" sentinel for the singleton FP rate
        theModel.setSingletonFPRate(SINGLETON_FP_RATE);
    }

    // Deal with annotations: a trailing '-' or '+' on an annotation name encodes which
    // tail of its distribution is considered good; no suffix means two-tailed.
    if ( USE_ANNOTATIONS != null ) {
        for ( String annotation : USE_ANNOTATIONS ) {
            VariantDataManager.TailType type = VariantDataManager.TailType.TWO_TAILED;
            String annotationName = annotation;
            if ( annotation.endsWith("-") ) {
                type = VariantDataManager.TailType.SMALL_IS_GOOD;
                annotationName = annotation.substring(0, annotation.length()-1);
            }
            else if ( annotation.endsWith("+") ) {
                type = VariantDataManager.TailType.BIG_IS_GOOD;
                annotationName = annotation.substring(0, annotation.length()-1);
            }

            theModel.dataManager.setAnnotationType( annotationName, type );
        }
    }

    // Classify every bound ROD track by its binding name.
    boolean foundDBSNP = false;
    for( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
        if( d.getName().startsWith("input") ) {
            inputNames.add(d.getName());
            logger.info("Found input variant track with name " + d.getName());
        } else if( d.getName().startsWith("truth") ) {
            truthNames.add(d.getName());
            logger.info("Found truth variant track with name " + d.getName());
        } else if ( d.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
            logger.info("Found dbSNP track with prior probability = Q" + PRIOR_DBSNP);
            foundDBSNP = true;
        } else if ( d.getName().equals("hapmap") ) {
            logger.info("Found HapMap track with prior probability = Q" + PRIOR_HAPMAP);
        } else if ( d.getName().equals("1kg") ) {
            logger.info("Found 1KG track for with prior probability = Q" + PRIOR_1KG);
        } else {
            logger.info("Not evaluating ROD binding " + d.getName());
        }
    }

    // dbSNP is mandatory: the known/novel distinction drives the priors used in map().
    if(!foundDBSNP) {
        throw new UserException.CommandLineException("dbSNP track is required. This calculation is critically dependent on being able to distinguish known and novel sites.");
    }

    if( inputNames.size() == 0 ) {
        throw new UserException.BadInput( "No input variant tracks found. Input variant binding names must begin with 'input'." );
    }

    // Setup the header fields: copy header lines from the inputs and add the VQSLOD info line.
    final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
    final TreeSet<String> samples = new TreeSet<String>();
    hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
    hInfo.add(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "log10-scaled probability of variant being true under the trained gaussian mixture model"));
    samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));

    final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
    vcfWriter.writeHeader(vcfHeader);

    try {
        tranchesStream = new PrintStream(TRANCHES_FILE);
    } catch (FileNotFoundException e) {
        throw new UserException.CouldNotCreateOutputFile(TRANCHES_FILE, e);
    }
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Per-locus map: scores each unfiltered input variant by combining the trained
 * Gaussian mixture model's evaluation with a prior built from allele count and
 * known-site status (dbSNP / HapMap / 1KG), annotates the record with the
 * VQSLOD value and writes it to the output VCF. Filtered records are passed
 * through unchanged.
 *
 * @return list of scored VariantDatum objects for this locus (possibly empty)
 */
public ExpandingArrayList<VariantDatum> map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
    final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();

    if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
        return mapList;
    }

    // Is this locus covered by an unfiltered, polymorphic record in any truth track?
    final Collection<VariantContext> vcsTruth = tracker.getVariantContexts(ref, truthNames, null, context.getLocation(), true, true);
    boolean isAtTruthSite = false;
    for( final VariantContext vcTruth : vcsTruth ) {
        if( vcTruth != null && vcTruth.isVariant() && !vcTruth.isFiltered() && (!vcTruth.hasGenotypes() || vcTruth.isPolymorphic()) ) {
            isAtTruthSite = true;
            break;
        }
    }
    nTruthSites += isAtTruthSite ? 1 : 0;

    for( final VariantContext vc : tracker.getVariantContexts(ref, inputNames, null, context.getLocation(), true, false) ) {
        if( vc != null ) {
            if( !vc.isFiltered() || IGNORE_ALL_INPUT_FILTERS || (ignoreInputFilterSet != null && ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
                // NOTE(review): records passing the filter check but below QUAL_THRESHOLD are
                // dropped entirely (not written out); threshold defaults to 0.0 so this only
                // matters in debug runs.
                if( vc.getPhredScaledQual() >= QUAL_THRESHOLD ) {
                    final VariantDatum variantDatum = new VariantDatum();

                    if (vc.isSNP())
                        variantDatum.isTransition = VariantContextUtils.getSNPSubstitutionType(vc).compareTo(BaseUtils.BaseSubstitutionType.TRANSITION) == 0;

                    // Look up this site in the known-sites resources to decide the prior.
                    final Collection<VariantContext> vcsDbsnp = tracker.getVariantContexts(ref, "dbsnp", null, context.getLocation(), false, true);
                    final Collection<VariantContext> vcsHapMap = tracker.getVariantContexts(ref, "hapmap", null, context.getLocation(), false, true);
                    final Collection<VariantContext> vcs1KG = tracker.getVariantContexts(ref, "1kg", null, context.getLocation(), false, true);
                    final VariantContext vcDbsnp = ( vcsDbsnp.size() != 0 ? vcsDbsnp.iterator().next() : null );
                    final VariantContext vcHapMap = ( vcsHapMap.size() != 0 ? vcsHapMap.iterator().next() : null );
                    final VariantContext vc1KG = ( vcs1KG.size() != 0 ? vcs1KG.iterator().next() : null );

                    variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
                    // Pick the strongest applicable prior: HapMap > 1KG > dbSNP > novel.
                    double knownPrior_qScore = PRIOR_NOVELS;
                    if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
                        knownPrior_qScore = PRIOR_HAPMAP;
                    } else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
                        knownPrior_qScore = PRIOR_1KG;
                    } else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
                        knownPrior_qScore = PRIOR_DBSNP;
                    }

                    // If we can trust the AC field then use it instead of parsing all the genotypes. Results in a substantial speed up.
                    final int alleleCount = (trustACField ? Integer.parseInt(vc.getAttribute("AC",-1).toString()) : vc.getChromosomeCount(vc.getAlternateAllele(0)));
                    // Until trust is decided, validate the AC field against the parsed genotype
                    // count once; a single confirmed match flips trustACField on for good.
                    if( !trustACField && !NEVER_TRUST_AC_FIELD ) {
                        if( !vc.getAttributes().containsKey("AC") ) {
                            NEVER_TRUST_AC_FIELD = true;
                        } else {
                            if( alleleCount == Integer.parseInt( vc.getAttribute("AC").toString()) ) {
                                // The AC field is correct at this record so we trust it forever
                                trustACField = true;
                            } else { // We found a record in which the AC field wasn't correct but we are trying to trust it
                                throw new UserException.BadInput("AC info field doesn't match the variant chromosome count so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
                                        "First observed at " + vc.getChr() + ":" + vc.getStart());
                            }
                        }
                    }
                    // -1 is the getAttribute default: AC was missing on this record despite being trusted.
                    if( trustACField && alleleCount == -1 ) {
                        throw new UserException.BadInput("AC info field doesn't exist for all records (although it does for some) so we can't trust it! Please run with --dontTrustACField which will force the walker to parse the genotypes for each record, drastically increasing the runtime." +
                                "First observed at " + vc.getChr() + ":" + vc.getStart());
                    }

                    // Priors are memoized by (alleleCount, knownPrior) to avoid recomputation.
                    final Object[] priorKey = new Object[2];
                    priorKey[0] = alleleCount;
                    priorKey[1] = knownPrior_qScore;

                    Double priorLodFactor = (Double)priorCache.get( priorKey );

                    // If this prior factor hasn't been calculated before, do so now
                    if(priorLodFactor == null) {
                        final double knownPrior = QualityUtils.qualToProb(knownPrior_qScore);
                        final double acPrior = theModel.getAlleleCountPrior( alleleCount, MAX_AC_PRIOR );
                        // Combine the two independent priors: P(true) = 1 - P(both wrong).
                        final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));

                        if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
                            throw new UserException.CommandLineException("Something is wrong with the priors that were entered by the user: Prior = " + totalPrior); // TODO - fix this up later
                        }

                        // log-odds form of the prior; the trailing log10(1.0) term is a no-op kept from the original derivation.
                        priorLodFactor = Math.log10(totalPrior) - Math.log10(1.0 - totalPrior) - Math.log10(1.0);

                        priorCache.put( priorLodFactor, false, priorKey );
                    }

                    final double log10pVar = theModel.evaluateVariant( ref.getGenomeLocParser(),vc );
                    final double lod = priorLodFactor + log10pVar;

                    if ( lod == Double.NEGATIVE_INFINITY ) {
                        throw new ReviewedStingException("Negative infinity detected during summation at " + vc);
                    }

                    variantDatum.pos = vc.getStart();
                    variantDatum.lod = round4(lod);

                    // deal with the truth calculation
                    variantDatum.atTruthSite = isAtTruthSite;

                    mapList.add( variantDatum );
                    // Annotate the record with the VQSLOD and clear any existing filters.
                    final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
                    attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
                    VariantContext newVC = VariantContext.modifyPErrorFiltersAndAttributes(vc, vc.getNegLog10PError(), new HashSet<String>(), attrs);

                    vcfWriter.add( newVC, ref.getBase() );
                }

            } else { // record is filtered (and its filters aren't ignored) so just pass it through to the VCF file
                vcfWriter.add( vc, ref.getBase() );
            }
        }

    }

    return mapList;
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Reduce initializer: start the traversal with an empty accumulator list.
 *
 * @return a fresh, empty list of variant data
 */
public ExpandingArrayList<VariantDatum> reduceInit() {
    return new ExpandingArrayList<VariantDatum>();
}
|
||||
|
||||
/**
 * Reduce: append this locus's scored variant data to the running accumulator.
 *
 * @param mapValue  the data produced by map() for one locus
 * @param reduceSum the accumulator (mutated in place)
 * @return the same accumulator, now including mapValue
 */
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
    reduceSum.addAll( mapValue );
    return reduceSum;
}
|
||||
|
||||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
||||
// deal with truth information
|
||||
int nCallsAtTruth = VariantGaussianMixtureModel.countCallsAtTruth(dataManager.data, Double.NEGATIVE_INFINITY);
|
||||
logger.info(String.format("Truth set size is %d, raw calls at these sites %d, maximum sensitivity of %.2f",
|
||||
nTruthSites, nCallsAtTruth, (100.0*nCallsAtTruth / Math.max(nTruthSites, nCallsAtTruth))));
|
||||
|
||||
VariantGaussianMixtureModel.SelectionMetric metric = null;
|
||||
switch ( SELECTION_METRIC_TYPE ) {
|
||||
case NOVEL_TITV: metric = new VariantGaussianMixtureModel.NovelTiTvMetric(TARGET_TITV); break;
|
||||
case TRUTH_SENSITIVITY: metric = new VariantGaussianMixtureModel.TruthSensitivityMetric(nCallsAtTruth); break;
|
||||
default: throw new ReviewedStingException("BUG: unexpected SelectionMetricType " + SELECTION_METRIC_TYPE);
|
||||
}
|
||||
|
||||
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, metric, DEBUG_FILE );
|
||||
tranchesStream.print(Tranche.tranchesString(tranches));
|
||||
|
||||
// Execute Rscript command to plot the optimization curve
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
final String rScriptTranchesCommandLine = PATH_TO_RSCRIPT + " " + PATH_TO_RESOURCES + "plot_Tranches.R" + " " + TRANCHES_FILE.getAbsolutePath() + " " + TARGET_TITV;
|
||||
logger.info( rScriptTranchesCommandLine );
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
try {
|
||||
Process p;
|
||||
p = Runtime.getRuntime().exec( rScriptTranchesCommandLine );
|
||||
p.waitFor();
|
||||
} catch ( Exception e ) {
|
||||
Utils.warnUser("Unable to execute the RScript command. While not critical to the calculations themselves, the script outputs a report that is extremely useful for confirming that the recalibration proceded as expected. We highly recommend trying to rerun the script manually if possible.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue