Checking in everyone's changes to the variant recalibrator. We now calculate the variant quality score as a LOD score between the true and false hypothesis. Allele Count prior is changed to be (1 - 0.5^ac). Known prior breaks out HapMap sites
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3952 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
07addf1187
commit
a8d37da10b
|
|
@ -67,7 +67,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
private String CLUSTER_FILENAME = "optimizer.cluster";
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
|
||||
private int MAX_GAUSSIANS = 30;
|
||||
private int MAX_GAUSSIANS = 4;
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
|
||||
private int MAX_ITERATIONS = 200;
|
||||
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
|
||||
|
|
@ -87,9 +87,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
|
||||
private boolean FORCE_INDEPENDENT = false;
|
||||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
|
||||
private double STD_THRESHOLD = 6.0;
|
||||
private double STD_THRESHOLD = 4.5;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
|
||||
private double QUAL_THRESHOLD = 300.0;
|
||||
private double QUAL_THRESHOLD = 1000.0;
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
|
||||
private double SHRINKAGE = 0.0001;
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)
|
||||
|
|
|
|||
|
|
@ -330,7 +330,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
if ( this.singletonFPRate == -1 )
|
||||
return alleleCountFactorArray[alleleCount];
|
||||
else {
|
||||
return 1 - Math.pow(singletonFPRate, alleleCount);
|
||||
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -345,7 +345,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
}
|
||||
}
|
||||
|
||||
for( int ttt = 0; ttt < 20; ttt++ ) {
|
||||
for( int ttt = 0; ttt < 60; ttt++ ) {
|
||||
performKMeansIteration( data );
|
||||
}
|
||||
}
|
||||
|
|
@ -528,12 +528,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
return value;
|
||||
}
|
||||
|
||||
public final double evaluateVariantLog10( final VariantContext vc ) {
|
||||
public final double evaluateVariant( final VariantContext vc ) {
|
||||
final double[] pVarInCluster = new double[maxGaussians];
|
||||
final double[] annotations = new double[dataManager.numAnnotations];
|
||||
|
||||
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
|
||||
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, true );
|
||||
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, false );
|
||||
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
|
||||
}
|
||||
|
||||
|
|
@ -544,12 +544,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
|
||||
}
|
||||
|
||||
double sumLog10 = Math.log10(sum);
|
||||
if( Double.isInfinite(sumLog10) || Double.isNaN(sumLog10) ) {
|
||||
//logger.warn("pTrueLog10 = -Infinity, capped at -20");
|
||||
sumLog10 = -20;
|
||||
}
|
||||
return sumLog10;
|
||||
return sum;
|
||||
}
|
||||
|
||||
public final void outputClusterReports( final String outputPrefix ) {
|
||||
|
|
@ -839,22 +834,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
|
|||
|
||||
final int numAnnotations = annotations.length;
|
||||
final double mult[] = new double[numAnnotations];
|
||||
|
||||
final double evalGaussianPDFLog10[] = new double[maxGaussians];
|
||||
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
|
||||
final double sigmaVals[][] = sigmaInverse[kkk].getArray();
|
||||
double sum = 0.0;
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
mult[jjj] = 0.0;
|
||||
double value = 0.0;
|
||||
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
|
||||
mult[jjj] += (annotations[ppp] - mu[kkk][ppp]) * sigmaVals[ppp][jjj];
|
||||
final double myMu = mu[kkk][ppp];
|
||||
final double myAnn = annotations[ppp];
|
||||
final double mySigma = sigmaVals[ppp][jjj];
|
||||
value += (myAnn - myMu) * mySigma;
|
||||
}
|
||||
mult[jjj] = value;
|
||||
}
|
||||
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
|
||||
sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
|
||||
}
|
||||
|
||||
final double denomLog10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
|
||||
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + (( -0.5 * sum ) / Math.log(10.0)) - denomLog10);
|
||||
evalGaussianPDFLog10[kkk] = (( -0.5 * sum ) / Math.log(10.0)) - denomLog10;
|
||||
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + evalGaussianPDFLog10[kkk]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -69,12 +69,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private boolean IGNORE_ALL_INPUT_FILTERS = false;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="hapmap_prior", shortName="hapmapPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
|
||||
private double HAPMAP_QUAL_PRIOR = 15.0;
|
||||
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
|
||||
private int KNOWN_QUAL_PRIOR = 9;
|
||||
private double KNOWN_QUAL_PRIOR = 10.0;
|
||||
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
|
||||
private int NOVEL_QUAL_PRIOR = 2;
|
||||
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
|
||||
private double QUALITY_SCALE_FACTOR = 1600.0;
|
||||
private double NOVEL_QUAL_PRIOR = 2.0;
|
||||
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
|
||||
private String OUTPUT_PREFIX = "optimizer";
|
||||
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
|
||||
|
|
@ -90,7 +90,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
|
||||
private double QUAL_STEP = 0.1;
|
||||
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
|
||||
private double SINGLETON_FP_RATE = -1;
|
||||
private double SINGLETON_FP_RATE = 0.5;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -98,6 +98,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
private VariantGaussianMixtureModel theModel = null;
|
||||
private VCFWriter vcfWriter;
|
||||
private Set<String> ignoreInputFilterSet = null;
|
||||
private int numUnstable = 0;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -185,11 +186,27 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
variantDatum.isKnown = dbsnp != null;
|
||||
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
|
||||
|
||||
final double acPriorLog10 = Math.log10(theModel.getAlleleCountPrior( variantDatum.alleleCount ));
|
||||
final double knownPriorLog10 = Math.log10( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
|
||||
final double pTrue = Math.pow(10.0, theModel.evaluateVariantLog10( vc ) + acPriorLog10 + knownPriorLog10);
|
||||
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
|
||||
double knownPrior = variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR);
|
||||
if(variantDatum.isKnown && DbSNPHelper.isHapmap( dbsnp ) ) {
|
||||
knownPrior = QualityUtils.qualToProb(HAPMAP_QUAL_PRIOR);
|
||||
}
|
||||
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
|
||||
|
||||
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
|
||||
throw new StingException("Some is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
|
||||
}
|
||||
|
||||
double pVar = theModel.evaluateVariant( vc );
|
||||
|
||||
if( pVar > 1.0 ) {
|
||||
pVar = 0.99;
|
||||
numUnstable++;
|
||||
}
|
||||
|
||||
final double lod = (Math.log10(totalPrior) + Math.log10(pVar)) - ((Math.log10(1.0 - totalPrior)) + Math.log10(1.0 - pVar));
|
||||
variantDatum.qual = 10.0 * QualityUtils.lodToPhredScaleErrorRate(lod);
|
||||
|
||||
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
|
||||
mapList.add( variantDatum );
|
||||
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
|
|
@ -228,6 +245,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
|
||||
vcfWriter.close();
|
||||
|
||||
if( numUnstable > 0 ) {
|
||||
logger.warn("WARNING: Found " + numUnstable + " variant(s) with pVar > 1, Most likely numerical instability during clustering !!!!!");
|
||||
}
|
||||
|
||||
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
|
||||
reduceSum.clear(); // Don't need this ever again, clean up some memory
|
||||
|
|
|
|||
|
|
@ -30,7 +30,11 @@ public class QualityUtils {
|
|||
}
|
||||
|
||||
static public double qualToProb(int qual) {
|
||||
return 1.0 - Math.pow(10.0, ((double) qual)/-10.0);
|
||||
return qualToProb( (double)qual );
|
||||
}
|
||||
|
||||
static public double qualToProb(double qual) {
|
||||
return 1.0 - Math.pow(10.0, qual/(-10.0));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -76,6 +80,10 @@ public class QualityUtils {
|
|||
static public double phredScaleErrorRate(double errorRate) {
|
||||
return -10.0*Math.log10(errorRate);
|
||||
}
|
||||
|
||||
static public double lodToPhredScaleErrorRate(double lod) {
|
||||
return phredScaleErrorRate(1.0 / (Math.pow(10.0, lod) + 1.0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a quality score, capped at max qual.
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testGenerateVariantClusters() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "72558d2c49bb94dc59e9d4146fe0bc05" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testVariantRecalibrator() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "0cb94385ced8a7a537d7bc79f82c01d3" );
|
||||
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "4570c93c96de61342d75c1c658e2d5c1" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String vcf = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue