Checking in everyone's changes to the variant recalibrator. We now calculate the variant quality score as a LOD score between the true and false hypotheses. The allele count prior is changed to be (1 - 0.5^ac). The known-sites prior now breaks out HapMap sites separately.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3952 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
rpoplin 2010-08-05 14:12:19 +00:00
parent 07addf1187
commit a8d37da10b
5 changed files with 60 additions and 28 deletions

View File

@ -67,7 +67,7 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
private String CLUSTER_FILENAME = "optimizer.cluster";
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians to try during Bayesian clustering", required=false)
private int MAX_GAUSSIANS = 30;
private int MAX_GAUSSIANS = 4;
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of iterations to be performed when clustering. Clustering will normally end when convergence is detected.", required=false)
private int MAX_ITERATIONS = 200;
@Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required = false)
@ -87,9 +87,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
@Argument(fullName="forceIndependent", shortName="forceIndependent", doc="Force off-diagonal entries in the covariance matrix to be zero.", required=false)
private boolean FORCE_INDEPENDENT = false;
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for clustering.", required=false)
private double STD_THRESHOLD = 6.0;
private double STD_THRESHOLD = 4.5;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for clustering.", required=false)
private double QUAL_THRESHOLD = 300.0;
private double QUAL_THRESHOLD = 1000.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
private double SHRINKAGE = 0.0001;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algoirthm.", required=false)

View File

@ -330,7 +330,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
if ( this.singletonFPRate == -1 )
return alleleCountFactorArray[alleleCount];
else {
return 1 - Math.pow(singletonFPRate, alleleCount);
return Math.min(0.95, 1.0 - Math.pow(singletonFPRate, alleleCount)); //TODO -- define the vals
}
}
@ -345,7 +345,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
}
}
for( int ttt = 0; ttt < 20; ttt++ ) {
for( int ttt = 0; ttt < 60; ttt++ ) {
performKMeansIteration( data );
}
}
@ -528,12 +528,12 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return value;
}
public final double evaluateVariantLog10( final VariantContext vc ) {
public final double evaluateVariant( final VariantContext vc ) {
final double[] pVarInCluster = new double[maxGaussians];
final double[] annotations = new double[dataManager.numAnnotations];
for( int jjj = 0; jjj < dataManager.numAnnotations; jjj++ ) {
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, true );
final double value = decodeAnnotation( dataManager.annotationKeys.get(jjj), vc, false );
annotations[jjj] = (value - dataManager.meanVector[jjj]) / dataManager.varianceVector[jjj];
}
@ -544,12 +544,7 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
sum += pVarInCluster[kkk]; // * clusterTruePositiveRate[kkk];
}
double sumLog10 = Math.log10(sum);
if( Double.isInfinite(sumLog10) || Double.isNaN(sumLog10) ) {
//logger.warn("pTrueLog10 = -Infinity, capped at -20");
sumLog10 = -20;
}
return sumLog10;
return sum;
}
public final void outputClusterReports( final String outputPrefix ) {
@ -839,22 +834,30 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
final int numAnnotations = annotations.length;
final double mult[] = new double[numAnnotations];
final double evalGaussianPDFLog10[] = new double[maxGaussians];
for( int kkk = 0; kkk < maxGaussians; kkk++ ) {
final double sigmaVals[][] = sigmaInverse[kkk].getArray();
double sum = 0.0;
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
mult[jjj] = 0.0;
double value = 0.0;
for( int ppp = 0; ppp < numAnnotations; ppp++ ) {
mult[jjj] += (annotations[ppp] - mu[kkk][ppp]) * sigmaVals[ppp][jjj];
final double myMu = mu[kkk][ppp];
final double myAnn = annotations[ppp];
final double mySigma = sigmaVals[ppp][jjj];
value += (myAnn - myMu) * mySigma;
}
mult[jjj] = value;
}
for( int jjj = 0; jjj < numAnnotations; jjj++ ) {
sum += mult[jjj] * (annotations[jjj] - mu[kkk][jjj]);
}
final double denomLog10 = Math.log10(Math.pow(2.0 * Math.PI, ((double)numAnnotations) / 2.0)) + Math.log10(Math.pow(determinant[kkk], 0.5));
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + (( -0.5 * sum ) / Math.log(10.0)) - denomLog10);
evalGaussianPDFLog10[kkk] = (( -0.5 * sum ) / Math.log(10.0)) - denomLog10;
pVarInCluster[kkk] = Math.pow(10.0, pClusterLog10[kkk] + evalGaussianPDFLog10[kkk]);
}
}

View File

@ -69,12 +69,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private boolean IGNORE_ALL_INPUT_FILTERS = false;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="hapmap_prior", shortName="hapmapPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private double HAPMAP_QUAL_PRIOR = 15.0;
@Argument(fullName="known_prior", shortName="knownPrior", doc="A prior on the quality of known variants, a phred scaled probability of being true.", required=false)
private int KNOWN_QUAL_PRIOR = 9;
private double KNOWN_QUAL_PRIOR = 10.0;
@Argument(fullName="novel_prior", shortName="novelPrior", doc="A prior on the quality of novel variants, a phred scaled probability of being true.", required=false)
private int NOVEL_QUAL_PRIOR = 2;
@Argument(fullName="quality_scale_factor", shortName="qScale", doc="Multiply all final quality scores by this value. Needed to normalize the quality scores.", required=false)
private double QUALITY_SCALE_FACTOR = 1600.0;
private double NOVEL_QUAL_PRIOR = 2.0;
@Argument(fullName="output_prefix", shortName="output", doc="The prefix added to output VCF file name and optimization curve pdf file name", required=false)
private String OUTPUT_PREFIX = "optimizer";
@Argument(fullName="clusterFile", shortName="clusterFile", doc="The output cluster file", required=true)
@ -90,7 +90,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Argument(fullName="quality_step", shortName="qStep", doc="Resolution in QUAL units for optimization and tranche calculations", required=false)
private double QUAL_STEP = 0.1;
@Argument(fullName="singleton_fp_rate", shortName="fp_rate", doc="Prior expectation that a singleton call would be a FP", required=false)
private double SINGLETON_FP_RATE = -1;
private double SINGLETON_FP_RATE = 0.5;
/////////////////////////////
// Private Member Variables
@ -98,6 +98,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private VariantGaussianMixtureModel theModel = null;
private VCFWriter vcfWriter;
private Set<String> ignoreInputFilterSet = null;
private int numUnstable = 0;
//---------------------------------------------------------------------------------------------------------------
//
@ -185,11 +186,27 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
variantDatum.isKnown = dbsnp != null;
variantDatum.alleleCount = vc.getChromosomeCount(vc.getAlternateAllele(0)); // BUGBUG: assumes file has genotypes. Also, what to do about tri-allelic sites?
final double acPriorLog10 = Math.log10(theModel.getAlleleCountPrior( variantDatum.alleleCount ));
final double knownPriorLog10 = Math.log10( variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR) );
final double pTrue = Math.pow(10.0, theModel.evaluateVariantLog10( vc ) + acPriorLog10 + knownPriorLog10);
final double acPrior = theModel.getAlleleCountPrior( variantDatum.alleleCount );
double knownPrior = variantDatum.isKnown ? QualityUtils.qualToProb(KNOWN_QUAL_PRIOR) : QualityUtils.qualToProb(NOVEL_QUAL_PRIOR);
if(variantDatum.isKnown && DbSNPHelper.isHapmap( dbsnp ) ) {
knownPrior = QualityUtils.qualToProb(HAPMAP_QUAL_PRIOR);
}
final double totalPrior = 1.0 - ((1.0 - acPrior) * (1.0 - knownPrior));
if( MathUtils.compareDoubles(totalPrior, 1.0, 1E-8) == 0 || MathUtils.compareDoubles(totalPrior, 0.0, 1E-8) == 0 ) {
throw new StingException("Some is wrong with the prior that was entered by the user: Prior = " + totalPrior); // TODO - fix this up later
}
double pVar = theModel.evaluateVariant( vc );
if( pVar > 1.0 ) {
pVar = 0.99;
numUnstable++;
}
final double lod = (Math.log10(totalPrior) + Math.log10(pVar)) - ((Math.log10(1.0 - totalPrior)) + Math.log10(1.0 - pVar));
variantDatum.qual = 10.0 * QualityUtils.lodToPhredScaleErrorRate(lod);
variantDatum.qual = QUALITY_SCALE_FACTOR * QualityUtils.phredScaleErrorRate( Math.max(1.0 - pTrue, 0.000000001) ); // BUGBUG: don't have a normalizing constant, so need to scale up qual scores arbitrarily
mapList.add( variantDatum );
Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
@ -228,6 +245,10 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public void onTraversalDone( ExpandingArrayList<VariantDatum> reduceSum ) {
vcfWriter.close();
if( numUnstable > 0 ) {
logger.warn("WARNING: Found " + numUnstable + " variant(s) with pVar > 1, Most likely numerical instability during clustering !!!!!");
}
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory

View File

@ -30,7 +30,11 @@ public class QualityUtils {
}
/**
 * Convert a Phred-scaled quality score to the probability of being true.
 *
 * @param qual Phred-scaled quality score (integer form)
 * @return probability that the call is correct: 1 - 10^(-qual/10)
 */
static public double qualToProb(int qual) {
    // Delegate to the double overload so the formula lives in exactly one place.
    return qualToProb( (double)qual );
}

/**
 * Convert a Phred-scaled quality score to the probability of being true.
 *
 * @param qual Phred-scaled quality score
 * @return probability that the call is correct: 1 - 10^(-qual/10)
 */
static public double qualToProb(double qual) {
    return 1.0 - Math.pow(10.0, qual/(-10.0));
}
/**
@ -76,6 +80,10 @@ public class QualityUtils {
/**
 * Convert an error probability to the Phred scale: Q = -10 * log10(P(error)).
 *
 * @param errorRate probability that the call is wrong
 * @return the Phred-scaled representation of that error rate
 */
static public double phredScaleErrorRate(double errorRate) {
    final double log10ErrorRate = Math.log10(errorRate);
    return log10ErrorRate * -10.0;
}

/**
 * Convert a log10 likelihood ratio (LOD) into a Phred-scaled error rate.
 * The error probability implied by the LOD is P(false) = 1 / (10^lod + 1).
 *
 * @param lod log10 odds ratio of the true vs. false hypothesis
 * @return the Phred-scaled error rate corresponding to the LOD
 */
static public double lodToPhredScaleErrorRate(double lod) {
    final double impliedErrorRate = 1.0 / (1.0 + Math.pow(10.0, lod));
    return phredScaleErrorRate(impliedErrorRate);
}
/**
* Return a quality score, capped at max qual.

View File

@ -12,7 +12,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testGenerateVariantClusters() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "72558d2c49bb94dc59e9d4146fe0bc05" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "05d1692624a28cd9446feac8fd2408ab" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();
@ -37,7 +37,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
@Test
public void testVariantRecalibrator() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "0cb94385ced8a7a537d7bc79f82c01d3" );
e.put( validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "4570c93c96de61342d75c1c658e2d5c1" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
String vcf = entry.getKey();