Variant records coming out of the VQSR are now annotated with which input annotation was most divergent from the Gaussian mixture model. This gives a general sense for why each variant was removed from the callset.

This commit is contained in:
Ryan Poplin 2011-07-02 09:55:35 -04:00
parent c65e52f88a
commit 17ff5bb094
7 changed files with 45 additions and 4 deletions

View File

@ -55,7 +55,6 @@ import java.util.*;
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Inputs
/////////////////////////////
@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
final private List<Tranche> tranches = new ArrayList<Tranche>();
final private Set<String> inputNames = new HashSet<String>();
final private NestedHashMap lodMap = new NestedHashMap();
final private NestedHashMap annotationMap = new NestedHashMap();
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
//---------------------------------------------------------------------------------------------------------------
@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
final TreeSet<String> samples = new TreeSet<String>();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
for ( final String line : new XReadLines( RECAL_FILE ) ) {
final String[] vals = line.split(",");
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
String filterString = null;
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
if( lod == null ) {
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
}
// Annotate the new record with its VQSLOD and the worst performing annotation
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);
if( lod >= tranche.minVQSLod ) {

View File

@ -190,6 +190,19 @@ public class GaussianMixtureModel {
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
}
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
if(datum.isNull[iii]) { return null; }
final Normal normal = new Normal(0.0, 1.0, null);
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
}
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numVals = 0;
double sumPVarInGaussian = 0.0;

View File

@ -254,7 +254,9 @@ public class VariantDataManager {
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod));
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
datum.contig, datum.start, datum.stop, datum.lod,
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
}
}
}

View File

@ -24,6 +24,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
public String contig;
public int start;
public int stop;
public int worstAnnotation;
public MultivariateGaussian assignment; // used in K-means implementation
public int compareTo( final VariantDatum other ) {

View File

@ -60,6 +60,7 @@ import java.util.*;
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
public static final String VQS_LOD_KEY = "VQSLOD";
public static final String CULPRIT_KEY = "culprit";
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
engine.evaluateData( dataManager.getData(), goodModel, false );
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
engine.evaluateData( dataManager.getData(), badModel, true );
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );

View File

@ -57,6 +57,23 @@ public class VariantRecalibratorEngine {
}
}
public void calculateWorstPerformingAnnotation( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
for( final VariantDatum datum : data ) {
int worstAnnotation = -1;
double minProb = Double.MAX_VALUE;
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii);
final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii);
if( goodProbLog10 != null && badProbLog10 != null ) {
final double prob = goodProbLog10 - badProbLog10;
if(prob < minProb) { minProb = prob; worstAnnotation = iii; }
}
}
datum.worstAnnotation = worstAnnotation;
}
}
/////////////////////////////
// Private Methods used for generating a GaussianMixtureModel
/////////////////////////////

View File

@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
"d33212a84368e821cbedecd4f59756d6", // tranches
"a35cd067f378442eee8cd5edeea92be0", // recal file
"126d52843f4a57199ee97750ffc16a07"); // cut VCF
"4652dca41222bebdf9d9fda343b2a835", // recal file
"5350b1a4c1250cf3b77ca45327c04711"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {