Variant records coming out of the VQSR are now annotated with which input annotation was most divergent from the Gaussian mixture model. This gives a general sense for why each variant was removed from the callset.
This commit is contained in:
parent
c65e52f88a
commit
17ff5bb094
|
|
@ -55,7 +55,6 @@ import java.util.*;
|
|||
|
||||
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
|
|
@ -86,6 +85,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
final private List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
final private Set<String> inputNames = new HashSet<String>();
|
||||
final private NestedHashMap lodMap = new NestedHashMap();
|
||||
final private NestedHashMap annotationMap = new NestedHashMap();
|
||||
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -124,6 +124,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
|
|
@ -149,6 +150,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
for ( final String line : new XReadLines( RECAL_FILE ) ) {
|
||||
final String[] vals = line.split(",");
|
||||
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
|
||||
annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
|
||||
|
|
@ -174,11 +176,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
|
|||
String filterString = null;
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
|
||||
final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
|
||||
if( lod == null ) {
|
||||
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
|
||||
}
|
||||
|
||||
// Annotate the new record with its VQSLOD and the worst performing annotation
|
||||
attrs.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
|
||||
attrs.put(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
|
||||
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
final Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
|
|
|
|||
|
|
@ -190,6 +190,19 @@ public class GaussianMixtureModel {
|
|||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
|
||||
if(datum.isNull[iii]) { return null; }
|
||||
|
||||
final Normal normal = new Normal(0.0, 1.0, null);
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numVals = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
|
|
|
|||
|
|
@ -254,7 +254,9 @@ public class VariantDataManager {
|
|||
|
||||
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
RECAL_FILE.println(String.format("%s,%d,%d,%.4f", datum.contig, datum.start, datum.stop, datum.lod));
|
||||
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
|
||||
datum.contig, datum.start, datum.stop, datum.lod,
|
||||
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
|
|||
public String contig;
|
||||
public int start;
|
||||
public int stop;
|
||||
public int worstAnnotation;
|
||||
public MultivariateGaussian assignment; // used in K-means implementation
|
||||
|
||||
public int compareTo( final VariantDatum other ) {
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ import java.util.*;
|
|||
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
|
||||
|
||||
public static final String VQS_LOD_KEY = "VQSLOD";
|
||||
public static final String CULPRIT_KEY = "culprit";
|
||||
|
||||
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
|
||||
|
||||
|
|
@ -232,6 +233,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
final GaussianMixtureModel badModel = engine.generateModel( dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ) );
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
||||
|
||||
final ExpandingArrayList<VariantDatum> randomData = dataManager.getRandomDataForPlotting( 6000 );
|
||||
|
||||
|
|
|
|||
|
|
@ -57,6 +57,23 @@ public class VariantRecalibratorEngine {
|
|||
}
|
||||
}
|
||||
|
||||
public void calculateWorstPerformingAnnotation( final List<VariantDatum> data, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel ) {
|
||||
for( final VariantDatum datum : data ) {
|
||||
int worstAnnotation = -1;
|
||||
double minProb = Double.MAX_VALUE;
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii);
|
||||
final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii);
|
||||
if( goodProbLog10 != null && badProbLog10 != null ) {
|
||||
final double prob = goodProbLog10 - badProbLog10;
|
||||
if(prob < minProb) { minProb = prob; worstAnnotation = iii; }
|
||||
}
|
||||
}
|
||||
datum.worstAnnotation = worstAnnotation;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Methods used for generating a GaussianMixtureModel
|
||||
/////////////////////////////
|
||||
|
|
|
|||
|
|
@ -26,8 +26,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
|
||||
"d33212a84368e821cbedecd4f59756d6", // tranches
|
||||
"a35cd067f378442eee8cd5edeea92be0", // recal file
|
||||
"126d52843f4a57199ee97750ffc16a07"); // cut VCF
|
||||
"4652dca41222bebdf9d9fda343b2a835", // recal file
|
||||
"5350b1a4c1250cf3b77ca45327c04711"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue