Merge pull request #165 from broadinstitute/md_vqsr_improvements

Two simple VQSR usability improvements
This commit is contained in:
Ryan Poplin 2013-04-16 06:26:38 -07:00
commit 0ee21e58c3
4 changed files with 72 additions and 44 deletions

View File

@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
} }
//--------------------------------------------------------------------------------------------------------------- //---------------------------------------------------------------------------------------------------------------
@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
// Annotate the new record with its VQSLOD and the worst performing annotation // Annotate the new record with its VQSLOD and the worst performing annotation
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY));
if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY))
builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
for( int i = tranches.size() - 1; i >= 0; i-- ) { for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i); final Tranche tranche = tranches.get(i);

View File

@ -127,6 +127,22 @@ public class VariantDataManager {
} }
} }
/**
* Convert a normalized point to it's original annotation value
*
* norm = (orig - mu) / sigma
* orig = norm * sigma + mu
*
* @param normalizedValue the normalized value of the ith annotation
* @param annI the index of the annotation value
* @return the denormalized value for the annotation
*/
public double denormalizeDatum(final double normalizedValue, final int annI) {
final double mu = meanVector[annI];
final double sigma = varianceVector[annI];
return normalizedValue * sigma + mu;
}
public void addTrainingSet( final TrainingSet trainingSet ) { public void addTrainingSet( final TrainingSet trainingSet ) {
trainingSets.add( trainingSet ); trainingSets.add( trainingSet );
} }
@ -335,19 +351,17 @@ public class VariantDataManager {
}} ); }} );
// create dummy alleles to be used // create dummy alleles to be used
final List<Allele> alleles = new ArrayList<Allele>(2); final List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));
alleles.add(Allele.create("N", true));
alleles.add(Allele.create("<VQSR>", false));
// to be used for the important INFO tags
final HashMap<String, Object> attributes = new HashMap<String, Object>(3);
for( final VariantDatum datum : data ) { for( final VariantDatum datum : data ) {
attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles);
attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); builder.attribute(VCFConstants.END_KEY, datum.loc.getStop());
attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes);
recalWriter.add(builder.make()); recalWriter.add(builder.make());
} }
} }

View File

@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive traning set
private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R";
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
@ -433,14 +435,20 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
stream.print("surface <- c("); stream.print("surface <- c(");
for( final VariantDatum datum : fakeData ) { for( final VariantDatum datum : fakeData ) {
stream.print(String.format("%.3f, %.3f, %.3f, ", datum.annotations[iii], datum.annotations[jjj], Math.min(4.0, Math.max(-4.0, datum.lod)))); stream.print(String.format("%.3f, %.3f, %.3f, ",
dataManager.denormalizeDatum(datum.annotations[iii], iii),
dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
Math.min(4.0, Math.max(-4.0, datum.lod))));
} }
stream.println("NA,NA,NA)"); stream.println("NA,NA,NA)");
stream.println("s <- matrix(surface,ncol=3,byrow=T)"); stream.println("s <- matrix(surface,ncol=3,byrow=T)");
stream.print("data <- c("); stream.print("data <- c(");
for( final VariantDatum datum : randomData ) { for( final VariantDatum datum : randomData ) {
stream.print(String.format("%.3f, %.3f, %.3f, %d, %d,", datum.annotations[iii], datum.annotations[jjj], (datum.lod < lodCutoff ? -1.0 : 1.0), stream.print(String.format("%.3f, %.3f, %.3f, %d, %d,",
dataManager.denormalizeDatum(datum.annotations[iii], iii),
dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
(datum.lod < lodCutoff ? -1.0 : 1.0),
(datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1))); (datum.atAntiTrainingSite ? -1 : (datum.atTrainingSite ? 1 : 0)), (datum.isKnown ? 1 : -1)));
} }
stream.println("NA,NA,NA,NA,1)"); stream.println("NA,NA,NA,NA,1)");

View File

@ -73,8 +73,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"4d08c8eee61dd1bdea8c5765f34e41f0", // tranches "4d08c8eee61dd1bdea8c5765f34e41f0", // tranches
"ce396fe4045e020b61471f6737dff36e", // recal file "83756d1058ee3c816edf643148ae20df", // recal file
"4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF "06353a59fa4857135b5a63ea0791b035"); // cut VCF
@DataProvider(name = "VRTest") @DataProvider(name = "VRTest")
public Object[][] createData1() { public Object[][] createData1() {
@ -122,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"6a1eef4d02857dbb117a15420b5c0ce9", // tranches "6a1eef4d02857dbb117a15420b5c0ce9", // tranches
"238366af66b05b6d21749e799c25353d", // recal file "ea85f0293e9c016bd1bbe3c2977905d8", // recal file
"3928d6bc5007becf52312ade70f14c42"); // cut VCF "4cab4a11130e2f84bd5fe4f9981811bd"); // cut VCF
@DataProvider(name = "VRBCFTest") @DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() { public Object[][] createVRBCFTest() {
@ -174,14 +174,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest indelUnfiltered = new VRTest( VRTest indelUnfiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
"b7589cd098dc153ec64c02dcff2838e4", // tranches "b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file "6091d44e5c750620c6d5493864eeb160", // recal file
"b2c6827be592c24a4692b1753edc7d23"); // cut VCF "ef4c7931f134c1c860864772d69dd89c"); // cut VCF
VRTest indelFiltered = new VRTest( VRTest indelFiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
"b7589cd098dc153ec64c02dcff2838e4", // tranches "b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file "6091d44e5c750620c6d5493864eeb160", // recal file
"5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF "f8decee61f409b6041856c5a20e3865d"); // cut VCF
@DataProvider(name = "VRIndelTest") @DataProvider(name = "VRIndelTest")
public Object[][] createTestVariantRecalibratorIndel() { public Object[][] createTestVariantRecalibratorIndel() {
@ -239,7 +239,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -o %s" + " -o %s" +
" -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
" -recalFile " + privateTestDir + "VQSR.mixedTest.recal", " -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b")); Arrays.asList("8d2e886523c050e0ea2952cbbde4cc26"));
executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
} }
} }