VariantRecalibrator's VQSR.vcf now contains NEG/POS labels

-- It's useful to know which sites were used to train the model. The recal_file emitted by VR now contains VCF INFO field annotations labeling each site that was used in the positive training model with POSITIVE_TRAIN_SITE and each site that was used in the negative training model with NEGATIVE_TRAIN_SITE (a site may carry both)
-- Update MD5s, which all changed now that the recal file and the resulting applied vcfs all have these pos / neg labels
This commit is contained in:
Mark DePristo 2013-04-05 09:28:46 -04:00
parent ee51195bf5
commit 564fe36d22
4 changed files with 25 additions and 19 deletions

View File

@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
}
//---------------------------------------------------------------------------------------------------------------
@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
// Annotate the new record with its VQSLOD and the worst performing annotation
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY));
if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY))
builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);

View File

@ -335,19 +335,17 @@ public class VariantDataManager {
}} );
// create dummy alleles to be used
final List<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(Allele.create("N", true));
alleles.add(Allele.create("<VQSR>", false));
// to be used for the important INFO tags
final HashMap<String, Object> attributes = new HashMap<String, Object>(3);
final List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));
for( final VariantDatum datum : data ) {
attributes.put(VCFConstants.END_KEY, datum.loc.getStop());
attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles);
builder.attribute(VCFConstants.END_KEY, datum.loc.getStop());
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes);
recalWriter.add(builder.make());
}
}

View File

@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // INFO flag: this variant was used to build the negative training set of bad variants
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // INFO flag: this variant was used to build the positive training set of good variants
private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R";
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();

View File

@ -73,8 +73,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
"4d08c8eee61dd1bdea8c5765f34e41f0", // tranches
"ce396fe4045e020b61471f6737dff36e", // recal file
"4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF
"83756d1058ee3c816edf643148ae20df", // recal file
"06353a59fa4857135b5a63ea0791b035"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -122,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
"6a1eef4d02857dbb117a15420b5c0ce9", // tranches
"238366af66b05b6d21749e799c25353d", // recal file
"3928d6bc5007becf52312ade70f14c42"); // cut VCF
"ea85f0293e9c016bd1bbe3c2977905d8", // recal file
"4cab4a11130e2f84bd5fe4f9981811bd"); // cut VCF
@DataProvider(name = "VRBCFTest")
public Object[][] createVRBCFTest() {
@ -174,14 +174,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest indelUnfiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
"b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file
"b2c6827be592c24a4692b1753edc7d23"); // cut VCF
"6091d44e5c750620c6d5493864eeb160", // recal file
"ef4c7931f134c1c860864772d69dd89c"); // cut VCF
VRTest indelFiltered = new VRTest(
validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
"b7589cd098dc153ec64c02dcff2838e4", // tranches
"a04a9001f62eff43d363f4d63769f3ee", // recal file
"5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF
"6091d44e5c750620c6d5493864eeb160", // recal file
"f8decee61f409b6041856c5a20e3865d"); // cut VCF
@DataProvider(name = "VRIndelTest")
public Object[][] createTestVariantRecalibratorIndel() {
@ -239,7 +239,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -o %s" +
" -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
" -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b"));
Arrays.asList("8d2e886523c050e0ea2952cbbde4cc26"));
executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
}
}