VariantRecalibrator's VQSR.vcf now contains NEG/POS labels
-- It's useful to know which sites were used to train the model. The recal_file emitted by VR now contains VCF INFO field annotations labeling each site that was used in the positive or negative training set with POSITIVE_TRAIN_SITE and/or NEGATIVE_TRAIN_SITE.
-- Update MD5s, all of which changed now that the recal file and the resulting applied VCFs carry these pos / neg labels.
This commit is contained in:
parent
ee51195bf5
commit
564fe36d22
|
|
@ -200,6 +200,8 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds ratio of being a true variant versus being false under the trained gaussian mixture model"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
|
||||
hInfo.add(new VCFInfoHeaderLine(VariantRecalibrator.NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -243,6 +245,10 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
// Annotate the new record with its VQSLOD and the worst performing annotation
|
||||
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
|
||||
builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY));
|
||||
if ( recalDatum.hasAttribute(VariantRecalibrator.POSITIVE_LABEL_KEY))
|
||||
builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
|
||||
if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
|
||||
builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
|
||||
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
final Tranche tranche = tranches.get(i);
|
||||
|
|
|
|||
|
|
@ -335,19 +335,17 @@ public class VariantDataManager {
|
|||
}} );
|
||||
|
||||
// create dummy alleles to be used
|
||||
final List<Allele> alleles = new ArrayList<Allele>(2);
|
||||
alleles.add(Allele.create("N", true));
|
||||
alleles.add(Allele.create("<VQSR>", false));
|
||||
|
||||
// to be used for the important INFO tags
|
||||
final HashMap<String, Object> attributes = new HashMap<String, Object>(3);
|
||||
final List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));
|
||||
|
||||
for( final VariantDatum datum : data ) {
|
||||
attributes.put(VCFConstants.END_KEY, datum.loc.getStop());
|
||||
attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
|
||||
attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
|
||||
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles);
|
||||
builder.attribute(VCFConstants.END_KEY, datum.loc.getStop());
|
||||
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
|
||||
builder.attribute(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
|
||||
|
||||
if ( datum.atTrainingSite ) builder.attribute(VariantRecalibrator.POSITIVE_LABEL_KEY, true);
|
||||
if ( datum.atAntiTrainingSite ) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes);
|
||||
recalWriter.add(builder.make());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -135,6 +135,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
|
||||
public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
|
||||
public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set
|
||||
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set
|
||||
private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R";
|
||||
|
||||
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
|
||||
|
|
|
|||
|
|
@ -73,8 +73,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf",
|
||||
"4d08c8eee61dd1bdea8c5765f34e41f0", // tranches
|
||||
"ce396fe4045e020b61471f6737dff36e", // recal file
|
||||
"4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF
|
||||
"83756d1058ee3c816edf643148ae20df", // recal file
|
||||
"06353a59fa4857135b5a63ea0791b035"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRTest")
|
||||
public Object[][] createData1() {
|
||||
|
|
@ -122,8 +122,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
|
||||
VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf",
|
||||
"6a1eef4d02857dbb117a15420b5c0ce9", // tranches
|
||||
"238366af66b05b6d21749e799c25353d", // recal file
|
||||
"3928d6bc5007becf52312ade70f14c42"); // cut VCF
|
||||
"ea85f0293e9c016bd1bbe3c2977905d8", // recal file
|
||||
"4cab4a11130e2f84bd5fe4f9981811bd"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRBCFTest")
|
||||
public Object[][] createVRBCFTest() {
|
||||
|
|
@ -174,14 +174,14 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
VRTest indelUnfiltered = new VRTest(
|
||||
validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as .
|
||||
"b7589cd098dc153ec64c02dcff2838e4", // tranches
|
||||
"a04a9001f62eff43d363f4d63769f3ee", // recal file
|
||||
"b2c6827be592c24a4692b1753edc7d23"); // cut VCF
|
||||
"6091d44e5c750620c6d5493864eeb160", // recal file
|
||||
"ef4c7931f134c1c860864772d69dd89c"); // cut VCF
|
||||
|
||||
VRTest indelFiltered = new VRTest(
|
||||
validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS
|
||||
"b7589cd098dc153ec64c02dcff2838e4", // tranches
|
||||
"a04a9001f62eff43d363f4d63769f3ee", // recal file
|
||||
"5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF
|
||||
"6091d44e5c750620c6d5493864eeb160", // recal file
|
||||
"f8decee61f409b6041856c5a20e3865d"); // cut VCF
|
||||
|
||||
@DataProvider(name = "VRIndelTest")
|
||||
public Object[][] createTestVariantRecalibratorIndel() {
|
||||
|
|
@ -239,7 +239,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" +
|
||||
" -recalFile " + privateTestDir + "VQSR.mixedTest.recal",
|
||||
Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b"));
|
||||
Arrays.asList("8d2e886523c050e0ea2952cbbde4cc26"));
|
||||
executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue