From 5592e4ead03278622cec68ed483a5f14c0249c90 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Mon, 7 Dec 2015 08:38:20 -0500 Subject: [PATCH] Add new -AS mode to run VQSR (both VariantRecalibrator and ApplyRecalibration) in an allele-specific manner --- .../walkers/annotator/AnnotationUtils.java | 17 +- .../haplotypecaller/HaplotypeCaller.java | 13 + .../ApplyRecalibration.java | 363 ++++++++++++++++-- .../VariantDataManager.java | 58 ++- .../variantrecalibration/VariantDatum.java | 4 + .../VariantRecalibrator.java | 94 +++-- ...VariantRecalibratorArgumentCollection.java | 7 + .../VariantRecalibratorEngine.java | 4 +- ...ntRecalibrationWalkersIntegrationTest.java | 36 ++ .../gatk/utils/variant/GATKVCFConstants.java | 4 + .../utils/variant/GATKVCFHeaderLines.java | 3 + 11 files changed, 530 insertions(+), 73 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java index 81e645a72..ebf479369 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java @@ -79,11 +79,11 @@ public class AnnotationUtils { public static final int WARNINGS_LOGGED_SIZE = 3; /** - * Helper function to parse the list into the annotation string - * @param valueList the ArrayList returned from StrandBiasBySample.annotate() - * @return the array used by the per-sample Strand Bias annotation + * Helper function to convert a List of Doubles to a comma-separated String + * @param valueList the ArrayList with Double data + * @return a comma-separated String */ - protected static String encodeValueList( final List valueList, final String precisionFormat ) { + public static String encodeValueList( final List valueList, final String precisionFormat ) { List outputList = new ArrayList<>(); for (Double d : valueList) { outputList.add(String.format(precisionFormat, d)); @@ -91,6 +91,15 @@ public class AnnotationUtils { return StringUtils.join(outputList, ","); } + /** + * Helper function to convert a List of Strings to a comma-separated String + * @param stringList the ArrayList with String data + * @return a comma-separated String + */ + public static String encodeStringList( final List stringList) { + return StringUtils.join(stringList, ","); + } + /** * Checks if the walker is compatible with allele-specific annotations */ diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index 7d47f27a7..99deff690 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -180,6 +180,19 @@ import java.util.*; * -o output.raw.snps.indels.g.vcf * * + *

Single-sample GVCF calling on DNAseq with allele-specific annotations (for allele-specific cohort analysis workflow)

+ *
+ *   java -jar GenomeAnalysisTK.jar \
+ *     -R reference.fasta \
+ *     -T HaplotypeCaller \
+ *     -I sample1.bam \
+ *     --emitRefConfidence GVCF \
+ *     [--dbsnp dbSNP.vcf] \
+ *     [-L targets.interval_list] \
+ *     -G Standard -G AS_Standard \
+ *     -o output.raw.snps.indels.AS.g.vcf
+ * 
+ * *

Variant-only calling on DNAseq

*
  *   java -jar GenomeAnalysisTK.jar \
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java
index b6ef38ade..7474ecb1f 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java
@@ -51,6 +51,8 @@
 
 package org.broadinstitute.gatk.tools.walkers.variantrecalibration;
 
+import htsjdk.variant.variantcontext.Allele;
+import org.broadinstitute.gatk.tools.walkers.annotator.AnnotationUtils;
 import org.broadinstitute.gatk.utils.commandline.*;
 import org.broadinstitute.gatk.engine.CommandLineGATK;
 import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@@ -74,6 +76,8 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
 
 import java.io.File;
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Apply a score cutoff to filter variants based on a recalibration table
@@ -126,6 +130,26 @@ import java.util.*;
  *   -o path/to/output.recalibrated.filtered.vcf
  * 
* + *

Allele-specific usage

+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ *   -T ApplyRecalibration \
+ *   -R reference.fasta \
+ *   -input rawVariants.withASannotations.vcf \
+ *   -AS \
+ *   --ts_filter_level 99.0 \
+ *   -tranchesFile path/to/output.AS.tranches \
+ *   -recalFile path/to/output.AS.recal \
+ *   -mode SNP \
+ *   -o path/to/output.recalibrated.ASfiltered.vcf
+ * 
+ * Each allele will be annotated by its corresponding entry in the AS_FilterStatus INFO field annotation. Allele-specific VQSLOD and culprit are also carried through from VariantRecalibrator and stored in the AS_VQSLOD and AS_culprit INFO fields, respectively. + * The site-level filter is set to the most lenient of any of the allele filters. That is, if one allele passes, the whole site will be PASS. If no alleles pass, the site-level filter will be set to the lowest sensitivity tranche among all the alleles. + * + * Note that the .tranches and .recal files should be derived from an allele-specific run of VariantRecalibrator + * Also note that the AS_culprit, AS_FilterStatus, and AS_VQSLOD fields will have placeholder values (NA or NaN) for alleles of a type that have not yet been processed by ApplyRecalibration + * The spanning deletion allele (*) will not be recalibrated because it represents missing data. Its VQSLOD will remain NaN and it's culprit and FilterStatus will be NA. + * *

Caveats

* *
    @@ -145,6 +169,9 @@ public class ApplyRecalibration extends RodWalker implements T public static final String LOW_VQSLOD_FILTER_NAME = "LOW_VQSLOD"; private final double DEFAULT_VQSLOD_CUTOFF = 0.0; + boolean foundSNPTranches = false; + boolean foundINDELTranches = false; + ///////////////////////////// // Inputs ///////////////////////////// @@ -169,6 +196,14 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) protected Double TS_FILTER_LEVEL = null; + + /** + * Filter the input file based on allele-specific recalibration data. See tool docs for site-level and allele-level filtering details. + * Requires a .recal file produced using an allele-specific run of VariantRecalibrator + */ + @Argument(fullName="useAlleleSpecificAnnotations", shortName="AS", doc="If specified, the tool will attempt to apply a filter to each allele based on the input tranches and allele-specific .recal file.", required=false) + public boolean useASannotations = false; + @Advanced @Argument(fullName="lodCutoff", shortName="lodCutoff", doc="The VQSLOD score below which to start filtering", required=false) protected Double VQSLOD_CUTOFF = null; @@ -191,6 +226,12 @@ public class ApplyRecalibration extends RodWalker implements T final private List tranches = new ArrayList<>(); final private Set inputNames = new HashSet<>(); final private Set ignoreInputFilterSet = new TreeSet<>(); + final static private String listPrintSeparator = ","; + final static private String trancheFilterString = "VQSRTranche"; + final static private String arrayParseRegex = "[\\[\\]\\s]"; + final static private String emptyStringValue = "NA"; + final static private String emptyFloatValue = "NaN"; + //--------------------------------------------------------------------------------------------------------------- // @@ -219,11 +260,19 @@ public class ApplyRecalibration extends RodWalker implements T // setup the header fields final Set hInfo = new HashSet<>(); - hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); + final Set inputHeaders = GATKVCFUtils.getHeaderFields(getToolkit(), inputNames); + hInfo.addAll(inputHeaders); addVQSRStandardHeaderLines(hInfo); + if (useASannotations) + addAlleleSpecificVQSRHeaderLines(hInfo); + + checkForPreviousApplyRecalRun(Collections.unmodifiableSet(inputHeaders)); + final TreeSet samples = new TreeSet<>(); samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)); + //generate headers from tranches file + //TODO: throw away old tranche headers if we're ignoring filters if( TS_FILTER_LEVEL != null ) { // if the user specifies both ts_filter_level and lodCutoff then throw a user error if( VQSLOD_CUTOFF != null ) { @@ -255,6 +304,20 @@ public class ApplyRecalibration extends RodWalker implements T vcfWriter.writeHeader(vcfHeader); } + private boolean trancheIntervalIsValid(final String sensitivityLimits) { + final String[] vals = sensitivityLimits.split("to"); + if(vals.length != 2) + return false; + try { + double lowerLimit = Double.parseDouble(vals[0]); + double upperLimit = Double.parseDouble(vals[1].replace("+","")); //why does our last tranche end with 100+? Is there anything greater than 100 percent? Really??? + } + catch(NumberFormatException e) { + throw new UserException("Poorly formatted tranche filter name does not contain two sensitivity interval end points."); + } + return true; + } + public static void addVQSRStandardHeaderLines(final Set hInfo) { hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.VQS_LOD_KEY)); @@ -263,6 +326,38 @@ public class ApplyRecalibration extends RodWalker implements T hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NEGATIVE_LABEL_KEY)); } + public static void addAlleleSpecificVQSRHeaderLines(final Set hInfo) { + hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_FILTER_STATUS_KEY)); + } + + /** + * Check the filter declarations in the input VCF header to see if any ApplyRecalibration mode has been run + * Here we assume that the tranches are named with a specific format: VQSRTranche[SNP|INDEL][lowerLimit]to[upperLimit] + * @param inputHeaders + */ + private void checkForPreviousApplyRecalRun(final Set inputHeaders) { + for(final VCFHeaderLine header : inputHeaders) { + if(header instanceof VCFFilterHeaderLine) { + final String filterName = ((VCFFilterHeaderLine)header).getID(); + if(filterName.length() < 12 || !filterName.substring(0, 11).equalsIgnoreCase(trancheFilterString)) { + continue; + } + if(filterName.charAt(11) == 'S') { + //for SNP tranches, get sensitivity limit + final String sensitivityLimits = filterName.substring(14); + if(trancheIntervalIsValid(sensitivityLimits)) + foundSNPTranches = true; + } + else if(filterName.charAt(11) == 'I') { + //for INDEL tranches, get sensitivity limit + final String sensitivityLimits = filterName.substring(16); + if(trancheIntervalIsValid(sensitivityLimits)) + foundINDELTranches = true; + } + } + } + } + //--------------------------------------------------------------------------------------------------------------- // // map @@ -280,38 +375,26 @@ public class ApplyRecalibration extends RodWalker implements T for( final VariantContext vc : VCs ) { - if( VariantDataManager.checkVariationClass( vc, MODE ) && (IGNORE_ALL_FILTERS || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { + final boolean evaluateThisVariant = useASannotations || VariantDataManager.checkVariationClass( vc, MODE ); + final boolean variantIsNotFiltered = IGNORE_ALL_FILTERS || vc.isNotFiltered() || (!ignoreInputFilterSet.isEmpty() && ignoreInputFilterSet.containsAll(vc.getFilters())); //vc.isNotFiltered is true for PASS; vc.filtersHaveBeenApplied covers PASS and filters + if( evaluateThisVariant && variantIsNotFiltered) { - final VariantContext recalDatum = getMatchingRecalVC(vc, recals); - if( recalDatum == null ) { - throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); + + String filterString; + final VariantContextBuilder builder = new VariantContextBuilder(vc); + if (!useASannotations) { + filterString = doSiteSpecificFiltering(vc, recals, builder); + } + else { //allele-specific mode + filterString = doAlleleSpecificFiltering(vc, recals, builder); } - final String lodString = recalDatum.getAttributeAsString(GATKVCFConstants.VQS_LOD_KEY, null); - if( lodString == null ) { - throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); - } - final double lod; - try { - lod = Double.valueOf(lodString); - } catch (NumberFormatException e) { - throw new UserException("Encountered a malformed record in the input recal file. The lod is unreadable for the record at: " + vc ); - } - - VariantContextBuilder builder = new VariantContextBuilder(vc); - - // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(GATKVCFConstants.VQS_LOD_KEY, lod); - builder.attribute(GATKVCFConstants.CULPRIT_KEY, recalDatum.getAttribute(GATKVCFConstants.CULPRIT_KEY)); - if ( recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY)) - builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true); - if ( recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY)) - builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true); - - final String filterString = generateFilterString(lod); + //for both non-AS and AS modes: if( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { builder.passFilters(); + } else if(filterString.equals(VCFConstants.UNFILTERED)) { + builder.unfiltered(); } else { builder.filters(filterString); } @@ -328,6 +411,78 @@ public class ApplyRecalibration extends RodWalker implements T return 1; // This value isn't used for anything } + public double parseFilterLowerLimit(final String trancheFilter) { + final Pattern pattern = Pattern.compile("VQSRTranche\\S+(\\d+\\.\\d+)to(\\d+\\.\\d+)"); + final Matcher m = pattern.matcher(trancheFilter); + return m.find() ? Double.parseDouble(m.group(1)) : -1; + } + + /** + * Generate the VCF filter string for this record based on the ApplyRecalibration modes run so far + * @param vc the input VariantContext (with at least one ApplyRecalibration mode already run) + * @param bestLod best LOD from the alleles we've seen in this recalibration mode + * @return the String to use as the VCF filter field + */ + protected String generateFilterStringFromAlleles(final VariantContext vc, final double bestLod) { + String filterString = "."; + + final boolean bothModesWereRun = (MODE == VariantRecalibratorArgumentCollection.Mode.SNP && foundINDELTranches) || (MODE == VariantRecalibratorArgumentCollection.Mode.INDEL && foundSNPTranches); + final boolean onlyOneModeNeeded = !vc.isMixed() && VariantDataManager.checkVariationClass( vc, MODE ); + + //if both SNP and INDEL modes have not yet been run (and need to be), leave this variant as unfiltered and add the filters for the alleles in this mode to the INFO field + if (!bothModesWereRun && !onlyOneModeNeeded) { + return VCFConstants.UNFILTERED; + } + + //if both SNP and INDEL modes have been run or the site is not mixed, generate a filter string for this site based on both models + + //pull out the allele filter status from the info field (there may be more than one entry in the list if there were multiple snp/indel alleles assessed in the other mode) + final String prevFilterStatus = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, null); + + //if this site hasn't had a filter applied yet + if (prevFilterStatus != null && !prevFilterStatus.equals(VCFConstants.UNFILTERED)) { + final String prevAllelesFilterStatusString = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, null); + final String[] prevAllelesFilterStatusList = prevAllelesFilterStatusString.split(listPrintSeparator); + //start with the current best allele filter as the most lenient filter across all modes and all alleles + String mostLenientFilterName = generateFilterString(bestLod); + //if the current mode's best allele passes the tranche filter, then let the whole site pass + if (mostLenientFilterName.equals(VCFConstants.PASSES_FILTERS_v4)) { + filterString = mostLenientFilterName; + } + //if the current mode's best allele does not pass the tranche filter, compare the most lenient filter of this mode with those from the previous mode + else { + double mostLenientSensitivityLowerLimit = parseFilterLowerLimit(mostLenientFilterName); + for (int i = 0; i < prevAllelesFilterStatusList.length; i++) { + final String alleleFilterString = prevAllelesFilterStatusList[i].replaceAll(arrayParseRegex, "").trim(); + //if any allele from the previous mode passed the tranche filter, then let the whole site pass + if (alleleFilterString.equals(VCFConstants.PASSES_FILTERS_v4)) { //this allele is PASS + mostLenientFilterName = alleleFilterString; + break; + } + //if there's no PASS, then we need to parse the filters to find out how lenient they are + else { + final double alleleLowerLimit = parseFilterLowerLimit(alleleFilterString); + if (alleleLowerLimit == -1) + continue; + if (alleleLowerLimit < mostLenientSensitivityLowerLimit) { + mostLenientSensitivityLowerLimit = alleleLowerLimit; + mostLenientFilterName = alleleFilterString; + } + } + + } + filterString = mostLenientFilterName; + } + } + + //if both modes have been run, but the previous mode didn't apply a filter, use the current mode's best allele VQSLOD filter (shouldn't get run, but just in case) + else { + filterString = generateFilterString(bestLod); + } + + return filterString; + } + /** * Generate the VCF filter string for this record based on the provided lod score * @param lod non-null double @@ -358,16 +513,164 @@ public class ApplyRecalibration extends RodWalker implements T return filterString; } - private static VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs) { + private VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs, final Allele allele) { for( final VariantContext recalVC : recalVCs ) { if ( target.getEnd() == recalVC.getEnd() ) { - return recalVC; + if (!useASannotations) + return recalVC; + else if (allele.equals(recalVC.getAlternateAllele(0))) + return recalVC; } } - return null; } + /** + * + * @param altIndex current alt allele + * @param prevCulpritList culprits from previous ApplyRecalibration run + * @param prevLodList lods from previous ApplyRecalibration run + * @param prevASfiltersList AS_filters from previous ApplyRecalibration run + * @param culpritString + * @param lodString + * @param AS_filterString + */ + private void updateAnnotationsWithoutRecalibrating(final int altIndex, final String[] prevCulpritList, final String[] prevLodList, final String[] prevASfiltersList, + final List culpritString, final List lodString, final List AS_filterString) { + if (foundINDELTranches || foundSNPTranches) { + if (altIndex < prevCulpritList.length) { + culpritString.add(prevCulpritList[altIndex].replaceAll(arrayParseRegex, "").trim()); + lodString.add(prevLodList[altIndex].replaceAll(arrayParseRegex, "").trim()); + AS_filterString.add(prevASfiltersList[altIndex].replaceAll(arrayParseRegex, "").trim()); + } + } else { //if the other allele type hasn't been processed yet, make sure there are enough entries + culpritString.add(emptyStringValue); + lodString.add(emptyFloatValue); + AS_filterString.add(emptyStringValue); + } + } + + /** + * Calculate the allele-specific filter status of vc + * @param vc + * @param recals + * @param builder is modified by adding attributes + * @return a String with the filter status for this site + */ + private String doAlleleSpecificFiltering(final VariantContext vc, final List recals, final VariantContextBuilder builder) { + double bestLod = VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE; + final List culpritStrings = new ArrayList<>(); + final List lodStrings = new ArrayList<>(); + final List AS_filterStrings = new ArrayList<>(); + + String[] prevCulpritList = null; + String[] prevLodList = null; + String[] prevASfiltersList = null; + + //get VQSR annotations from previous run of ApplyRecalibration, if applicable + if(foundINDELTranches || foundSNPTranches) { + final String prevCulprits = vc.getAttributeAsString(GATKVCFConstants.AS_CULPRIT_KEY,""); + prevCulpritList = prevCulprits.isEmpty()? new String[0] : prevCulprits.split(listPrintSeparator); + final String prevLodString = vc.getAttributeAsString(GATKVCFConstants.AS_VQS_LOD_KEY,""); + prevLodList = prevLodString.isEmpty()? new String[0] : prevLodString.split(listPrintSeparator); + final String prevASfilters = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY,""); + prevASfiltersList = prevASfilters.isEmpty()? new String[0] : prevASfilters.split(listPrintSeparator); + } + + //for each allele in the current VariantContext + for (int altIndex = 0; altIndex < vc.getNAlleles()-1; altIndex++) { + final Allele allele = vc.getAlternateAllele(altIndex); + + //if the current allele is not part of this recalibration mode, add its annotations to the list and go to the next allele + if (!VariantDataManager.checkVariationClass(vc, allele, MODE)) { + updateAnnotationsWithoutRecalibrating(altIndex, prevCulpritList, prevLodList, prevASfiltersList, culpritStrings, lodStrings, AS_filterStrings); + continue; + } + + //if the current allele does need to have recalibration applied... + + //initialize allele-specific VQSR annotation data with values for spanning deletion + String alleleLodString = emptyFloatValue; + String alleleFilterString = emptyStringValue; + String alleleCulpritString = emptyStringValue; + + //if it's not a spanning deletion, replace those allele strings with the real values + if (!allele.equals(Allele.SPAN_DEL)) { + VariantContext recalDatum = getMatchingRecalVC(vc, recals, allele); + if (recalDatum == null) { + throw new UserException("Encountered input allele which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants with flag -AS. First seen at: " + vc); + } + + //compare VQSLODs for all alleles in the current mode for filtering later + final double lod = recalDatum.getAttributeAsDouble(GATKVCFConstants.VQS_LOD_KEY, VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE); + if (lod > bestLod) + bestLod = lod; + + alleleLodString = String.format("%.4f", lod); + alleleFilterString = generateFilterString(lod); + alleleCulpritString = recalDatum.getAttributeAsString(GATKVCFConstants.CULPRIT_KEY, "."); + + if(recalDatum != null) { + if (recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true); + if (recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true); + } + } + + //append per-allele VQSR annotations + lodStrings.add(alleleLodString); + AS_filterStrings.add(alleleFilterString); + culpritStrings.add(alleleCulpritString); + } + + // Annotate the new record with its VQSLOD, AS_FilterStatus, and the worst performing annotation + if(!AS_filterStrings.isEmpty() ) + builder.attribute(GATKVCFConstants.AS_FILTER_STATUS_KEY, AnnotationUtils.encodeStringList(AS_filterStrings)); + if(!lodStrings.isEmpty()) + builder.attribute(GATKVCFConstants.AS_VQS_LOD_KEY, AnnotationUtils.encodeStringList(lodStrings)); + if(!culpritStrings.isEmpty()) + builder.attribute(GATKVCFConstants.AS_CULPRIT_KEY, AnnotationUtils.encodeStringList(culpritStrings)); + + return generateFilterStringFromAlleles(vc, bestLod); + } + + /** + * Calculate the filter status for a given VariantContext using the combined data from all alleles at a site + * @param vc + * @param recals + * @param builder is modified by adding attributes + * @return a String with the filter status for this site + */ + private String doSiteSpecificFiltering(final VariantContext vc, final List recals, final VariantContextBuilder builder) { + VariantContext recalDatum = getMatchingRecalVC(vc, recals, null); + if( recalDatum == null ) { + throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); + } + + final String lodString = recalDatum.getAttributeAsString(GATKVCFConstants.VQS_LOD_KEY, null); + if( lodString == null ) { + throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); + } + final double lod; + try { + lod = Double.valueOf(lodString); + } catch (NumberFormatException e) { + throw new UserException("Encountered a malformed record in the input recal file. The lod is unreadable for the record at: " + vc ); + } + + builder.attribute(GATKVCFConstants.VQS_LOD_KEY, lod); + builder.attribute(GATKVCFConstants.CULPRIT_KEY, recalDatum.getAttribute(GATKVCFConstants.CULPRIT_KEY)); + if(recalDatum != null) { + if (recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true); + if (recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY)) + builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true); + } + + return generateFilterString(lod); + } + //--------------------------------------------------------------------------------------------------------------- // // reduce diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java index b315ebd7c..32c7da051 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java @@ -58,7 +58,6 @@ import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.vcf.VCFConstants; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.collections.ExpandingArrayList; @@ -85,7 +84,7 @@ public class VariantDataManager { protected final static Logger logger = Logger.getLogger(VariantDataManager.class); protected final List trainingSets; private static final double SAFETY_OFFSET = 0.01; //To use for example as 1/(X + SAFETY_OFFSET) to protect against dividing or taking log of X=0. - private static final double PRECISION = 0.01; //To use mainly with MathUrils.compareDoubles(a,b,PRECISON) + private static final double PRECISION = 0.01; //To use mainly with MathUtils.compareDoubles(a,b,PRECISION) public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = Collections.emptyList(); @@ -344,7 +343,7 @@ public class VariantDataManager { int iii = 0; for( final String key : annotationKeys ) { isNull[iii] = false; - annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC ); + annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC, datum ); if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; } iii++; } @@ -356,19 +355,31 @@ public class VariantDataManager { return Math.log((x - xmin)/(xmax - x)); } - private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac ) { + private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac, final VariantDatum datum ) { double value; final double LOG_OF_TWO = 0.6931472; try { - value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); + //if we're in allele-specific mode and an allele-specific annotation has been requested, parse the appropriate value from the list + if(vrac.useASannotations && annotationKey.startsWith(GATKVCFConstants.ALLELE_SPECIFIC_PREFIX)) { + final List valueList = vc.getAttributeAsList(annotationKey); + if (vc.hasAllele(datum.alternateAllele)) { + final int altIndex = vc.getAlleleIndex(datum.alternateAllele)-1; //-1 is to convert the index from all alleles (including reference) to just alternate alleles + value = Double.parseDouble((String)valueList.get(altIndex)); + } + //if somehow our alleles got mixed up + else + throw new IllegalStateException("VariantDatum allele " + datum.alternateAllele + " is not contained in the input VariantContext."); + } + else + value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln - if( jitter && annotationKey.equalsIgnoreCase("MQ")) { + if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.HAPLOTYPE_SCORE_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.FISHER_STRAND_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_FILTER_STATUS_KEY)) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.STRAND_ODDS_RATIO_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_STRAND_ODDS_RATIO_KEY)) && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln + if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))) { if( vrac.MQ_CAP > 0) { value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET); if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) { @@ -394,6 +405,8 @@ public class VariantDataManager { for( final TrainingSet trainingSet : trainingSets ) { for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) { + if (VRAC.useASannotations && !doAllelesMatch(trainVC, datum)) + continue; if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) { datum.isKnown = datum.isKnown || trainingSet.isKnown; datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth; @@ -413,6 +426,11 @@ public class VariantDataManager { (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } + private boolean doAllelesMatch(final VariantContext trainVC, final VariantDatum datum) { + //only do this check in the allele-specific case, where each datum represents one allele + return datum.alternateAllele == null || trainVC.getAlternateAlleles().contains(datum.alternateAllele); + } + protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { switch( trainVC.getType() ) { case SNP: @@ -436,7 +454,21 @@ public class VariantDataManager { case BOTH: return true; default: - throw new ReviewedGATKException( "Encountered unknown recal mode: " + mode ); + throw new IllegalStateException( "Encountered unknown recal mode: " + mode ); + } + } + + protected static boolean checkVariationClass( final VariantContext evalVC, final Allele allele, final VariantRecalibratorArgumentCollection.Mode mode ) { + switch( mode ) { + case SNP: + //note that spanning deletions are considered SNPs by this logic + return evalVC.getReference().length() == allele.length(); + case INDEL: + return (evalVC.getReference().length() != allele.length()) || allele.isSymbolic(); + case BOTH: + return true; + default: + throw new IllegalStateException( "Encountered unknown recal mode: " + mode ); } } @@ -448,9 +480,11 @@ public class VariantDataManager { }} ); // create dummy alleles to be used - final List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); + List alleles = Arrays.asList(Allele.create("N", true), Allele.create("", false)); for( final VariantDatum datum : data ) { + if (VRAC.useASannotations) + alleles = Arrays.asList(datum.referenceAllele, datum.alternateAllele); //use the alleles to distinguish between multiallelics in AS mode VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles); builder.attribute(VCFConstants.END_KEY, datum.loc.getStop()); builder.attribute(GATKVCFConstants.VQS_LOD_KEY, String.format("%.4f", datum.lod)); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDatum.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDatum.java index 8fe785769..d452ddc9f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDatum.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDatum.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.GenomeLoc; import java.io.Serializable; @@ -79,8 +80,11 @@ public class VariantDatum { public int consensusCount; public GenomeLoc loc; public int worstAnnotation; + public double worstValue; public MultivariateGaussian assignment; // used in K-means implementation public boolean isAggregate; // this datum was provided to aid in modeling but isn't part of the input callset + public Allele referenceAllele; + public Allele alternateAllele; public static class VariantDatumLODComparator implements Comparator, Serializable { @Override diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index fda39f3d4..c8b2ae3d6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; +import htsjdk.variant.variantcontext.Allele; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -142,6 +143,26 @@ import Jama.Matrix; * -rscriptFile output.plots.R * * + *

    Allele-specfic usage

    + *
    + * java -Xmx4g -jar GenomeAnalysisTK.jar \
    + *   -T VariantRecalibrator \
    + *   -R reference.fasta \
    + *   -input raw_variants.withASannotations.vcf \
    + *   -AS \
    + *   -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
    + *   -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
    + *   -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf
    + *   -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
    + *   -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
    + *   -mode SNP \
    + *   -recalFile output.AS.recal \
    + *   -tranchesFile output.AS.tranches \
    + *   -rscriptFile output.plots.AS.R
    + * 
    + * The input VCF must have been produced using allele-specific annotations in HaplotypeCaller. + * Note that each allele will have a separate line in the output .recal file with its own VQSLOD and culprit that will be transferred to the final VCF in ApplyRecalibration. + * *

    Caveats

    * *
      @@ -201,14 +222,6 @@ public class VariantRecalibrator extends RodWalker variants, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context, final VariantContext vc, final Allele refAllele, final Allele altAllele) { + final VariantDatum datum = new VariantDatum(); + + // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need. + datum.referenceAllele = refAllele; + datum.alternateAllele = altAllele; + dataManager.decodeAnnotations(datum, vc, true); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine + datum.loc = (isInput ? getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null); + datum.originalQual = vc.getPhredScaledQual(); + datum.isSNP = vc.isSNP() && vc.isBiallelic(); + datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc); + datum.isAggregate = !isInput; + + // Loop through the training data sets and if they overlap this locus (and allele, if applicable) then update the prior and training status appropriately + dataManager.parseTrainingSets(tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC); + final double priorFactor = QualityUtils.qualToProb(datum.prior); + datum.prior = Math.log10(priorFactor) - Math.log10(1.0 - priorFactor); + + variants.add(datum); + } + + //--------------------------------------------------------------------------------------------------------------- // // reduce @@ -426,7 +468,7 @@ public class VariantRecalibrator extends RodWalker outputFiles = executeTest("testApplyRecalibrationAlleleSpecificSNPmode", spec).getFirst(); + setPDFsForDeletion(outputFiles); + } + + @Test(enabled = true) + public void testApplyRecalibrationAlleleSpecificINDELmode() { + final String base = "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 3:113005755-195507036" + + " -mode INDEL -AS" + + " -ts_filter_level 99.3" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + "VQSR.AStest.postSNPinput.vcf" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.AStest.indels.tranches" + + " -recalFile " + privateTestDir + "VQSR.AStest.indels.recal"; + + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("7c1df91b9827a5906d30db52e96922e1")); + final List outputFiles = executeTest("testApplyRecalibrationAlleleSpecificINDELmode", spec).getFirst(); + setPDFsForDeletion(outputFiles); + } + private void setPDFsForDeletion( final List walkerOutputFiles ) { for ( final File outputFile : walkerOutputFiles ) { new File(outputFile.getAbsolutePath() + ".pdf").deleteOnExit(); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index cd664faef..04059fa5e 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -35,6 +35,7 @@ import htsjdk.variant.variantcontext.Allele; public final class GATKVCFConstants { //INFO keys + public static final String ALLELE_SPECIFIC_PREFIX = "AS_"; public static final String RAW_RMS_MAPPING_QUALITY_KEY = "RAW_MQ"; public static final String AS_RMS_MAPPING_QUALITY_KEY = "AS_MQ"; public static final String AS_RAW_RMS_MAPPING_QUALITY_KEY = "AS_RAW_MQ"; @@ -55,6 +56,7 @@ public final class GATKVCFConstants { public static final String GENOTYPE_AND_VALIDATE_STATUS_KEY = "callStatus"; public static final String CLIPPING_RANK_SUM_KEY = "ClippingRankSum"; public static final String CULPRIT_KEY = "culprit"; + public static final String AS_CULPRIT_KEY = "AS_culprit"; public static final String SPANNING_DELETIONS_KEY = "Dels"; public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants public static final String DOWNSAMPLED_KEY = "DS"; @@ -64,6 +66,7 @@ public final class GATKVCFConstants { public static final String FISHER_STRAND_KEY = "FS"; public static final String AS_FISHER_STRAND_KEY = "AS_FS"; public static final String FRACTION_INFORMATIVE_READS_KEY = "FractionInformativeReads"; + public static final String AS_FILTER_STATUS_KEY = "AS_FilterStatus"; public static final String AS_SB_TABLE_KEY = "AS_SB_TABLE"; public static final String GC_CONTENT_KEY = "GC"; public static final String GQ_MEAN_KEY = "GQ_MEAN"; @@ -123,6 +126,7 @@ public final class GATKVCFConstants { public static final String TUMOR_LOD_KEY = "TLOD"; //M2 public static final String VARIANT_TYPE_KEY = "VariantType"; public static final String VQS_LOD_KEY = "VQSLOD"; + public static final String AS_VQS_LOD_KEY = "AS_VQSLOD"; public static final String OXOG_ALT_F1R2_KEY = "ALT_F1R2"; public static final String OXOG_ALT_F2R1_KEY = "ALT_F2R1"; public static final String OXOG_REF_F1R2_KEY = "REF_F1R2"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index e7da2143c..8cd4efa54 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -122,6 +122,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(AS_MATE_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "allele specific Z-score from Wilcoxon rank sum test of mate mapping qualities")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_MATE_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "raw data for allele specific rank sum tes of mate mapping qualities")); addInfoLine(new VCFInfoHeaderLine(CLIPPING_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); + addInfoLine(new VCFInfoHeaderLine(AS_FILTER_STATUS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "Filter status for each allele, as assessed by ApplyRecalibration. Note that the VCF filter field will reflect the most lenient/sensitive status across all alleles.")); addInfoLine(new VCFInfoHeaderLine(FISHER_STRAND_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); addInfoLine(new VCFInfoHeaderLine(AS_FISHER_STRAND_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele")); addInfoLine(new VCFInfoHeaderLine(AS_SB_TABLE_KEY, 1, VCFHeaderLineType.String, "Allele-specific forward/reverse read counts for strand bias tests")); @@ -174,7 +175,9 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record")); addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record")); addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model")); + addInfoLine(new VCFInfoHeaderLine(AS_VQS_LOD_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, the log odds of being a true variant versus being false under the trained gaussian mixture model")); addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); + addInfoLine(new VCFInfoHeaderLine(AS_CULPRIT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, the annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out")); addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants")); addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants")); addInfoLine(new VCFInfoHeaderLine(RBP_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?"));