Merge pull request #1266 from broadinstitute/ldg_VQSRforAlleles

Add new -AS mode to run VQSR (both VariantRecalibrator and ApplyRecal…
This commit is contained in:
ldgauthier 2016-01-22 13:19:11 -05:00
commit ab5a8f0bb0
11 changed files with 530 additions and 73 deletions

View File

@ -79,11 +79,11 @@ public class AnnotationUtils {
public static final int WARNINGS_LOGGED_SIZE = 3;
/**
* Helper function to parse the list into the annotation string
* @param valueList the ArrayList returned from StrandBiasBySample.annotate()
* @return the array used by the per-sample Strand Bias annotation
* Helper function to convert a List of Doubles to a comma-separated String
* @param valueList the ArrayList with Double data
* @return a comma-separated String
*/
protected static String encodeValueList( final List<Double> valueList, final String precisionFormat ) {
public static String encodeValueList( final List<Double> valueList, final String precisionFormat ) {
List<String> outputList = new ArrayList<>();
for (Double d : valueList) {
outputList.add(String.format(precisionFormat, d));
@ -91,6 +91,15 @@ public class AnnotationUtils {
return StringUtils.join(outputList, ",");
}
/**
 * Helper function to convert a List of Strings to a single comma-separated String.
 *
 * Joining is delegated to Apache Commons {@code StringUtils.join}; elements are not
 * escaped, so callers must ensure the values themselves contain no commas.
 * NOTE(review): StringUtils.join presumably returns null for a null input list rather
 * than throwing — TODO confirm callers rely on (or avoid) that behavior.
 *
 * @param stringList the List with String data to join
 * @return a comma-separated String of the list elements
 */
public static String encodeStringList( final List<String> stringList) {
return StringUtils.join(stringList, ",");
}
/**
* Checks if the walker is compatible with allele-specific annotations
*/

View File

@ -180,6 +180,19 @@ import java.util.*;
* -o output.raw.snps.indels.g.vcf
* </pre>
*
* <h4>Single-sample GVCF calling on DNAseq with allele-specific annotations (for allele-specific cohort analysis workflow)</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
* -T HaplotypeCaller \
* -I sample1.bam \
* --emitRefConfidence GVCF \
* [--dbsnp dbSNP.vcf] \
* [-L targets.interval_list] \
* -G Standard -G AS_Standard \
* -o output.raw.snps.indels.AS.g.vcf
* </pre>
*
* <h4>Variant-only calling on DNAseq</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \

View File

@ -51,6 +51,8 @@
package org.broadinstitute.gatk.tools.walkers.variantrecalibration;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -74,6 +76,8 @@ import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.io.File;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Apply a score cutoff to filter variants based on a recalibration table
@ -126,6 +130,26 @@ import java.util.*;
* -o path/to/output.recalibrated.filtered.vcf
* </pre>
*
* <h3>Allele-specific usage</h3>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -T ApplyRecalibration \
* -R reference.fasta \
* -input rawVariants.withASannotations.vcf \
* -AS \
* --ts_filter_level 99.0 \
* -tranchesFile path/to/output.AS.tranches \
* -recalFile path/to/output.AS.recal \
* -mode SNP \
* -o path/to/output.recalibrated.ASfiltered.vcf
* </pre>
* Each allele will be annotated by its corresponding entry in the AS_FilterStatus INFO field annotation. Allele-specific VQSLOD and culprit are also carried through from VariantRecalibrator and stored in the AS_VQSLOD and AS_culprit INFO fields, respectively.
* The site-level filter is set to the most lenient of any of the allele filters. That is, if one allele passes, the whole site will be PASS. If no alleles pass, the site-level filter will be set to the lowest sensitivity tranche among all the alleles.
*
* Note that the .tranches and .recal files should be derived from an allele-specific run of VariantRecalibrator.
* Also note that the AS_culprit, AS_FilterStatus, and AS_VQSLOD fields will have placeholder values (NA or NaN) for alleles of a type that have not yet been processed by ApplyRecalibration.
* The spanning deletion allele (*) will not be recalibrated because it represents missing data. Its VQSLOD will remain NaN and its culprit and FilterStatus will be NA.
*
* <h3>Caveats</h3>
*
* <ul>
@ -145,6 +169,9 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
public static final String LOW_VQSLOD_FILTER_NAME = "LOW_VQSLOD";
private final double DEFAULT_VQSLOD_CUTOFF = 0.0;
boolean foundSNPTranches = false;
boolean foundINDELTranches = false;
/////////////////////////////
// Inputs
/////////////////////////////
@ -169,6 +196,14 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
/////////////////////////////
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
protected Double TS_FILTER_LEVEL = null;
/**
* Filter the input file based on allele-specific recalibration data. See tool docs for site-level and allele-level filtering details.
* Requires a .recal file produced using an allele-specific run of VariantRecalibrator
*/
@Argument(fullName="useAlleleSpecificAnnotations", shortName="AS", doc="If specified, the tool will attempt to apply a filter to each allele based on the input tranches and allele-specific .recal file.", required=false)
public boolean useASannotations = false;
@Advanced
@Argument(fullName="lodCutoff", shortName="lodCutoff", doc="The VQSLOD score below which to start filtering", required=false)
protected Double VQSLOD_CUTOFF = null;
@ -191,6 +226,12 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
final private List<Tranche> tranches = new ArrayList<>();
final private Set<String> inputNames = new HashSet<>();
final private Set<String> ignoreInputFilterSet = new TreeSet<>();
final static private String listPrintSeparator = ",";
final static private String trancheFilterString = "VQSRTranche";
final static private String arrayParseRegex = "[\\[\\]\\s]";
final static private String emptyStringValue = "NA";
final static private String emptyFloatValue = "NaN";
//---------------------------------------------------------------------------------------------------------------
//
@ -219,11 +260,19 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
// setup the header fields
final Set<VCFHeaderLine> hInfo = new HashSet<>();
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames));
final Set<VCFHeaderLine> inputHeaders = GATKVCFUtils.getHeaderFields(getToolkit(), inputNames);
hInfo.addAll(inputHeaders);
addVQSRStandardHeaderLines(hInfo);
if (useASannotations)
addAlleleSpecificVQSRHeaderLines(hInfo);
checkForPreviousApplyRecalRun(Collections.unmodifiableSet(inputHeaders));
final TreeSet<String> samples = new TreeSet<>();
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
//generate headers from tranches file
//TODO: throw away old tranche headers if we're ignoring filters
if( TS_FILTER_LEVEL != null ) {
// if the user specifies both ts_filter_level and lodCutoff then throw a user error
if( VQSLOD_CUTOFF != null ) {
@ -255,6 +304,20 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
vcfWriter.writeHeader(vcfHeader);
}
/**
 * Check that a tranche sensitivity interval string is well formed.
 *
 * Tranche intervals are encoded as "&lt;lower&gt;to&lt;upper&gt;" (e.g. "99.00to99.90").
 * The final, open-ended tranche is encoded with a trailing '+' (e.g. "99.90to100.00+"),
 * which is stripped before parsing.
 *
 * @param sensitivityLimits the interval portion of a tranche filter name
 * @return true if the string splits into two parseable endpoints; false if it does not
 *         contain exactly two "to"-separated parts
 * @throws UserException if the two parts are present but either endpoint is not a number
 */
private boolean trancheIntervalIsValid(final String sensitivityLimits) {
    final String[] vals = sensitivityLimits.split("to");
    if (vals.length != 2)
        return false;
    try {
        // Parse both endpoints purely for validation; the values themselves are unused.
        Double.parseDouble(vals[0]);
        Double.parseDouble(vals[1].replace("+", "")); // strip the open-ended marker on the last tranche
    }
    catch (NumberFormatException e) {
        throw new UserException("Poorly formatted tranche filter name does not contain two sensitivity interval end points.");
    }
    return true;
}
public static void addVQSRStandardHeaderLines(final Set<VCFHeaderLine> hInfo) {
hInfo.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.VQS_LOD_KEY));
@ -263,6 +326,38 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NEGATIVE_LABEL_KEY));
}
/**
 * Add the VCF INFO header line(s) needed for allele-specific ApplyRecalibration output.
 *
 * Only AS_FilterStatus is added here; the AS_VQSLOD and AS_culprit header lines are
 * presumably provided elsewhere (input or recal headers) — TODO confirm.
 *
 * @param hInfo the mutable set of header lines to add to
 */
public static void addAlleleSpecificVQSRHeaderLines(final Set<VCFHeaderLine> hInfo) {
hInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_FILTER_STATUS_KEY));
}
/**
 * Check the filter declarations in the input VCF header to see if any ApplyRecalibration
 * mode has already been run, setting foundSNPTranches / foundINDELTranches accordingly.
 *
 * Tranche filters are assumed to be named VQSRTranche[SNP|INDEL][lowerLimit]to[upperLimit];
 * the offsets below are derived from {@code trancheFilterString} and the mode names so the
 * prefix constant and the substring positions cannot drift apart.
 *
 * @param inputHeaders the header lines of the input VCF (not modified)
 */
private void checkForPreviousApplyRecalRun(final Set<VCFHeaderLine> inputHeaders) {
    final int prefixLength = trancheFilterString.length(); // "VQSRTranche"
    for (final VCFHeaderLine header : inputHeaders) {
        if (!(header instanceof VCFFilterHeaderLine)) {
            continue;
        }
        final String filterName = ((VCFFilterHeaderLine) header).getID();
        // Must extend beyond the prefix and start with it (case-insensitive).
        if (filterName.length() <= prefixLength || !filterName.substring(0, prefixLength).equalsIgnoreCase(trancheFilterString)) {
            continue;
        }
        final char modeChar = filterName.charAt(prefixLength);
        if (modeChar == 'S') {
            // SNP tranche: sensitivity limits follow "VQSRTrancheSNP"
            final String sensitivityLimits = filterName.substring(prefixLength + "SNP".length());
            if (trancheIntervalIsValid(sensitivityLimits))
                foundSNPTranches = true;
        }
        else if (modeChar == 'I') {
            // INDEL tranche: sensitivity limits follow "VQSRTrancheINDEL"
            final String sensitivityLimits = filterName.substring(prefixLength + "INDEL".length());
            if (trancheIntervalIsValid(sensitivityLimits))
                foundINDELTranches = true;
        }
    }
}
//---------------------------------------------------------------------------------------------------------------
//
// map
@ -280,38 +375,26 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
for( final VariantContext vc : VCs ) {
if( VariantDataManager.checkVariationClass( vc, MODE ) && (IGNORE_ALL_FILTERS || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
final boolean evaluateThisVariant = useASannotations || VariantDataManager.checkVariationClass( vc, MODE );
final boolean variantIsNotFiltered = IGNORE_ALL_FILTERS || vc.isNotFiltered() || (!ignoreInputFilterSet.isEmpty() && ignoreInputFilterSet.containsAll(vc.getFilters())); //vc.isNotFiltered is true for PASS; vc.filtersHaveBeenApplied covers PASS and filters
if( evaluateThisVariant && variantIsNotFiltered) {
final VariantContext recalDatum = getMatchingRecalVC(vc, recals);
if( recalDatum == null ) {
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
String filterString;
final VariantContextBuilder builder = new VariantContextBuilder(vc);
if (!useASannotations) {
filterString = doSiteSpecificFiltering(vc, recals, builder);
}
else { //allele-specific mode
filterString = doAlleleSpecificFiltering(vc, recals, builder);
}
final String lodString = recalDatum.getAttributeAsString(GATKVCFConstants.VQS_LOD_KEY, null);
if( lodString == null ) {
throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc );
}
final double lod;
try {
lod = Double.valueOf(lodString);
} catch (NumberFormatException e) {
throw new UserException("Encountered a malformed record in the input recal file. The lod is unreadable for the record at: " + vc );
}
VariantContextBuilder builder = new VariantContextBuilder(vc);
// Annotate the new record with its VQSLOD and the worst performing annotation
builder.attribute(GATKVCFConstants.VQS_LOD_KEY, lod);
builder.attribute(GATKVCFConstants.CULPRIT_KEY, recalDatum.getAttribute(GATKVCFConstants.CULPRIT_KEY));
if ( recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY))
builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true);
if ( recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY))
builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true);
final String filterString = generateFilterString(lod);
//for both non-AS and AS modes:
if( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
builder.passFilters();
} else if(filterString.equals(VCFConstants.UNFILTERED)) {
builder.unfiltered();
} else {
builder.filters(filterString);
}
@ -328,6 +411,78 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
return 1; // This value isn't used for anything
}
/**
 * Extract the lower sensitivity bound from a tranche filter name of the form
 * VQSRTranche&lt;SNP|INDEL&gt;&lt;lower&gt;to&lt;upper&gt; (e.g. VQSRTrancheSNP99.00to99.90).
 *
 * @param trancheFilter the tranche filter name to parse
 * @return the lower sensitivity limit, or -1 if the name does not match the expected format
 */
public double parseFilterLowerLimit(final String trancheFilter) {
    // Build the regex from trancheFilterString so it stays consistent with
    // checkForPreviousApplyRecalRun's prefix matching.
    // NOTE(review): the Pattern is recompiled on every call; hoist to a static final
    // field if this ever shows up in a hot path.
    final Pattern pattern = Pattern.compile(trancheFilterString + "\\S+(\\d+\\.\\d+)to(\\d+\\.\\d+)");
    final Matcher m = pattern.matcher(trancheFilter);
    return m.find() ? Double.parseDouble(m.group(1)) : -1;
}
/**
 * Generate the VCF filter string for this record based on the ApplyRecalibration modes run so far.
 *
 * The site-level filter is the most lenient of the per-allele filters: if any allele
 * (from this mode or a previous one) passes, the site is PASS; otherwise the tranche
 * with the lowest sensitivity lower bound wins.
 *
 * @param vc the input VariantContext (with at least one ApplyRecalibration mode already run)
 * @param bestLod best LOD among the alleles seen in the current recalibration mode
 * @return the String to use as the VCF filter field (UNFILTERED if the other mode still needs to run)
 */
protected String generateFilterStringFromAlleles(final VariantContext vc, final double bestLod) {
    final boolean bothModesWereRun = (MODE == VariantRecalibratorArgumentCollection.Mode.SNP && foundINDELTranches) || (MODE == VariantRecalibratorArgumentCollection.Mode.INDEL && foundSNPTranches);
    final boolean onlyOneModeNeeded = !vc.isMixed() && VariantDataManager.checkVariationClass( vc, MODE );

    // If both SNP and INDEL modes have not yet been run (and need to be), leave this variant
    // unfiltered; the per-allele filters for this mode live in the INFO field meanwhile.
    if (!bothModesWereRun && !onlyOneModeNeeded) {
        return VCFConstants.UNFILTERED;
    }

    // Pull the per-allele filter statuses recorded so far (may contain several entries if
    // multiple snp/indel alleles were assessed in the other mode). Fetch once and reuse —
    // the original code queried the same attribute twice.
    final String prevFilterStatus = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, null);

    // Only combine with previous statuses if an earlier mode actually applied a filter.
    if (prevFilterStatus != null && !prevFilterStatus.equals(VCFConstants.UNFILTERED)) {
        final String[] prevAllelesFilterStatusList = prevFilterStatus.split(listPrintSeparator);

        // Start with the current mode's best-allele filter as the candidate most lenient filter.
        String mostLenientFilterName = generateFilterString(bestLod);

        // If the current mode's best allele passes the tranche filter, the whole site passes.
        if (mostLenientFilterName.equals(VCFConstants.PASSES_FILTERS_v4)) {
            return mostLenientFilterName;
        }

        // Otherwise compare against the filters recorded by the previous mode.
        double mostLenientSensitivityLowerLimit = parseFilterLowerLimit(mostLenientFilterName);
        for (final String rawStatus : prevAllelesFilterStatusList) {
            final String alleleFilterString = rawStatus.replaceAll(arrayParseRegex, "").trim();
            // Any passing allele from the previous mode makes the whole site pass.
            if (alleleFilterString.equals(VCFConstants.PASSES_FILTERS_v4)) {
                mostLenientFilterName = alleleFilterString;
                break;
            }
            // No PASS: parse the tranche names to find the most lenient (lowest lower bound).
            final double alleleLowerLimit = parseFilterLowerLimit(alleleFilterString);
            if (alleleLowerLimit == -1)
                continue;
            if (alleleLowerLimit < mostLenientSensitivityLowerLimit) {
                mostLenientSensitivityLowerLimit = alleleLowerLimit;
                mostLenientFilterName = alleleFilterString;
            }
        }
        return mostLenientFilterName;
    }

    // Both modes ran but the previous one applied no filter (shouldn't happen in practice):
    // fall back to the current mode's best-allele filter.
    return generateFilterString(bestLod);
}
/**
* Generate the VCF filter string for this record based on the provided lod score
* @param lod non-null double
@ -358,16 +513,164 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
return filterString;
}
private static VariantContext getMatchingRecalVC(final VariantContext target, final List<VariantContext> recalVCs) {
/**
 * Find the recal-file record matching the target variant (and, in AS mode, the given allele).
 *
 * @param target the input variant to match by end position
 * @param recalVCs candidate records from the recal file
 * @param allele in allele-specific mode, the alt allele the record must carry; ignored otherwise
 * @return the matching recal record, or null if none matches
 */
private VariantContext getMatchingRecalVC(final VariantContext target, final List<VariantContext> recalVCs, final Allele allele) {
    for (final VariantContext recalVC : recalVCs) {
        if (target.getEnd() != recalVC.getEnd()) {
            continue;
        }
        // Non-AS mode: any record at this position matches.
        // AS mode: the record's single alt allele must equal the allele of interest.
        if (!useASannotations || allele.equals(recalVC.getAlternateAllele(0))) {
            return recalVC;
        }
    }
    return null;
}
/**
 * Carry forward (or placeholder) the allele-specific VQSR annotations for an alt allele
 * that is not being recalibrated in the current mode.
 *
 * If a previous ApplyRecalibration run was detected (foundSNPTranches or foundINDELTranches),
 * the allele's previous culprit/LOD/filter entries are copied across; otherwise placeholder
 * values (NA / NaN) are appended so the per-allele output lists stay aligned with the
 * alternate alleles.
 *
 * NOTE(review): when a previous run was detected but altIndex is out of range of the
 * previous lists, nothing is appended and the lists silently fall out of alignment —
 * TODO confirm this cannot occur with well-formed inputs.
 *
 * @param altIndex index of the current alt allele (0-based over alternate alleles)
 * @param prevCulpritList culprits from previous ApplyRecalibration run
 * @param prevLodList lods from previous ApplyRecalibration run
 * @param prevASfiltersList AS_filters from previous ApplyRecalibration run
 * @param culpritString output list of per-allele culprit entries (appended to)
 * @param lodString output list of per-allele LOD entries (appended to)
 * @param AS_filterString output list of per-allele filter status entries (appended to)
 */
private void updateAnnotationsWithoutRecalibrating(final int altIndex, final String[] prevCulpritList, final String[] prevLodList, final String[] prevASfiltersList,
final List<String> culpritString, final List<String> lodString, final List<String> AS_filterString) {
if (foundINDELTranches || foundSNPTranches) {
if (altIndex < prevCulpritList.length) {
// Strip the bracket/whitespace formatting left over from VCF list rendering.
culpritString.add(prevCulpritList[altIndex].replaceAll(arrayParseRegex, "").trim());
lodString.add(prevLodList[altIndex].replaceAll(arrayParseRegex, "").trim());
AS_filterString.add(prevASfiltersList[altIndex].replaceAll(arrayParseRegex, "").trim());
}
} else { //if the other allele type hasn't been processed yet, make sure there are enough entries
culpritString.add(emptyStringValue);
lodString.add(emptyFloatValue);
AS_filterString.add(emptyStringValue);
}
}
/**
 * Calculate the allele-specific filter status of vc.
 *
 * Each alt allele of the current mode's variation class is looked up in the recal data,
 * its VQSLOD/filter/culprit are recorded, and the per-allele lists are written to the
 * AS_FilterStatus, AS_VQSLOD and AS_culprit INFO fields. Alleles of the other variation
 * class carry through their previous annotations (or placeholders). The spanning deletion
 * allele (*) is never recalibrated: it keeps NaN/NA placeholders.
 *
 * @param vc the input variant
 * @param recals recal records overlapping this position
 * @param builder modified by adding the AS_* attributes (and model label flags)
 * @return a String with the site-level filter status, derived from the best allele LOD
 * @throws UserException if a non-spanning-deletion allele has no matching recal record
 */
private String doAlleleSpecificFiltering(final VariantContext vc, final List<VariantContext> recals, final VariantContextBuilder builder) {
    double bestLod = VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE;
    final List<String> culpritStrings = new ArrayList<>();
    final List<String> lodStrings = new ArrayList<>();
    final List<String> AS_filterStrings = new ArrayList<>();
    String[] prevCulpritList = null;
    String[] prevLodList = null;
    String[] prevASfiltersList = null;

    // Get VQSR annotations from a previous run of ApplyRecalibration, if one was detected.
    if (foundINDELTranches || foundSNPTranches) {
        final String prevCulprits = vc.getAttributeAsString(GATKVCFConstants.AS_CULPRIT_KEY, "");
        prevCulpritList = prevCulprits.isEmpty() ? new String[0] : prevCulprits.split(listPrintSeparator);
        final String prevLodString = vc.getAttributeAsString(GATKVCFConstants.AS_VQS_LOD_KEY, "");
        prevLodList = prevLodString.isEmpty() ? new String[0] : prevLodString.split(listPrintSeparator);
        final String prevASfilters = vc.getAttributeAsString(GATKVCFConstants.AS_FILTER_STATUS_KEY, "");
        prevASfiltersList = prevASfilters.isEmpty() ? new String[0] : prevASfilters.split(listPrintSeparator);
    }

    // Process each alternate allele in the current VariantContext.
    for (int altIndex = 0; altIndex < vc.getNAlleles() - 1; altIndex++) {
        final Allele allele = vc.getAlternateAllele(altIndex);

        // If the current allele is not part of this recalibration mode, carry its previous
        // annotations (or placeholders) through and move on to the next allele.
        if (!VariantDataManager.checkVariationClass(vc, allele, MODE)) {
            updateAnnotationsWithoutRecalibrating(altIndex, prevCulpritList, prevLodList, prevASfiltersList, culpritStrings, lodStrings, AS_filterStrings);
            continue;
        }

        // Initialize with placeholder values; these remain for the spanning deletion
        // allele, which represents missing data and is never recalibrated.
        String alleleLodString = emptyFloatValue;
        String alleleFilterString = emptyStringValue;
        String alleleCulpritString = emptyStringValue;

        if (!allele.equals(Allele.SPAN_DEL)) {
            final VariantContext recalDatum = getMatchingRecalVC(vc, recals, allele);
            if (recalDatum == null) {
                throw new UserException("Encountered input allele which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants with flag -AS. First seen at: " + vc);
            }

            // Track the best VQSLOD across this mode's alleles for site-level filtering later.
            final double lod = recalDatum.getAttributeAsDouble(GATKVCFConstants.VQS_LOD_KEY, VariantRecalibratorEngine.MIN_ACCEPTABLE_LOD_SCORE);
            if (lod > bestLod)
                bestLod = lod;

            alleleLodString = String.format("%.4f", lod);
            alleleFilterString = generateFilterString(lod);
            alleleCulpritString = recalDatum.getAttributeAsString(GATKVCFConstants.CULPRIT_KEY, ".");

            // recalDatum cannot be null here (we threw above); the redundant null
            // guard present in the original code has been removed.
            if (recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY))
                builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true);
            if (recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY))
                builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true);
        }

        // Append the per-allele VQSR annotations.
        lodStrings.add(alleleLodString);
        AS_filterStrings.add(alleleFilterString);
        culpritStrings.add(alleleCulpritString);
    }

    // Annotate the new record with AS_FilterStatus, AS_VQSLOD and AS_culprit.
    if (!AS_filterStrings.isEmpty())
        builder.attribute(GATKVCFConstants.AS_FILTER_STATUS_KEY, AnnotationUtils.encodeStringList(AS_filterStrings));
    if (!lodStrings.isEmpty())
        builder.attribute(GATKVCFConstants.AS_VQS_LOD_KEY, AnnotationUtils.encodeStringList(lodStrings));
    if (!culpritStrings.isEmpty())
        builder.attribute(GATKVCFConstants.AS_CULPRIT_KEY, AnnotationUtils.encodeStringList(culpritStrings));

    return generateFilterStringFromAlleles(vc, bestLod);
}
/**
 * Calculate the filter status for a given VariantContext using the combined data from all
 * alleles at a site (classic, non-allele-specific mode).
 *
 * @param vc the input variant
 * @param recals recal records overlapping this position
 * @param builder modified by adding VQSLOD, culprit and model-label attributes
 * @return a String with the filter status for this site
 * @throws UserException if no matching recal record exists, or its LOD is missing/unparseable
 */
private String doSiteSpecificFiltering(final VariantContext vc, final List<VariantContext> recals, final VariantContextBuilder builder) {
    final VariantContext recalDatum = getMatchingRecalVC(vc, recals, null);
    if (recalDatum == null) {
        throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc);
    }

    final String lodString = recalDatum.getAttributeAsString(GATKVCFConstants.VQS_LOD_KEY, null);
    if (lodString == null) {
        throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc);
    }

    final double lod;
    try {
        lod = Double.parseDouble(lodString); // parseDouble: same result as Double.valueOf without the boxing
    } catch (NumberFormatException e) {
        throw new UserException("Encountered a malformed record in the input recal file. The lod is unreadable for the record at: " + vc);
    }

    // Annotate the new record with its VQSLOD and the worst performing (culprit) annotation.
    builder.attribute(GATKVCFConstants.VQS_LOD_KEY, lod);
    builder.attribute(GATKVCFConstants.CULPRIT_KEY, recalDatum.getAttribute(GATKVCFConstants.CULPRIT_KEY));
    // recalDatum cannot be null here (checked above); the original's redundant null guard is removed.
    if (recalDatum.hasAttribute(GATKVCFConstants.POSITIVE_LABEL_KEY))
        builder.attribute(GATKVCFConstants.POSITIVE_LABEL_KEY, true);
    if (recalDatum.hasAttribute(GATKVCFConstants.NEGATIVE_LABEL_KEY))
        builder.attribute(GATKVCFConstants.NEGATIVE_LABEL_KEY, true);

    return generateFilterString(lod);
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce

View File

@ -58,7 +58,6 @@ import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.collections.ExpandingArrayList;
@ -85,7 +84,7 @@ public class VariantDataManager {
protected final static Logger logger = Logger.getLogger(VariantDataManager.class);
protected final List<TrainingSet> trainingSets;
private static final double SAFETY_OFFSET = 0.01; //To use for example as 1/(X + SAFETY_OFFSET) to protect against dividing or taking log of X=0.
private static final double PRECISION = 0.01; //To use mainly with MathUrils.compareDoubles(a,b,PRECISON)
private static final double PRECISION = 0.01; //To use mainly with MathUtils.compareDoubles(a,b,PRECISION)
public VariantDataManager( final List<String> annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) {
this.data = Collections.emptyList();
@ -344,7 +343,7 @@ public class VariantDataManager {
int iii = 0;
for( final String key : annotationKeys ) {
isNull[iii] = false;
annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC );
annotations[iii] = decodeAnnotation( key, vc, jitter, VRAC, datum );
if( Double.isNaN(annotations[iii]) ) { isNull[iii] = true; }
iii++;
}
@ -356,19 +355,31 @@ public class VariantDataManager {
return Math.log((x - xmin)/(xmax - x));
}
private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac ) {
private static double decodeAnnotation( final String annotationKey, final VariantContext vc, final boolean jitter, final VariantRecalibratorArgumentCollection vrac, final VariantDatum datum ) {
double value;
final double LOG_OF_TWO = 0.6931472;
try {
value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
//if we're in allele-specific mode and an allele-specific annotation has been requested, parse the appropriate value from the list
if(vrac.useASannotations && annotationKey.startsWith(GATKVCFConstants.ALLELE_SPECIFIC_PREFIX)) {
final List<Object> valueList = vc.getAttributeAsList(annotationKey);
if (vc.hasAllele(datum.alternateAllele)) {
final int altIndex = vc.getAlleleIndex(datum.alternateAllele)-1; //-1 is to convert the index from all alleles (including reference) to just alternate alleles
value = Double.parseDouble((String)valueList.get(altIndex));
}
//if somehow our alleles got mixed up
else
throw new IllegalStateException("VariantDatum allele " + datum.alternateAllele + " is not contained in the input VariantContext.");
}
else
value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
if( Double.isInfinite(value) ) { value = Double.NaN; }
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln
if( jitter && annotationKey.equalsIgnoreCase("MQ")) {
if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.HAPLOTYPE_SCORE_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.FISHER_STRAND_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_FILTER_STATUS_KEY)) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.STRAND_ODDS_RATIO_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_STRAND_ODDS_RATIO_KEY)) && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln
if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))) {
if( vrac.MQ_CAP > 0) {
value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET);
if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) {
@ -394,6 +405,8 @@ public class VariantDataManager {
for( final TrainingSet trainingSet : trainingSets ) {
for( final VariantContext trainVC : tracker.getValues(trainingSet.rodBinding, genomeLoc) ) {
if (VRAC.useASannotations && !doAllelesMatch(trainVC, datum))
continue;
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.isKnown = datum.isKnown || trainingSet.isKnown;
datum.atTruthSite = datum.atTruthSite || trainingSet.isTruth;
@ -413,6 +426,11 @@ public class VariantDataManager {
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples());
}
/**
 * Decide whether a training-set record is usable for this datum.
 * Only meaningful in allele-specific mode, where each datum represents a single allele;
 * a datum without an allele (non-AS mode) trivially matches.
 *
 * @param trainVC the training-set variant
 * @param datum the evaluation datum (may carry a single alternate allele)
 * @return true if the datum has no allele, or the training record contains it
 */
private boolean doAllelesMatch(final VariantContext trainVC, final VariantDatum datum) {
    if (datum.alternateAllele == null) {
        return true;
    }
    return trainVC.getAlternateAlleles().contains(datum.alternateAllele);
}
protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) {
switch( trainVC.getType() ) {
case SNP:
@ -436,7 +454,21 @@ public class VariantDataManager {
case BOTH:
return true;
default:
throw new ReviewedGATKException( "Encountered unknown recal mode: " + mode );
throw new IllegalStateException( "Encountered unknown recal mode: " + mode );
}
}
/**
 * Check whether a single alt allele belongs to the variation class targeted by the given mode.
 * Length comparison against the reference decides SNP vs INDEL; symbolic alleles count as
 * indels, and spanning deletions fall on the SNP side of this logic.
 *
 * @param evalVC the variant supplying the reference allele
 * @param allele the alternate allele to classify
 * @param mode the recalibration mode being run
 * @return true if the allele should be processed in this mode
 */
protected static boolean checkVariationClass( final VariantContext evalVC, final Allele allele, final VariantRecalibratorArgumentCollection.Mode mode ) {
    final int refLength = evalVC.getReference().length();
    switch( mode ) {
        case SNP:
            // Same length as the reference => SNP (spanning deletions land here too).
            return refLength == allele.length();
        case INDEL:
            return refLength != allele.length() || allele.isSymbolic();
        case BOTH:
            return true;
        default:
            throw new IllegalStateException( "Encountered unknown recal mode: " + mode );
    }
}
@ -448,9 +480,11 @@ public class VariantDataManager {
}} );
// create dummy alleles to be used
final List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));
List<Allele> alleles = Arrays.asList(Allele.create("N", true), Allele.create("<VQSR>", false));
for( final VariantDatum datum : data ) {
if (VRAC.useASannotations)
alleles = Arrays.asList(datum.referenceAllele, datum.alternateAllele); //use the alleles to distinguish between multiallelics in AS mode
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles);
builder.attribute(VCFConstants.END_KEY, datum.loc.getStop());
builder.attribute(GATKVCFConstants.VQS_LOD_KEY, String.format("%.4f", datum.lod));

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.variantrecalibration;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.GenomeLoc;
import java.io.Serializable;
@ -79,8 +80,11 @@ public class VariantDatum {
public int consensusCount;
public GenomeLoc loc;
public int worstAnnotation;
public double worstValue;
public MultivariateGaussian assignment; // used in K-means implementation
public boolean isAggregate; // this datum was provided to aid in modeling but isn't part of the input callset
public Allele referenceAllele;
public Allele alternateAllele;
public static class VariantDatumLODComparator implements Comparator<VariantDatum>, Serializable {
@Override

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.variantrecalibration;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -142,6 +143,26 @@ import Jama.Matrix;
* -rscriptFile output.plots.R
* </pre>
*
* <h3>Allele-specific usage</h3>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference.fasta \
* -input raw_variants.withASannotations.vcf \
* -AS \
* -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \
* -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \
* -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \
* -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \
* -mode SNP \
* -recalFile output.AS.recal \
* -tranchesFile output.AS.tranches \
* -rscriptFile output.plots.AS.R
* </pre>
* The input VCF must have been produced using allele-specific annotations in HaplotypeCaller.
* Note that each allele will have a separate line in the output .recal file with its own VQSLOD and culprit that will be transferred to the final VCF in ApplyRecalibration.
*
* <h3>Caveats</h3>
*
* <ul>
@ -201,14 +222,6 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true)
protected File TRANCHES_FILE;
/**
* This GATKReport gives information to describe the VQSR model fit. Normalized means for the positive model are concatenated as one table and negative model normalized means as another table.
* Covariances are also concatenated for positive and negative models, respectively. Tables of annotation means and standard deviations are provided to help describe the normalization.
* The model fit report can be read in with our R gsalib package. Individual model Gaussians can be subset by the value in the "Gaussian" column if desired.
*/
@Output(fullName="model_file", shortName = "modelFile", doc="A GATKReport containing the positive and negative model fits", required=false)
protected PrintStream modelReport = null;
/////////////////////////////
// Additional Command Line Arguments
/////////////////////////////
@ -245,6 +258,16 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false)
private File RSCRIPT_FILE = null;
/**
* This GATKReport gives information to describe the VQSR model fit. Normalized means for the positive model are concatenated as one table and negative model normalized means as another table.
* Covariances are also concatenated for positive and negative models, respectively. Tables of annotation means and standard deviations are provided to help describe the normalization.
* The model fit report can be read in with our R gsalib package. Individual model Gaussians can be subset by the value in the "Gaussian" column if desired.
*/
@Argument(fullName="output_model", shortName = "outputModel", doc="If specified, the variant recalibrator will output the VQSR model fit to the file specified by -modelFile or to stdout", required=false)
private boolean outputModel = false;
@Output(fullName="model_file", shortName = "modelFile", doc="A GATKReport containing the positive and negative model fits", required=false)
protected PrintStream modelReport = null;
@Hidden
@Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false)
protected int REPLICATE = 200;
@ -353,23 +376,16 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
for( final VariantContext vc : tracker.getValues(rods, context.getLocation()) ) {
if( vc != null && ( IGNORE_ALL_FILTERS || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()) ) ) {
if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) ) {
final VariantDatum datum = new VariantDatum();
// Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
datum.loc = ( isInput ? getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null );
datum.originalQual = vc.getPhredScaledQual();
datum.isSNP = vc.isSNP() && vc.isBiallelic();
datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc);
datum.isAggregate = !isInput;
// Loop through the training data sets and if they overlap this locus (and allele, if applicable) then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
final double priorFactor = QualityUtils.qualToProb( datum.prior );
datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );
variants.add( datum );
if( VariantDataManager.checkVariationClass( vc, VRAC.MODE ) && !VRAC.useASannotations) {
addDatum(variants, isInput, tracker, context, vc, null, null);
}
else if(VRAC.useASannotations) {
for(final Allele allele : vc.getAlternateAlleles()) {
if(allele == Allele.SPAN_DEL)
continue;
if (VariantDataManager.checkVariationClass(vc, allele, VRAC.MODE))
addDatum(variants, isInput, tracker, context, vc, vc.getReference(), allele);
}
}
}
}
@ -377,6 +393,32 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
return variants;
}
/**
 * add a datum representing a variant site (or allele) to the data in {@code variants}, which represents the callset to be recalibrated
 *
 * Called once per site in the default mode (with {@code refAllele}/{@code altAllele} both null)
 * and once per alternate allele when allele-specific annotations are in use.
 *
 * @param variants is modified by having a new VariantDatum added to it
 * @param isInput true if the VC comes from the input callset (as opposed to an aggregate resource);
 *                aggregate data aid modeling but get no location and are excluded from the output
 * @param tracker reference-ordered data used to look up overlapping training/truth resources
 * @param context the alignment context whose location is used for training-set overlap checks
 * @param vc the variant context the datum is extracted from
 * @param refAllele the reference allele, or null when not running in allele-specific mode
 * @param altAllele the single alternate allele this datum represents, or null when not in allele-specific mode
 */
private void addDatum(final ExpandingArrayList<VariantDatum> variants, final boolean isInput, final RefMetaDataTracker tracker, final AlignmentContext context, final VariantContext vc, final Allele refAllele, final Allele altAllele) {
    final VariantDatum datum = new VariantDatum();
    // Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
    // In allele-specific mode these distinguish the per-allele datums at a multiallelic site; otherwise they stay null
    datum.referenceAllele = refAllele;
    datum.alternateAllele = altAllele;
    dataManager.decodeAnnotations(datum, vc, true); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
    // aggregate (non-input) data deliberately carries no location
    datum.loc = (isInput ? getToolkit().getGenomeLocParser().createGenomeLoc(vc) : null);
    datum.originalQual = vc.getPhredScaledQual();
    datum.isSNP = vc.isSNP() && vc.isBiallelic();
    datum.isTransition = datum.isSNP && GATKVariantContextUtils.isTransition(vc);
    datum.isAggregate = !isInput;
    // Loop through the training data sets and if they overlap this locus (and allele, if applicable) then update the prior and training status appropriately
    dataManager.parseTrainingSets(tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC);
    // convert the phred-scaled prior assigned above into a log10 odds ratio for modeling
    final double priorFactor = QualityUtils.qualToProb(datum.prior);
    datum.prior = Math.log10(priorFactor) - Math.log10(1.0 - priorFactor);
    variants.add(datum);
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
@ -426,7 +468,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") );
}
if (modelReport != null) {
if (outputModel) {
GATKReport report = writeModelReport(goodModel, badModel, USE_ANNOTATIONS);
report.print(modelReport);
}

View File

@ -83,6 +83,13 @@ public class VariantRecalibratorArgumentCollection {
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ", required = true)
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
/**
* Generate a VQSR model using per-allele data instead of the default per-site data, assuming that the input VCF contains allele-specific annotations.
* Annotations should be specified using their full names with AS_ prefix. Non-allele-specific (scalar) annotations will be applied to all alleles.
*/
@Argument(fullName="useAlleleSpecificAnnotations", shortName="AS", doc="If specified, the variant recalibrator will attempt to use the allele-specific versions of the specified annotations.", required=false)
public boolean useASannotations = false;
/**
* This parameter determines the maximum number of Gaussians that should be used when building a positive model
* using the variational Bayes algorithm.

View File

@ -123,15 +123,17 @@ public class VariantRecalibratorEngine {
for( final VariantDatum datum : data ) {
int worstAnnotation = -1;
double minProb = Double.MAX_VALUE;
double worstValue = -1;
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
final Double goodProbLog10 = goodModel.evaluateDatumInOneDimension(datum, iii);
final Double badProbLog10 = badModel.evaluateDatumInOneDimension(datum, iii);
if( goodProbLog10 != null && badProbLog10 != null ) {
final double prob = goodProbLog10 - badProbLog10;
if(prob < minProb) { minProb = prob; worstAnnotation = iii; }
if(prob < minProb) { minProb = prob; worstAnnotation = iii; worstValue = datum.annotations[iii];}
}
}
datum.worstAnnotation = worstAnnotation;
datum.worstValue = worstValue;
}
}

View File

@ -346,6 +346,42 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
}
}
@Test(enabled = true)
public void testApplyRecalibrationAlleleSpecificSNPmode() {
    // Exercise ApplyRecalibration in allele-specific (-AS) SNP mode against the
    // pre-computed AS recal/tranches files; output is pinned by md5.
    final StringBuilder cmd = new StringBuilder();
    cmd.append("-R ").append(b37KGReference);
    cmd.append(" -T ApplyRecalibration");
    cmd.append(" -L 3:113005755-195507036");
    cmd.append(" -mode SNP -AS");
    cmd.append(" -ts_filter_level 99.7");
    cmd.append(" --no_cmdline_in_header");
    cmd.append(" -input ").append(privateTestDir).append("VQSR.AStest.input.vcf");
    cmd.append(" -o %s");
    cmd.append(" -tranchesFile ").append(privateTestDir).append("VQSR.AStest.snps.tranches");
    cmd.append(" -recalFile ").append(privateTestDir).append("VQSR.AStest.snps.recal");
    final WalkerTestSpec spec = new WalkerTestSpec(cmd.toString(), 1, Arrays.asList("db465aaf18b9de8b3191de63fc9f0e6e"));
    // Schedule any generated PDFs for cleanup after the JVM exits
    setPDFsForDeletion(executeTest("testApplyRecalibrationAlleleSpecificSNPmode", spec).getFirst());
}
@Test(enabled = true)
public void testApplyRecalibrationAlleleSpecificINDELmode() {
    // Exercise ApplyRecalibration in allele-specific (-AS) INDEL mode on the
    // post-SNP-filtering input; output is pinned by md5.
    final StringBuilder cmd = new StringBuilder();
    cmd.append("-R ").append(b37KGReference);
    cmd.append(" -T ApplyRecalibration");
    cmd.append(" -L 3:113005755-195507036");
    cmd.append(" -mode INDEL -AS");
    cmd.append(" -ts_filter_level 99.3");
    cmd.append(" --no_cmdline_in_header");
    cmd.append(" -input ").append(privateTestDir).append("VQSR.AStest.postSNPinput.vcf");
    cmd.append(" -o %s");
    cmd.append(" -tranchesFile ").append(privateTestDir).append("VQSR.AStest.indels.tranches");
    cmd.append(" -recalFile ").append(privateTestDir).append("VQSR.AStest.indels.recal");
    final WalkerTestSpec spec = new WalkerTestSpec(cmd.toString(), 1, Arrays.asList("7c1df91b9827a5906d30db52e96922e1"));
    // Schedule any generated PDFs for cleanup after the JVM exits
    setPDFsForDeletion(executeTest("testApplyRecalibrationAlleleSpecificINDELmode", spec).getFirst());
}
private void setPDFsForDeletion( final List<File> walkerOutputFiles ) {
for ( final File outputFile : walkerOutputFiles ) {
new File(outputFile.getAbsolutePath() + ".pdf").deleteOnExit();

View File

@ -35,6 +35,7 @@ import htsjdk.variant.variantcontext.Allele;
public final class GATKVCFConstants {
//INFO keys
public static final String ALLELE_SPECIFIC_PREFIX = "AS_";
public static final String RAW_RMS_MAPPING_QUALITY_KEY = "RAW_MQ";
public static final String AS_RMS_MAPPING_QUALITY_KEY = "AS_MQ";
public static final String AS_RAW_RMS_MAPPING_QUALITY_KEY = "AS_RAW_MQ";
@ -55,6 +56,7 @@ public final class GATKVCFConstants {
public static final String GENOTYPE_AND_VALIDATE_STATUS_KEY = "callStatus";
public static final String CLIPPING_RANK_SUM_KEY = "ClippingRankSum";
public static final String CULPRIT_KEY = "culprit";
public static final String AS_CULPRIT_KEY = "AS_culprit";
public static final String SPANNING_DELETIONS_KEY = "Dels";
public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants
public static final String DOWNSAMPLED_KEY = "DS";
@ -64,6 +66,7 @@ public final class GATKVCFConstants {
public static final String FISHER_STRAND_KEY = "FS";
public static final String AS_FISHER_STRAND_KEY = "AS_FS";
public static final String FRACTION_INFORMATIVE_READS_KEY = "FractionInformativeReads";
public static final String AS_FILTER_STATUS_KEY = "AS_FilterStatus";
public static final String AS_SB_TABLE_KEY = "AS_SB_TABLE";
public static final String GC_CONTENT_KEY = "GC";
public static final String GQ_MEAN_KEY = "GQ_MEAN";
@ -123,6 +126,7 @@ public final class GATKVCFConstants {
public static final String TUMOR_LOD_KEY = "TLOD"; //M2
public static final String VARIANT_TYPE_KEY = "VariantType";
public static final String VQS_LOD_KEY = "VQSLOD";
public static final String AS_VQS_LOD_KEY = "AS_VQSLOD";
public static final String OXOG_ALT_F1R2_KEY = "ALT_F1R2";
public static final String OXOG_ALT_F2R1_KEY = "ALT_F2R1";
public static final String OXOG_REF_F1R2_KEY = "REF_F1R2";

View File

@ -122,6 +122,7 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(AS_MATE_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "allele specific Z-score from Wilcoxon rank sum test of mate mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(AS_RAW_MATE_MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "raw data for allele specific rank sum tes of mate mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(CLIPPING_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases"));
addInfoLine(new VCFInfoHeaderLine(AS_FILTER_STATUS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "Filter status for each allele, as assessed by ApplyRecalibration. Note that the VCF filter field will reflect the most lenient/sensitive status across all alleles."));
addInfoLine(new VCFInfoHeaderLine(FISHER_STRAND_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
addInfoLine(new VCFInfoHeaderLine(AS_FISHER_STRAND_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele"));
addInfoLine(new VCFInfoHeaderLine(AS_SB_TABLE_KEY, 1, VCFHeaderLineType.String, "Allele-specific forward/reverse read counts for strand bias tests"));
@ -174,7 +175,9 @@ public class GATKVCFHeaderLines {
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_CONTIG_KEY, 1, VCFHeaderLineType.String, "Original contig name for the record"));
addInfoLine(new VCFInfoHeaderLine(ORIGINAL_START_KEY, 1, VCFHeaderLineType.Integer, "Original start position for the record"));
addInfoLine(new VCFInfoHeaderLine(VQS_LOD_KEY, 1, VCFHeaderLineType.Float, "Log odds of being a true variant versus being false under the trained gaussian mixture model"));
addInfoLine(new VCFInfoHeaderLine(AS_VQS_LOD_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, the log odds of being a true variant versus being false under the trained gaussian mixture model"));
addInfoLine(new VCFInfoHeaderLine(CULPRIT_KEY, 1, VCFHeaderLineType.String, "The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
addInfoLine(new VCFInfoHeaderLine(AS_CULPRIT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, the annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out"));
addInfoLine(new VCFInfoHeaderLine(POSITIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the positive training set of good variants"));
addInfoLine(new VCFInfoHeaderLine(NEGATIVE_LABEL_KEY, 1, VCFHeaderLineType.Flag, "This variant was used to build the negative training set of bad variants"));
addInfoLine(new VCFInfoHeaderLine(RBP_INCONSISTENT_KEY, 0, VCFHeaderLineType.Flag, "Are the reads significantly haplotype-inconsistent?"));