Allow GenotypeGVCFs to emit ref sites.

This commit is contained in:
David Benjamin 2015-11-09 12:48:40 -05:00
parent 2904572fc5
commit aecaa6d38e
8 changed files with 87 additions and 66 deletions

View File

@ -244,8 +244,11 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
// Adding 0.0 removes -0.0 occurrences.
final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0;
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()) {
//skip this if we are already looking at a vc with a NON_REF allele i.e. if we are in GenotypeGVCFs
if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission()
&& outputAlternativeAlleles.alleles[0] != GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
final double[] AFpriors = getAlleleFrequencyPriors(vc, defaultPloidy, model);
@ -336,7 +339,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
/**
* Provided the exact mode computations it returns the appropiate subset of alleles that progress to genotyping.
* Provided the exact mode computations it returns the appropriate subset of alleles that progress to genotyping.
* @param afcr the exact model calculation result.
* @return never {@code null}.
*/
@ -350,8 +353,11 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
boolean siteIsMonomorphic = true;
for (final Allele alternativeAllele : alleles) {
if (alternativeAllele.isReference()) continue;
// we want to keep the NON_REF symbolic allele but only in the absence of a non-symbolic allele, e.g.
// if we combined a ref / NON_REF gVCF with a ref / alt gVCF
final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && alternativeAllele == GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE;
final boolean isPlausible = afcr.isPolymorphicPhredScaledQual(alternativeAllele, configuration.genotypeArgs.STANDARD_CONFIDENCE_FOR_EMITTING);
final boolean toOutput = isPlausible || forceKeepAllele(alternativeAllele);
final boolean toOutput = isPlausible || forceKeepAllele(alternativeAllele) || isNonRefWhichIsLoneAltAllele;
siteIsMonomorphic &= ! isPlausible;
if (!toOutput) continue;

View File

@ -377,7 +377,8 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
@Override
protected boolean forceKeepAllele(final Allele allele) {
return configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES || configuration.annotateAllSitesWithPLs;
return configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES
|| configuration.annotateAllSitesWithPLs;
}
@Override

View File

@ -52,6 +52,7 @@
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
@ -72,7 +73,7 @@ public abstract class DiploidExactAFCalculator extends ExactAFCalculator {
final double[] log10AlleleFrequencyPriors, final StateTracker stateTracker) {
final int numAlternateAlleles = vc.getNAlleles() - 1;
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true, vc.hasAllele(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE));
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;

View File

@ -125,11 +125,22 @@ abstract class ExactAFCalculator extends AFCalculator {
}
/**
* Unpack GenotypesContext into arraylist of doubel values
* Unpack GenotypesContext into arraylist of double values
* @param GLs Input genotype context
* @return ArrayList of doubles corresponding to GL vectors
*/
protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy) {
return getGLs(GLs, includeDummy, false);
}
/**
* Unpack GenotypesContext into arraylist of double values
* @param GLs Input genotype context
* @param keepUninformative Don't filter out uninformative genotype likelihoods (i.e. all log likelihoods near 0)
* This is useful for VariantContexts with a NON_REF allele
* @return ArrayList of doubles corresponding to GL vectors
*/
protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy, final boolean keepUninformative) {
final ArrayList<double[]> genotypeLikelihoods = new ArrayList<>(GLs.size() + 1);
if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
@ -137,7 +148,7 @@ abstract class ExactAFCalculator extends AFCalculator {
if ( sample.hasLikelihoods() ) {
final double[] gls = sample.getLikelihoods().getAsVector();
if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL )
if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL || keepUninformative )
genotypeLikelihoods.add(gls);
}
}

View File

@ -68,6 +68,7 @@ import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.gatk.tools.walkers.genotyper.UnifiedGenotypingEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider;
@ -132,7 +133,6 @@ import java.util.*;
@Reference(window=@Window(start=-10,stop=10))
@SuppressWarnings("unused")
public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWriter> implements AnnotatorCompatible, TreeReducible<VariantContextWriter> {
/**
* The gVCF files to merge together
*/
@ -212,15 +212,10 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
final GenomeAnalysisEngine toolkit = getToolkit();
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(toolkit, variants);
final GATKVariantContextUtils.GenotypeMergeType mergeType;
if(uniquifySamples) {
mergeType = GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY;
}
else
mergeType = GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE;
final GATKVariantContextUtils.GenotypeMergeType mergeType = uniquifySamples ?
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY : GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE;
final SampleList samples = new IndexedSampleList(SampleUtils.getSampleList(vcfRods, mergeType));
// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, Collections.<String>emptyList(), this, toolkit);
// create the genotyping engine
@ -255,15 +250,18 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
logger.info("Notice that the -ploidy parameter is ignored in " + getClass().getSimpleName() + " tool as this is automatically determined by the input variant files");
}
// get VariantContexts from input gVCFs, merge, and regenotype
public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return null;
final GenomeLoc loc = ref.getLocus();
final VariantContext combinedVC = ReferenceConfidenceVariantContextMerger.merge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null, true, uniquifySamples, annotationEngine);
if ( combinedVC == null )
return null;
return regenotypeVC(tracker, ref, combinedVC);
final List<VariantContext> vcsAtThisLocus = tracker.getPrioritizedValue(variants, loc);
final Byte refBase = INCLUDE_NON_VARIANTS ? ref.getBase() : null;
final boolean removeNonRefSymbolicAllele = !INCLUDE_NON_VARIANTS;
final VariantContext combinedVC = ReferenceConfidenceVariantContextMerger.merge(vcsAtThisLocus, loc,
refBase, removeNonRefSymbolicAllele, uniquifySamples, annotationEngine);
return combinedVC == null ? null : regenotypeVC(tracker, ref, combinedVC);
}
/**
@ -275,65 +273,67 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
* @return a new VariantContext or null if the site turned monomorphic and we don't want such sites
*/
protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext originalVC) {
if ( originalVC == null ) throw new IllegalArgumentException("originalVC cannot be null");
VariantContext rawResult = originalVC;
// only re-genotype polymorphic sites
if ( rawResult.isVariant() ) {
VariantContext regenotypedVC = genotypingEngine.calculateGenotypes(rawResult);
if ( ! isProperlyPolymorphic(regenotypedVC) ) {
if (!INCLUDE_NON_VARIANTS)
return null;
}
else {
rawResult = addGenotypingAnnotations(rawResult.getAttributes(), regenotypedVC);
}
if ( originalVC == null ) {
throw new IllegalArgumentException("originalVC cannot be null");
} else if (!isProperlyPolymorphic(originalVC) && !INCLUDE_NON_VARIANTS) {
return null;
}
VariantContext result = originalVC;
//don't need to calculate quals for sites with no data whatsoever
if (result.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) {
result = genotypingEngine.calculateGenotypes(originalVC);
}
if (result == null || (!isProperlyPolymorphic(result) && !INCLUDE_NON_VARIANTS)) {
return null;
}
result = addGenotypingAnnotations(originalVC.getAttributes(), result);
//At this point we should already have DP and AD annotated
VariantContext result = annotationEngine.finalizeAnnotations(rawResult, originalVC);
result = annotationEngine.finalizeAnnotations(result, originalVC);
//do trimming after allele-specific annotation reduction or the mapping is difficult
result = GATKVariantContextUtils.reverseTrimAlleles(result);
// if it turned monomorphic then we either need to ignore or fix such sites
boolean createRefGTs = false;
if ( result.isMonomorphicInSamples() ) {
if ( !INCLUDE_NON_VARIANTS )
return null;
createRefGTs = true;
}
// Re-annotate and fix/remove some of the original annotations.
// Note that the order of these actions matters and is different for polymorphic and monomorphic sites.
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
if ( createRefGTs ) {
if (result.isPolymorphicInSamples()) {
result = annotationEngine.annotateContext(tracker, ref, null, result);
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
} else if (INCLUDE_NON_VARIANTS) {
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
result = annotationEngine.annotateContext(tracker, ref, null, result);
} else {
result = annotationEngine.annotateContext(tracker, ref, null, result);
result = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, false)).make();
return null;
}
return result;
}
/**
* Determines whether the provided VariantContext has real alternate alleles
* Determines whether the provided VariantContext has real alternate alleles.
*
* There is a bit of a hack to handle the <NON-REF> case because it is not defined in htsjdk.Allele
* We check for this as a biallelic symbolic allele.
*
* @param vc the VariantContext to evaluate
* @return true if it has proper alternate alleles, false otherwise
*/
private boolean isProperlyPolymorphic(final VariantContext vc) {
return ( vc != null &&
!vc.getAlternateAlleles().isEmpty() &&
(!vc.isBiallelic() ||
(!vc.getAlternateAllele(0).equals(Allele.SPAN_DEL) &&
!vc.getAlternateAllele(0).equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED))
)
);
//obvious cases
if (vc == null || vc.getAlternateAlleles().isEmpty()) {
return false;
} else if (vc.isBiallelic()) {
return !(vc.getAlternateAllele(0).equals(Allele.SPAN_DEL) ||
vc.getAlternateAllele(0).equals(GATKVCFConstants.SPANNING_DELETION_SYMBOLIC_ALLELE_DEPRECATED) ||
vc.isSymbolic());
} else {
return true;
}
}
/**
@ -435,6 +435,9 @@ public class GenotypeGVCFs extends RodWalker<VariantContext, VariantContextWrite
private UnifiedArgumentCollection createUAC() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.genotypeArgs = genotypeArgs.clone();
//whether to emit non-variant sites is not contained in genotypeArgs and must be passed to uac separately
uac.outputMode = INCLUDE_NON_VARIANTS ? OutputMode.EMIT_ALL_CONFIDENT_SITES : OutputMode.EMIT_VARIANTS_ONLY;
return uac;
}

View File

@ -71,7 +71,6 @@ import java.util.Collections;
import java.util.List;
public class GenotypeGVCFsIntegrationTest extends WalkerTest {
private static String baseTestString(String args, String ref) {
return "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + ref + args;
}
@ -108,9 +107,9 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
" -L 20:10,000,000-11,000,000", b37KGReference),
1,
Arrays.asList("63bdb33fe44b6589adc5c36b0ea740b2"));
Arrays.asList("24ea3dd1f13b6636cf51aea7d5a4ce06"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
@ -146,7 +145,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference),
1,
Arrays.asList("8b338d065806f7c7eea67f56a1f6009e"));
Arrays.asList("65497a0711a33d131a165c9cfc8bc3cf"));
executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec);
}
@ -159,7 +158,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-20,000,000", b37KGReference),
1,
Arrays.asList("2d4e6a3193c493514576a758e891b951"));
Arrays.asList("70dcdc1a111ebd048d32e7e61a9b7052"));
executeTest("combineSingleSamplePipelineGVCFHierarchical", spec);
}
@ -171,7 +170,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference),
1,
Arrays.asList("7693207e925359df331e64664c5b8763"));
Arrays.asList("f911428f0f3bfba9b1d96a6b5ace3dee"));
executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec);
}
@ -277,7 +276,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" +
" --uniquifySamples", b37KGReference),
1,
Arrays.asList("2492d55f0e688d01ef2d67f675c0e2d9"));
Arrays.asList("1c552a9d76a1bbba4b92a94532f54a1a"));
executeTest("testUniquifiedSamples", spec);
}
@ -558,7 +557,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
" -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" +
" -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames),
1,
Arrays.asList("75f6402da0f6b8b4e69c847fe8b5179a"));
Arrays.asList("6505d305441b4e6ff975a40ef5d352b5"));
executeTest("testSetZeroRGQsToNoCall", spec);
}
@ -616,7 +615,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest {
public void testMonomorphicVCwithAlt() {
final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -G AS_Standard -o %s --no_cmdline_in_header --disableDithering -V "
+ privateTestDir + "monomorphicGVCwithAlt.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("080951cdb5d4903dd58b1e753b9378d5"));
final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Collections.singletonList("ddf0a386c007b797fce3eb4ddc204216"));
spec.disableShadowBCF();
executeTest("testAlleleSpecificAnnotations", spec);
}

View File

@ -482,7 +482,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
{ privateTestDir+"forHardLeftAlignVariantsTest.vcf", "", "5e81af1825aa207b0a352f5eeb5db700"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT", "339cca608ff18a355abc629bca448043"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env", "3e8e2ebbc576ceee717a7ce80e23dd35"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "8650d66b2199a4f8ce0acc660b2091cd"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -trimAlternates", "2cbf4c8c991777254145aacf19cba508"},
{ privateTestDir+"multi-allelic-ordering.vcf", "-sn SAMPLE-CC -sn SAMPLE-CT -env -trimAlternates", "14538e17d5aca22c655c42e130f8cebc"}
};
}
@ -770,7 +770,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testfile + " -o %s --no_cmdline_in_header -sn NA12891 -trimAlternates",
1,
Arrays.asList("7aecb079b16448f0377b6b03069b2994"));
Arrays.asList("a8c23f4d6f93806a34d432dd2c7a0449"));
spec.disableShadowBCF();
executeTest("testSACDiploid", spec);
}

View File

@ -866,7 +866,7 @@ public class GATKVariantContextUtils {
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) )
if ( newLikelihoods == null || (originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0) == 0 && likelihoodsAreUninformative(newLikelihoods) ))
gb.noPL();
else
gb.PL(newLikelihoods);