diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java index 238ad814d..3c9da90d2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/GenotypeCalculationArgumentCollection.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.engine.arguments; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; @@ -113,7 +114,7 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. @@ -124,6 +125,15 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{ @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 6; + /** + * Determines the maximum number of PL values that will be logged in the output. If the number of genotypes + * (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument, + * then output of all of the PL values will be suppressed. + */ + @Advanced + @Argument(fullName = "max_num_PL_values", shortName = "maxNumPLValues", doc = "Maximum number of PL values to output", required = false) + public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT; + /** * By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model, * see e.g. Waterson (1975) or Tajima (1996). diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 451b49ab5..a5e5d8a87 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -56,22 +56,22 @@ import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.genotyper.SampleList; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; @@ -85,7 +85,7 @@ import java.util.*; */ public abstract class GenotypingEngine { - protected final AFCalculatorProvider afCalculatorProvider ; + protected final AFCalculatorProvider afCalculatorProvider; protected Logger logger; @@ -104,6 +104,9 @@ public abstract class GenotypingEngine getFilteredAndStratifiedContexts(final ReferenceContext refContext, diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java index e82c4666e..fa91a2f2b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculator.java @@ -53,11 +53,12 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.SimpleTimer; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.SimpleTimer; import java.io.File; import java.util.List; @@ -67,10 +68,13 @@ import java.util.List; * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods */ public abstract class AFCalculator implements Cloneable { - private final static Logger defaultLogger = Logger.getLogger(AFCalculator.class); - + private static final Logger defaultLogger = Logger.getLogger(AFCalculator.class); + public static final int MAX_NUM_PL_VALUES_DEFAULT = 100; protected Logger logger = defaultLogger; + protected int maxNumPLValues = MAX_NUM_PL_VALUES_DEFAULT; // if PL vectors longer than this # of elements, don't log them + protected static int maxNumPLValuesObserved = 0; + protected static long numTimesMaxNumPLValuesExceeded = 0; private SimpleTimer callTimer = new SimpleTimer(); private StateTracker stateTracker; @@ -102,10 +106,19 @@ public abstract class AFCalculator implements Cloneable { * * @param logger */ - public void setLogger(Logger logger) { + public void setLogger(final Logger logger) { this.logger = logger; } + /** + * Set the maximum number of PL values to log. If the number of PL values exceeds this, no PL values will be logged. + * @param maxNumPLValues maximum number of PL values to log + */ + public AFCalculator setMaxNumPLValues(final int maxNumPLValues) { + this.maxNumPLValues = maxNumPLValues; + return this; + } + /** * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc * @@ -242,4 +255,50 @@ public abstract class AFCalculator implements Cloneable { return getStateTracker(false,allele + 1).getAlleleCountsOfMAP()[allele]; } + /** + * Strips PLs from the specified GenotypeBuilder if their number exceeds the maximum allowed. Corresponding counters are updated. + * @param gb the GenotypeBuilder to modify + * @param vc the VariantContext + * @param sampleName the sample name + * @param newLikelihoods the PL array + */ + protected void removePLsIfMaxNumPLValuesExceeded(final GenotypeBuilder gb, final VariantContext vc, final String sampleName, final double[] newLikelihoods) { + final int numPLValuesFound = newLikelihoods.length; + if (numPLValuesFound > maxNumPLValues) { + logMaxNumPLValuesWarning(vc, sampleName, numPLValuesFound); + numTimesMaxNumPLValuesExceeded++; + gb.noPL(); + if (numPLValuesFound > maxNumPLValuesObserved) { + maxNumPLValuesObserved = numPLValuesFound; + } + } + } + + private void logMaxNumPLValuesWarning(final VariantContext vc, final String sampleName, final int numPLValuesFound) { + final String message = String.format("Maximum allowed number of PLs (%d) exceeded for sample %s at %s:%d-%d with %d possible genotypes. " + + "No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) " + + "unless the --max_num_PL_values argument is increased accordingly", + maxNumPLValues, sampleName, vc.getContig(), vc.getStart(), vc.getEnd(), numPLValuesFound); + + if ( numTimesMaxNumPLValuesExceeded == 0 ) { + logger.warn(message + ". Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed."); + } else { + logger.debug(message); + } + } + + /** + * Logs the number of times the maximum allowed number of PLs was exceeded and the largest number of PLs observed. The corresponding counters are reset. + */ + public void printFinalMaxNumPLValuesWarning() { + if ( numTimesMaxNumPLValuesExceeded > 0 ) { + final String message = String.format("Maximum allowed number of PLs (%d) was exceeded %d time(s); the largest number of PLs found was %d. " + + "No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) " + + "unless the --max_num_PL_values argument is increased accordingly", + maxNumPLValues, numTimesMaxNumPLValuesExceeded, maxNumPLValuesObserved); + logger.warn(message); + } + maxNumPLValuesObserved = 0; + numTimesMaxNumPLValuesExceeded = 0; + } } \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java index 9a888f934..4791cb6c3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/ExactAFCalculator.java @@ -51,10 +51,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; +import htsjdk.variant.variantcontext.*; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; import java.util.*; @@ -65,8 +65,8 @@ abstract class ExactAFCalculator extends AFCalculator { protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - // useful so that we don't keep printing out the same warning message - protected static boolean printedWarning = false; + // useful so that we don't keep printing out the same warning messages + protected static boolean printedMaxAltAllelesWarning = false; /** * Sorts {@link ExactAFCalculator.LikelihoodSum} instances where those with higher likelihood are first. @@ -168,15 +168,14 @@ abstract class ExactAFCalculator extends AFCalculator { if (altAlleleReduction == 0) return vc; - String message = "this tool is currently set to genotype at most " + maximumAlternativeAlleles - + " alternate alleles in a given context, but the context at " + vc.getContig() + ":" + vc.getStart() - + " has " + (vc.getAlternateAlleles().size()) - + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"; + final String message = String.format("This tool is currently set to genotype at most %d " + + "alternate alleles in a given context, but the context at %s: %d has %d " + + "alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument", + maximumAlternativeAlleles, vc.getContig(), vc.getStart(), vc.getAlternateAlleles().size()); - if ( !printedWarning ) { - printedWarning = true; - message += ". This warning message is output just once per run and further warnings will be suppressed unless the DEBUG logging level is used."; - logger.warn(message); + if ( !printedMaxAltAllelesWarning ) { + printedMaxAltAllelesWarning = true; + logger.warn(message + ". Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed."); } else { logger.debug(message); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java index 08f378f48..1c58191df 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java @@ -71,6 +71,8 @@ public class FixedAFCalculatorProvider extends AFCalculatorProvider { private final int maximumAltAlleleCount; + private final int maximumNumPLValues; + private final int ploidy; /** @@ -125,17 +127,18 @@ public class FixedAFCalculatorProvider extends AFCalculatorProvider { final Logger logger, final boolean verifyRequests) { if (configuration == null) - throw new IllegalArgumentException("null configuration"); - if (configuration == null) - throw new IllegalArgumentException("null configuration genotype arguments"); + throw new IllegalArgumentException("null genotype-arguments configuration"); if (configuration.samplePloidy < 1) throw new IllegalArgumentException("invalid sample ploidy " + configuration.samplePloidy); if (configuration.MAX_ALTERNATE_ALLELES < 0) throw new IllegalArgumentException("invalid maximum number of alleles " + (configuration.MAX_ALTERNATE_ALLELES + 1)); + if (configuration.MAX_NUM_PL_VALUES < 0) + throw new IllegalArgumentException("invalid maximum number of PL values " + configuration.MAX_NUM_PL_VALUES); ploidy = configuration.samplePloidy; maximumAltAlleleCount = configuration.MAX_ALTERNATE_ALLELES; - singleton = AFCalculatorImplementation.bestValue(ploidy,maximumAltAlleleCount,preferred).newInstance(); + maximumNumPLValues = configuration.MAX_NUM_PL_VALUES; + singleton = AFCalculatorImplementation.bestValue(ploidy,maximumAltAlleleCount,preferred).newInstance().setMaxNumPLValues(maximumNumPLValues); singleton.setLogger(logger); this.verifyRequests = verifyRequests; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java index 886e56958..88537bc44 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculator.java @@ -58,15 +58,16 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalcula import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import java.util.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { - static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them - protected GeneralPloidyExactAFCalculator() { } @@ -491,6 +492,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { final boolean assignGenotypes, final double[] newLikelihoods) { final GenotypeBuilder gb = new GenotypeBuilder(g); + final String sampleName = g.getSampleName(); // add likelihoods gb.PL(newLikelihoods); @@ -500,7 +502,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { if (newSACs != null) gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, newSACs); if (assignGenotypes) - assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + assignGenotype(gb, vc, sampleName, newLikelihoods, allelesToUse, ploidy); else gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); @@ -528,13 +530,18 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { } /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * Assign genotypes (GTs) to the samples in the VariantContext greedily based on the PLs * + * @param gb the GenotypeBuilder to modify + * @param vc the VariantContext + * @param sampleName the sample name * @param newLikelihoods the PL array * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) * @param numChromosomes Number of chromosomes per pool */ private void assignGenotype(final GenotypeBuilder gb, + final VariantContext vc, + final String sampleName, final double[] newLikelihoods, final List allelesToUse, final int numChromosomes) { @@ -547,13 +554,10 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator { gb.alleles(alleleCounts.asAlleleList(allelesToUse)); - // remove PLs if necessary - if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) - gb.noPL(); + removePLsIfMaxNumPLValuesExceeded(gb, vc, sampleName, newLikelihoods); // TODO - deprecated so what is the appropriate method to call? if ( numNewAltAlleles > 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); } - } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java index 46118d108..5eae11025 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java @@ -75,8 +75,6 @@ import java.util.*; */ public class IndependentAllelesExactAFCalculator extends ExactAFCalculator { - private static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them - /** * Array that caches the allele list that corresponds to the ith ploidy. * @@ -460,78 +458,83 @@ public class IndependentAllelesExactAFCalculator extends ExactAFCalculator { @Override @Requires("vc != null && allelesToUse != null") public GenotypesContext subsetAlleles(VariantContext vc, int defaultPloidy, List allelesToUse, boolean assignGenotypes) { - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int numNewAltAlleles = allelesToUse.size() - 1; + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; - // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); - final int declaredPloidy = g.getPloidy(); - final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; - if ( !g.hasLikelihoods() ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy))); - continue; - } - - // create the new likelihoods array from the alleles we are allowed to use - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - double[] newLikelihoods; - - // Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization - // and subsetting - if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { - newLikelihoods = originalLikelihoods; - } else { - newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - // if there is no mass on the (new) likelihoods, then just no-call the sample - if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { - newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy))); - } - else { - final GenotypeBuilder gb = new GenotypeBuilder(g); - - if ( numNewAltAlleles == 0 ) - gb.noPL(); - else - gb.PL(newLikelihoods); - - // if we weren't asked to assign a genotype, then just no-call the sample - if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) - gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); - else - assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); - newGTs.add(gb.make()); - } + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + final int declaredPloidy = g.getPloidy(); + final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy))); + continue; } - return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + + // Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization + // and subsetting + if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy))); + } else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + final String sampleName = g.getSampleName(); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); + else + assignGenotype(gb, vc, sampleName, newLikelihoods, allelesToUse, ploidy); + newGTs.add(gb.make()); + } + } + + return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); } /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * Assign genotypes (GTs) to the samples in the VariantContext greedily based on the PLs * + * @param gb the GenotypeBuilder to modify + * @param vc the VariantContext + * @param sampleName the sample name * @param newLikelihoods the PL array * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) * @param numChromosomes Number of chromosomes per pool */ private void assignGenotype(final GenotypeBuilder gb, + final VariantContext vc, + final String sampleName, final double[] newLikelihoods, final List allelesToUse, final int numChromosomes) { @@ -544,9 +547,7 @@ public class IndependentAllelesExactAFCalculator extends ExactAFCalculator { gb.alleles(alleleCounts.asAlleleList(allelesToUse)); - // remove PLs if necessary - if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) - gb.noPL(); + removePLsIfMaxNumPLValuesExceeded(gb, vc, sampleName, newLikelihoods); if ( numNewAltAlleles > 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index f85a254a8..c7cc4cecc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -55,30 +55,24 @@ import com.google.java.contract.Ensures; import htsjdk.samtools.SAMFileWriter; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFStandardHeaderLines; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; +import org.broadinstitute.gatk.engine.filters.BadMateFilter; import org.broadinstitute.gatk.engine.io.DirectOutputTracker; import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation; -import org.broadinstitute.gatk.utils.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.utils.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.utils.downsampling.DownsampleType; -import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils; -import org.broadinstitute.gatk.engine.filters.BadMateFilter; -import org.broadinstitute.gatk.utils.genotyper.*; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.utils.haplotypeBAMWriter.DroppedReadsTracker; -import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; @@ -91,21 +85,33 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.fragments.FragmentCollection; import org.broadinstitute.gatk.utils.fragments.FragmentUtils; +import org.broadinstitute.gatk.utils.genotyper.*; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.gvcf.GVCFWriter; import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotypeBAMWriter.DroppedReadsTracker; import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.broadinstitute.gatk.utils.variant.*; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants; import java.io.File; import java.io.FileNotFoundException; @@ -1154,6 +1160,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public void onTraversalDone(Integer result) { + genotypingEngine.printFinalMaxNumPLValuesWarning(); if ( HCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index d828a1c20..02699eb6a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -51,13 +51,18 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -352,4 +357,60 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec); } + @Test + public void testHaplotypeCallerMaxNumPLValues() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", + b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("7ea93210277fa4b590790a81c4b3994b")); + spec.disableShadowBCF(); + executeTest("testHaplotypeCallerMaxNumPLValues", spec); + } + + @Test + public void testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel() throws IOException { + // Need to see log WARN messages + final Level level = logger.getLevel(); + logger.setLevel(Level.WARN); + + final File logFile = createTempFile("testMaxNumPLValuesExceededWithWarnLogLevel.log", ".tmp"); + final String logFileName = logFile.getAbsolutePath(); + + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", + b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", + GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9d69cb9dc67e0d0ee9863767428e6841")); + spec.disableShadowBCF(); + executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec); + // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log + Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10097101-10097101 with 35 possible genotypes. " + + "No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) unless the --max_num_PL_values argument is increased accordingly. " + + "Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed.")); + Assert.assertFalse(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10316239-10316241 with 70 possible genotypes.")); + Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) was exceeded 2 time(s); the largest number of PLs found was 70.")); + // Set the log level back + logger.setLevel(level); + } + + @Test + public void testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel() throws IOException { + // Need to see log DEBUG messages + final Level level = logger.getLevel(); + logger.setLevel(Level.DEBUG); + + final File logFile = createTempFile("testMaxNumPLValuesExceededWithDebugLogLevel.log", ".tmp"); + final String logFileName = logFile.getAbsolutePath(); + + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s", + b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", + GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9d69cb9dc67e0d0ee9863767428e6841")); + spec.disableShadowBCF(); + executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec); + // Make sure the "Maximum allowed number of PLs exceeded" messages are in the log + Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10097101-10097101 with 35 possible genotypes.")); + Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10316239-10316241 with 70 possible genotypes.")); + Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) was exceeded 2 time(s); the largest number of PLs found was 70.")); + // Set the log level back + logger.setLevel(level); + } }