Merge pull request #1326 from broadinstitute/sl_issue_1293

Added maxNumPLValues argument to allow users to set maximum number of PL values in output.
This commit is contained in:
samuelklee 2016-04-28 10:52:27 -04:00
commit aa0c76a166
10 changed files with 281 additions and 120 deletions

View File

@ -51,6 +51,7 @@
package org.broadinstitute.gatk.engine.arguments;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.utils.commandline.Advanced;
import org.broadinstitute.gatk.utils.commandline.Argument;
import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
@ -113,7 +114,7 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
@ -124,6 +125,15 @@ public class GenotypeCalculationArgumentCollection implements Cloneable{
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 6;
/**
* Determines the maximum number of PL values that will be logged in the output. If the number of genotypes
* (which is determined by the ploidy and the number of alleles) exceeds the value provided by this argument,
* then output of all of the PL values will be suppressed.
*/
@Advanced
@Argument(fullName = "max_num_PL_values", shortName = "maxNumPLValues", doc = "Maximum number of PL values to output", required = false)
public int MAX_NUM_PL_VALUES = AFCalculator.MAX_NUM_PL_VALUES_DEFAULT;
/**
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
* see e.g. Waterson (1975) or Tajima (1996).

View File

@ -56,22 +56,22 @@ import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
@ -85,7 +85,7 @@ import java.util.*;
*/
public abstract class GenotypingEngine<Config extends StandardCallerArgumentCollection> {
protected final AFCalculatorProvider afCalculatorProvider ;
protected final AFCalculatorProvider afCalculatorProvider;
protected Logger logger;
@ -104,6 +104,9 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
protected final GenomeLocParser genomeLocParser;
protected static int maxNumPLValuesObserved = 0;
protected static int numTimesMaxNumPLValuesExceeded = 0;
/**
* Construct a new genotyper engine, on a specific subset of samples.
*
@ -226,7 +229,8 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
final int defaultPloidy = configuration.genotypeArgs.samplePloidy;
final int maxAltAlleles = configuration.genotypeArgs.MAX_ALTERNATE_ALLELES;
final AFCalculator afCalculator = afCalculatorProvider.getInstance(vc,defaultPloidy,maxAltAlleles);
final int maxNumPLValues = configuration.genotypeArgs.MAX_NUM_PL_VALUES;
final AFCalculator afCalculator = afCalculatorProvider.getInstance(vc,defaultPloidy,maxAltAlleles).setMaxNumPLValues(maxNumPLValues);
final AFCalculationResult AFresult = afCalculator.getLog10PNonRef(vc, defaultPloidy,maxAltAlleles, getAlleleFrequencyPriors(vc,defaultPloidy,model));
final OutputAlleleSubset outputAlternativeAlleles = calculateOutputAlleleSubset(AFresult);
@ -730,4 +734,15 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
return 1.0 - Math.pow(10.0, normalizedLog10ACeq0Posterior);
}
}
/**
* Logs the number of times the maximum allowed number of PLs was exceeded and the largest number of PLs observed. The corresponding counters are reset.
*/
public void printFinalMaxNumPLValuesWarning() {
final int defaultPloidy = configuration.genotypeArgs.samplePloidy;
final int maxAltAlleles = configuration.genotypeArgs.MAX_ALTERNATE_ALLELES;
final int maxNumPLValues = configuration.genotypeArgs.MAX_NUM_PL_VALUES;
final AFCalculator afCalculator = afCalculatorProvider.getInstance(defaultPloidy,maxAltAlleles).setMaxNumPLValues(maxNumPLValues);
afCalculator.printFinalMaxNumPLValuesWarning();
}
}

View File

@ -56,22 +56,22 @@ import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.gatk.utils.BaseUtils;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.baq.BAQ;
import org.broadinstitute.gatk.utils.classloader.PluginManager;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.genotyper.SampleList;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.io.PrintStream;
@ -470,7 +470,9 @@ public class UnifiedGenotypingEngine extends GenotypingEngine<UnifiedArgumentCol
allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
final int defaultPloidy = configuration.genotypeArgs.samplePloidy;
final int maxAltAlleles = configuration.genotypeArgs.MAX_ALTERNATE_ALLELES;
return afCalculatorProvider.getInstance(vc, defaultPloidy, maxAltAlleles).getLog10PNonRef(vc, defaultPloidy, maxAltAlleles, getAlleleFrequencyPriors(vc,defaultPloidy,model));
final int maxNumPLValues = configuration.genotypeArgs.MAX_NUM_PL_VALUES;
return afCalculatorProvider.getInstance(vc, defaultPloidy, maxAltAlleles).setMaxNumPLValues(maxNumPLValues)
.getLog10PNonRef(vc, defaultPloidy, maxAltAlleles, getAlleleFrequencyPriors(vc,defaultPloidy,model));
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(final ReferenceContext refContext,

View File

@ -53,11 +53,12 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.SimpleTimer;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.SimpleTimer;
import java.io.File;
import java.util.List;
@ -67,10 +68,13 @@ import java.util.List;
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
*/
public abstract class AFCalculator implements Cloneable {
private final static Logger defaultLogger = Logger.getLogger(AFCalculator.class);
private static final Logger defaultLogger = Logger.getLogger(AFCalculator.class);
public static final int MAX_NUM_PL_VALUES_DEFAULT = 100;
protected Logger logger = defaultLogger;
protected int maxNumPLValues = MAX_NUM_PL_VALUES_DEFAULT; // if PL vectors longer than this # of elements, don't log them
protected static int maxNumPLValuesObserved = 0;
protected static long numTimesMaxNumPLValuesExceeded = 0;
private SimpleTimer callTimer = new SimpleTimer();
private StateTracker stateTracker;
@ -102,10 +106,19 @@ public abstract class AFCalculator implements Cloneable {
*
* @param logger
*/
public void setLogger(Logger logger) {
public void setLogger(final Logger logger) {
this.logger = logger;
}
/**
* Set the maximum number of PL values to log. If the number of PL values exceeds this, no PL values will be logged.
* @param maxNumPLValues maximum number of PL values to log
*/
public AFCalculator setMaxNumPLValues(final int maxNumPLValues) {
this.maxNumPLValues = maxNumPLValues;
return this;
}
/**
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
*
@ -242,4 +255,50 @@ public abstract class AFCalculator implements Cloneable {
return getStateTracker(false,allele + 1).getAlleleCountsOfMAP()[allele];
}
/**
* Strips PLs from the specified GenotypeBuilder if their number exceeds the maximum allowed. Corresponding counters are updated.
* @param gb the GenotypeBuilder to modify
* @param vc the VariantContext
* @param sampleName the sample name
* @param newLikelihoods the PL array
*/
protected void removePLsIfMaxNumPLValuesExceeded(final GenotypeBuilder gb, final VariantContext vc, final String sampleName, final double[] newLikelihoods) {
final int numPLValuesFound = newLikelihoods.length;
if (numPLValuesFound > maxNumPLValues) {
logMaxNumPLValuesWarning(vc, sampleName, numPLValuesFound);
numTimesMaxNumPLValuesExceeded++;
gb.noPL();
if (numPLValuesFound > maxNumPLValuesObserved) {
maxNumPLValuesObserved = numPLValuesFound;
}
}
}
private void logMaxNumPLValuesWarning(final VariantContext vc, final String sampleName, final int numPLValuesFound) {
final String message = String.format("Maximum allowed number of PLs (%d) exceeded for sample %s at %s:%d-%d with %d possible genotypes. " +
"No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) " +
"unless the --max_num_PL_values argument is increased accordingly",
maxNumPLValues, sampleName, vc.getContig(), vc.getStart(), vc.getEnd(), numPLValuesFound);
if ( numTimesMaxNumPLValuesExceeded == 0 ) {
logger.warn(message + ". Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed.");
} else {
logger.debug(message);
}
}
/**
* Logs the number of times the maximum allowed number of PLs was exceeded and the largest number of PLs observed. The corresponding counters are reset.
*/
public void printFinalMaxNumPLValuesWarning() {
if ( numTimesMaxNumPLValuesExceeded > 0 ) {
final String message = String.format("Maximum allowed number of PLs (%d) was exceeded %d time(s); the largest number of PLs found was %d. " +
"No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) " +
"unless the --max_num_PL_values argument is increased accordingly",
maxNumPLValues, numTimesMaxNumPLValuesExceeded, maxNumPLValuesObserved);
logger.warn(message);
}
maxNumPLValuesObserved = 0;
numTimesMaxNumPLValuesExceeded = 0;
}
}

View File

@ -51,10 +51,10 @@
package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc;
import htsjdk.variant.variantcontext.*;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.*;
import java.util.*;
@ -65,8 +65,8 @@ abstract class ExactAFCalculator extends AFCalculator {
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
// useful so that we don't keep printing out the same warning message
protected static boolean printedWarning = false;
// useful so that we don't keep printing out the same warning messages
protected static boolean printedMaxAltAllelesWarning = false;
/**
* Sorts {@link ExactAFCalculator.LikelihoodSum} instances where those with higher likelihood are first.
@ -168,15 +168,14 @@ abstract class ExactAFCalculator extends AFCalculator {
if (altAlleleReduction == 0)
return vc;
String message = "this tool is currently set to genotype at most " + maximumAlternativeAlleles
+ " alternate alleles in a given context, but the context at " + vc.getContig() + ":" + vc.getStart()
+ " has " + (vc.getAlternateAlleles().size())
+ " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument";
final String message = String.format("This tool is currently set to genotype at most %d " +
"alternate alleles in a given context, but the context at %s: %d has %d " +
"alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument",
maximumAlternativeAlleles, vc.getContig(), vc.getStart(), vc.getAlternateAlleles().size());
if ( !printedWarning ) {
printedWarning = true;
message += ". This warning message is output just once per run and further warnings will be suppressed unless the DEBUG logging level is used.";
logger.warn(message);
if ( !printedMaxAltAllelesWarning ) {
printedMaxAltAllelesWarning = true;
logger.warn(message + ". Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed.");
} else {
logger.debug(message);
}

View File

@ -71,6 +71,8 @@ public class FixedAFCalculatorProvider extends AFCalculatorProvider {
private final int maximumAltAlleleCount;
private final int maximumNumPLValues;
private final int ploidy;
/**
@ -125,17 +127,18 @@ public class FixedAFCalculatorProvider extends AFCalculatorProvider {
final Logger logger, final boolean verifyRequests) {
if (configuration == null)
throw new IllegalArgumentException("null configuration");
if (configuration == null)
throw new IllegalArgumentException("null configuration genotype arguments");
throw new IllegalArgumentException("null genotype-arguments configuration");
if (configuration.samplePloidy < 1)
throw new IllegalArgumentException("invalid sample ploidy " + configuration.samplePloidy);
if (configuration.MAX_ALTERNATE_ALLELES < 0)
throw new IllegalArgumentException("invalid maximum number of alleles " + (configuration.MAX_ALTERNATE_ALLELES + 1));
if (configuration.MAX_NUM_PL_VALUES < 0)
throw new IllegalArgumentException("invalid maximum number of PL values " + configuration.MAX_NUM_PL_VALUES);
ploidy = configuration.samplePloidy;
maximumAltAlleleCount = configuration.MAX_ALTERNATE_ALLELES;
singleton = AFCalculatorImplementation.bestValue(ploidy,maximumAltAlleleCount,preferred).newInstance();
maximumNumPLValues = configuration.MAX_NUM_PL_VALUES;
singleton = AFCalculatorImplementation.bestValue(ploidy,maximumAltAlleleCount,preferred).newInstance().setMaxNumPLValues(maximumNumPLValues);
singleton.setLogger(logger);
this.verifyRequests = verifyRequests;
}

View File

@ -58,15 +58,16 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalcula
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import java.util.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
public class GeneralPloidyExactAFCalculator extends ExactAFCalculator {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them
protected GeneralPloidyExactAFCalculator() {
}
@ -491,6 +492,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator {
final boolean assignGenotypes, final double[] newLikelihoods) {
final GenotypeBuilder gb = new GenotypeBuilder(g);
final String sampleName = g.getSampleName();
// add likelihoods
gb.PL(newLikelihoods);
@ -500,7 +502,7 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator {
if (newSACs != null)
gb.attribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY, newSACs);
if (assignGenotypes)
assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
assignGenotype(gb, vc, sampleName, newLikelihoods, allelesToUse, ploidy);
else
gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy));
@ -528,13 +530,18 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator {
}
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
* Assign genotypes (GTs) to the samples in the VariantContext greedily based on the PLs
*
* @param gb the GenotypeBuilder to modify
* @param vc the VariantContext
* @param sampleName the sample name
* @param newLikelihoods the PL array
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
* @param numChromosomes Number of chromosomes per pool
*/
private void assignGenotype(final GenotypeBuilder gb,
final VariantContext vc,
final String sampleName,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
@ -547,13 +554,10 @@ public class GeneralPloidyExactAFCalculator extends ExactAFCalculator {
gb.alleles(alleleCounts.asAlleleList(allelesToUse));
// remove PLs if necessary
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
gb.noPL();
removePLsIfMaxNumPLValuesExceeded(gb, vc, sampleName, newLikelihoods);
// TODO - deprecated so what is the appropriate method to call?
if ( numNewAltAlleles > 0 )
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
}
}

View File

@ -75,8 +75,6 @@ import java.util.*;
*/
public class IndependentAllelesExactAFCalculator extends ExactAFCalculator {
private static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them
/**
* Array that caches the allele list that corresponds to the ith ploidy.
*
@ -460,78 +458,83 @@ public class IndependentAllelesExactAFCalculator extends ExactAFCalculator {
@Override
@Requires("vc != null && allelesToUse != null")
public GenotypesContext subsetAlleles(VariantContext vc, int defaultPloidy, List<Allele> allelesToUse, boolean assignGenotypes) {
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// samples
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// samples
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int numNewAltAlleles = allelesToUse.size() - 1;
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int numNewAltAlleles = allelesToUse.size() - 1;
// create the new genotypes
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
final int declaredPloidy = g.getPloidy();
final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy;
if ( !g.hasLikelihoods() ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy)));
continue;
}
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
// and subsetting
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
newLikelihoods = originalLikelihoods;
} else {
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy)));
}
else {
final GenotypeBuilder gb = new GenotypeBuilder(g);
if ( numNewAltAlleles == 0 )
gb.noPL();
else
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL )
gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy));
else
assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
newGTs.add(gb.make());
}
// create the new genotypes
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
final int declaredPloidy = g.getPloidy();
final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy;
if ( !g.hasLikelihoods() ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy)));
continue;
}
return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse);
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
// and subsetting
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
newLikelihoods = originalLikelihoods;
} else {
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy)));
} else {
final GenotypeBuilder gb = new GenotypeBuilder(g);
final String sampleName = g.getSampleName();
if ( numNewAltAlleles == 0 )
gb.noPL();
else
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL )
gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy));
else
assignGenotype(gb, vc, sampleName, newLikelihoods, allelesToUse, ploidy);
newGTs.add(gb.make());
}
}
return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse);
}
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
* Assign genotypes (GTs) to the samples in the VariantContext greedily based on the PLs
*
* @param gb the GenotypeBuilder to modify
* @param vc the VariantContext
* @param sampleName the sample name
* @param newLikelihoods the PL array
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
* @param numChromosomes Number of chromosomes per pool
*/
private void assignGenotype(final GenotypeBuilder gb,
final VariantContext vc,
final String sampleName,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
@ -544,9 +547,7 @@ public class IndependentAllelesExactAFCalculator extends ExactAFCalculator {
gb.alleles(alleleCounts.asAlleleList(allelesToUse));
// remove PLs if necessary
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
gb.noPL();
removePLsIfMaxNumPLValuesExceeded(gb, vc, sampleName, newLikelihoods);
if ( numNewAltAlleles > 0 )
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));

View File

@ -55,30 +55,24 @@ import com.google.java.contract.Ensures;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
import org.broadinstitute.gatk.engine.CommandLineGATK;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
import org.broadinstitute.gatk.engine.filters.BadMateFilter;
import org.broadinstitute.gatk.engine.io.DirectOutputTracker;
import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.gatk.utils.downsampling.DownsampleType;
import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils;
import org.broadinstitute.gatk.engine.filters.BadMateFilter;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub;
import org.broadinstitute.gatk.utils.haplotypeBAMWriter.DroppedReadsTracker;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.engine.iterators.ReadTransformer;
import org.broadinstitute.gatk.engine.walkers.*;
import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation;
import org.broadinstitute.gatk.tools.walkers.genotyper.*;
import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
@ -91,21 +85,33 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState;
import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.gatk.utils.downsampling.DownsampleType;
import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.gatk.utils.fragments.FragmentCollection;
import org.broadinstitute.gatk.utils.fragments.FragmentUtils;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.gatk.utils.gvcf.GVCFWriter;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.haplotypeBAMWriter.DroppedReadsTracker;
import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
import org.broadinstitute.gatk.utils.help.HelpConstants;
import org.broadinstitute.gatk.utils.pairhmm.PairHMM;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils;
import org.broadinstitute.gatk.utils.variant.*;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
import java.io.File;
import java.io.FileNotFoundException;
@ -1154,6 +1160,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Override
public void onTraversalDone(Integer result) {
genotypingEngine.printFinalMaxNumPLValuesWarning();
if ( HCAC.emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
referenceConfidenceModel.close();
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed

View File

@ -51,13 +51,18 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Level;
import org.broadinstitute.gatk.engine.GATKVCFUtils;
import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -352,4 +357,60 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec);
}
@Test
public void testHaplotypeCallerMaxNumPLValues() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("7ea93210277fa4b590790a81c4b3994b"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValues", spec);
}
@Test
public void testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel() throws IOException {
// Need to see log WARN messages
final Level level = logger.getLevel();
logger.setLevel(Level.WARN);
final File logFile = createTempFile("testMaxNumPLValuesExceededWithWarnLogLevel.log", ".tmp");
final String logFileName = logFile.getAbsolutePath();
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9d69cb9dc67e0d0ee9863767428e6841"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithWarnLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10097101-10097101 with 35 possible genotypes. " +
"No PLs will be output for these genotypes (which may cause incorrect results in subsequent analyses) unless the --max_num_PL_values argument is increased accordingly. " +
"Unless the DEBUG logging level is used, this warning message is output just once per run and further warnings are suppressed."));
Assert.assertFalse(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10316239-10316241 with 70 possible genotypes."));
Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) was exceeded 2 time(s); the largest number of PLs found was 70."));
// Set the log level back
logger.setLevel(level);
}
@Test
public void testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel() throws IOException {
// Need to see log DEBUG messages
final Level level = logger.getLevel();
logger.setLevel(Level.DEBUG);
final File logFile = createTempFile("testMaxNumPLValuesExceededWithDebugLogLevel.log", ".tmp");
final String logFileName = logFile.getAbsolutePath();
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 30 -log %s",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals",
GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER, logFileName);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9d69cb9dc67e0d0ee9863767428e6841"));
spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValuesExceededWithDebugLogLevel", spec);
// Make sure the "Maximum allowed number of PLs exceeded" messages are in the log
Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10097101-10097101 with 35 possible genotypes."));
Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) exceeded for sample NA12878 at 20:10316239-10316241 with 70 possible genotypes."));
Assert.assertTrue(FileUtils.readFileToString(logFile).contains("Maximum allowed number of PLs (30) was exceeded 2 time(s); the largest number of PLs found was 70."));
// Set the log level back
logger.setLevel(level);
}
}