Clean up AlleleFrequencyCalculation classes

-- Added a true base class that only does truly common tasks (like manage call logging)
   -- This base class provides the only public method (getLog10PNonRef) and calls into a protected compute function that's abstract
   -- Split ExactAF into superclass ExactAF with common data structures and two subclasses: DiploidExact and GeneralPloidyExact
   -- Added an abstract reduceScope function that manages the simplification of the input VariantContext in the case where there are too many alleles or other constraints require us to only attempt a smaller computation
   -- All unit tests pass
This commit is contained in:
Mark DePristo 2012-09-30 17:56:54 -04:00
parent 1c52db4cdd
commit 3e01a76590
14 changed files with 348 additions and 181 deletions

View File

@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class GeneralPloidyExactAFCalculation extends ExactAFCalculation {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
@ -42,35 +42,38 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
ploidy = UAC.samplePloidy;
this.UAC = UAC;
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
@Override
protected VariantContext reduceScope(VariantContext vc) {
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
final List<Allele> alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
return builder.make();
GLs = subsetAlleles(vc, alleles, false, ploidy);
} else {
return vc;
}
}
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
return alleles;
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result);
}

View File

@ -491,15 +491,15 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
// and we repeat until queue is empty
// queue of AC conformations to process
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
final LinkedList<ExactAFCalculation.ExactACset> ACqueue = new LinkedList<ExactAFCalculation.ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset = new HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset>(likelihoodDim);
// add AC=0 to the queue
final int[] zeroCounts = new int[nAlleles];
zeroCounts[0] = numChromosomes;
AlleleFrequencyCalculationModel.ExactACset zeroSet =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
ExactAFCalculation.ExactACset zeroSet =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
final ExactAFCalculation.ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
// adjust max likelihood seen if needed
@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
int plIdx = 0;
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
while (iterator.hasNext()) {
AlleleFrequencyCalculationModel.ExactACset ACset =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
ExactAFCalculation.ExactACset ACset =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector()));
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
}
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final double maxLog10L,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts,
ExactAFCalculation.ExactACset> indexesToACset,
final ReadBackedPileup pileup) {
// compute likelihood of set
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* @param numObservations Number of observations for each allele
* @param pileup Read backed pileup in case it's necessary
*/
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// Static methods
public static void updateACset(final int[] newSetCounts,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset) {
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index);
indexesToACset.put(index, newSet);
ACqueue.add(newSet);
if (VERBOSE)

View File

@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));

View File

@ -0,0 +1,230 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
*/
public abstract class AlleleFrequencyCalculation implements Cloneable {
private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class);
public enum Model {
/** The default model with the best performance in all cases */
EXACT("ExactAFCalculation");
final String implementationName;
private Model(String implementationName) {
this.implementationName = implementationName;
}
}
protected int nSamples;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
private SimpleTimer callTimer = new SimpleTimer();
private PrintStream callReport = null;
protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter);
}
protected AlleleFrequencyCalculation(final int nSamples,
final int maxAltAlleles,
final boolean capMaxAltsForIndels,
final File exactCallsLog,
final Logger logger,
final PrintStream verboseWriter) {
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
this.nSamples = nSamples;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles;
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = capMaxAltsForIndels;
this.logger = logger == null ? defaultLogger : logger;
this.verboseWriter = verboseWriter;
if ( exactCallsLog != null )
initializeOutputFile(exactCallsLog);
}
/**
* @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult)
*
* Allocates a new results object. Useful for testing but slow in practice.
*/
public AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
}
/**
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
*
* @param vc the VariantContext holding the alleles and sample information
* @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
* @param result a pre-allocated (for efficiency) object to hold the result of the calculation
* @return result (for programming convenience)
*/
public AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
if ( result == null ) throw new IllegalArgumentException("Results object cannot be null");
final VariantContext vcWorking = reduceScope(vc);
result.setAllelesUsedInGenotyping(vcWorking.getAlleles());
callTimer.start();
computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result);
final long nanoTime = callTimer.getElapsedTimeNano();
if ( callReport != null )
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());
return result;
}
// ---------------------------------------------------------------------------
//
// Abstract methods that should be implemented by concrete implementations
// to actually calculate the AF
//
// ---------------------------------------------------------------------------
/**
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
*
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
* may decide to only genotype the best 2 of them.
*
* @param vc the initial VC provided by the caller to this AFcalculation
* @return a potentially simpler VC that's more tractable to genotype
*/
@Requires("vc != null")
@Ensures("result != null")
protected abstract VariantContext reduceScope(final VariantContext vc);
/**
* Actually carry out the log10PNonRef calculation on vc, storing results in results
*
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store results
*/
// TODO -- add consistent requires among args
protected abstract void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
*
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// ---------------------------------------------------------------------------
//
// Print information about the call to the calls log
//
// ---------------------------------------------------------------------------
private void initializeOutputFile(final File outputFile) {
try {
if (outputFile != null) {
callReport = new PrintStream( new FileOutputStream(outputFile) );
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
}
private void printCallInfo(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final double log10PosteriorOfAFzero) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for ( final Allele a : vc.getAlleles() )
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for ( final Genotype g : vc.getGenotypes() )
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
callReport.flush();
}
private void printCallElement(final VariantContext vc,
final Object variable,
final Object key,
final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
}

View File

@ -26,8 +26,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -54,6 +56,7 @@ public class AlleleFrequencyCalculationResult {
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
private List<Allele> allelesUsedInGenotyping;
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
alleleCountsOfMLE = new int[maxAltAlleles];
@ -93,13 +96,14 @@ public class AlleleFrequencyCalculationResult {
}
public void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED;
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
currentPosteriorsCacheIndex = 0;
log10PosteriorMatrixSum = null;
allelesUsedInGenotyping = null;
}
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
@ -147,4 +151,12 @@ public class AlleleFrequencyCalculationResult {
Arrays.fill(alleleCountsOfMAP, 0);
}
}
public List<Allele> getAllelesUsedInGenotyping() {
return allelesUsedInGenotyping;
}
public void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
}

View File

@ -27,98 +27,49 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private SimpleTimer callTimer = new SimpleTimer();
private PrintStream callReport = null;
public class DiploidExactAFCalculation extends ExactAFCalculation {
// private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
super(nSamples, maxAltAlleles, false, null, null, null);
}
public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
if ( UAC.exactCallsLog != null )
initializeOutputFile(UAC.exactCallsLog);
}
public void initializeOutputFile(final File outputFile) {
try {
if (outputFile != null) {
callReport = new PrintStream( new FileOutputStream(outputFile) );
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
@Override
protected VariantContext reduceScope(final VariantContext vc) {
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
VariantContextBuilder builder = new VariantContextBuilder(vc);
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
return builder.make();
} else {
return vc;
}
callTimer.start();
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
final long nanoTime = callTimer.getElapsedTimeNano();
if ( callReport != null )
printCallInfo(vc, alleles, GLs, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());
return alleles;
}
private void printCallInfo(final VariantContext vc,
final List<Allele> alleles,
final GenotypesContext GLs,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final double log10PosteriorOfAFzero) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for ( final Allele a : alleles )
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for ( final Genotype g : GLs )
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
callReport.flush();
}
private void printCallElement(final VariantContext vc, final Object variable, final Object key, final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
private static final int PL_INDEX_OF_HOM_REF = 0;

View File

@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* The model representing how we calculate a genotype given the priors and a pile
* of bases and quality scores
* Uses the Exact calculation of Heng Li
*/
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
/** The default model with the best performance in all cases */
EXACT
abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
super(UAC, nSamples, logger, verboseWriter);
}
protected int N;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
this.N = N;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.logger = logger;
this.verboseWriter = verboseWriter;
protected ExactAFCalculation(final int nSamples, int maxAltAlleles, boolean capMaxAltsForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) {
super(nSamples, maxAltAlleles, capMaxAltsForIndels, exactCallsLog, logger, verboseWriter);
}
/**
@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
return genotypeLikelihoods;
}
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store likelihoods results
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// -------------------------------------------------------------------------------------
//
// protected classes used to store exact model matrix columns

View File

@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily

View File

@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
}
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL)
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
}

View File

@ -78,7 +78,7 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
private ThreadLocal<AlleleFrequencyCalculation> afcm = new ThreadLocal<AlleleFrequencyCalculation>();
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
@ -371,7 +371,7 @@ public class UnifiedGenotyperEngine {
}
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
@ -382,7 +382,7 @@ public class UnifiedGenotyperEngine {
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
final Allele alternateAllele = vc.getAlternateAllele(i);
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele);
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;
@ -754,32 +754,34 @@ public class UnifiedGenotyperEngine {
return glcm;
}
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
List<Class<? extends AlleleFrequencyCalculation>> afClasses = new PluginManager<AlleleFrequencyCalculation>(AlleleFrequencyCalculation.class).getPlugins();
// user-specified name
String afModelName = UAC.AFmodel.name();
String afModelName = UAC.AFmodel.implementationName;
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
afModelName = GPSTRING + afModelName;
else
afModelName = "Diploid" + afModelName;
for (int i = 0; i < afClasses.size(); i++) {
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
Class<? extends AlleleFrequencyCalculation> afClass = afClasses.get(i);
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
if (afModelName.equalsIgnoreCase(key)) {
try {
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
return (AlleleFrequencyCalculationModel)c.newInstance(args);
return (AlleleFrequencyCalculation)c.newInstance(args);
}
catch (Exception e) {
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
}
}
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {

View File

@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector {
flatPriors = new double[1+2*samples.size()];
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true;

View File

@ -1,16 +1,14 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.List;
public class ExactAFCalculationModelUnitTest extends BaseTest {
@ -45,6 +43,19 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
this.numAltAlleles = numAltAlleles;
}
public VariantContext getVC() {
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
builder.genotypes(GLs);
return builder.make();
}
public List<Allele> getAlleles() {
return Arrays.asList(Allele.create("A", true),
Allele.create("C"),
Allele.create("G"),
Allele.create("T")).subList(0, numAltAlleles+1);
}
public String toString() {
return String.format("%s input=%s", super.toString(), GLs);
}
@ -83,9 +94,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
final DiploidExactAFCalculation afCalculation = new DiploidExactAFCalculation(cfg.getVC().getNSamples(), cfg.numAltAlleles);
final AlleleFrequencyCalculationResult result = afCalculation.getLog10PNonRef(cfg.getVC(), priors);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
@ -102,9 +112,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
final DiploidExactAFCalculation afCalculation = new DiploidExactAFCalculation(1, 1);
final AlleleFrequencyCalculationResult result = afCalculation.getLog10PNonRef(cfg.getVC(), priors);
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
Assert.assertEquals(calculatedAlleleCount, 6);
@ -117,9 +126,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
final DiploidExactAFCalculation afCalculation = new DiploidExactAFCalculation(2, 2);
final AlleleFrequencyCalculationResult result = afCalculation.getLog10PNonRef(cfg.getVC(), priors);
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);