Merge branch 'master' of ssh://gsa2/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
e13e61673b
|
|
@ -0,0 +1,228 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.SimpleLayout;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 10/2/12
|
||||
* Time: 10:25 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ExactAFCalculationPerformanceTest {
|
||||
final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class);
|
||||
|
||||
private static abstract class Analysis {
|
||||
final GATKReport report;
|
||||
|
||||
public Analysis(final String name, final List<String> columns) {
|
||||
report = GATKReport.newSimpleReport(name, columns);
|
||||
}
|
||||
|
||||
public abstract void run(final ExactAFCalculationTestBuilder testBuilder,
|
||||
final List<Object> coreColumns);
|
||||
|
||||
public String getName() {
|
||||
return getTable().getTableName();
|
||||
}
|
||||
|
||||
public GATKReportTable getTable() {
|
||||
return report.getTables().iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByACAndPL extends Analysis {
|
||||
public AnalyzeByACAndPL(final List<String> columns) {
|
||||
super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
int otherAC = 0;
|
||||
int nAltSeg = 0;
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
nAltSeg += ACs[i] > 0 ? 1 : 0;
|
||||
if ( i > 0 ) otherAC += ACs[i];
|
||||
}
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
|
||||
if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
|
||||
|
||||
final List<int[]> ACs = new LinkedList<int[]>();
|
||||
|
||||
if ( nAltAlleles == 1 )
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
ACs.add(new int[]{i});
|
||||
} else if ( nAltAlleles == 2 ) {
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) {
|
||||
if ( j < nChrom - i )
|
||||
ACs.add(new int[]{i, j});
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException("cannot get here");
|
||||
}
|
||||
|
||||
return ACs;
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeBySingletonPosition extends Analysis {
|
||||
public AnalyzeBySingletonPosition(final List<String> columns) {
|
||||
super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
|
||||
|
||||
for ( int position = 0; position < vc.getNSamples(); position++ ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
|
||||
Collections.rotate(genotypes, position);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByNonInformative extends Analysis {
|
||||
public AnalyzeByNonInformative(final List<String> columns) {
|
||||
super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
|
||||
}
|
||||
|
||||
public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final ExactAFCalculation calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
|
||||
final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
|
||||
|
||||
for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
genotypes.addAll(vc.getGenotypes().subList(0, nNonInformative + 1));
|
||||
genotypes.addAll(Collections.nCopies(vc.getNSamples() - nNonInformative, nonInformative));
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
logger.addAppender(new ConsoleAppender(new SimpleLayout()));
|
||||
|
||||
final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
|
||||
"exact.model", "prior.type", "runtime", "n.evaluations");
|
||||
|
||||
final PrintStream out = new PrintStream(new FileOutputStream(args[0]));
|
||||
|
||||
final boolean USE_GENERAL = false;
|
||||
final List<ExactAFCalculationTestBuilder.ModelType> modelTypes = USE_GENERAL
|
||||
? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values())
|
||||
: Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact);
|
||||
|
||||
final boolean ONLY_HUMAN_PRIORS = false;
|
||||
final List<ExactAFCalculationTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
|
||||
? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values())
|
||||
: Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human);
|
||||
|
||||
final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100;
|
||||
|
||||
final List<Analysis> analyzes = new ArrayList<Analysis>();
|
||||
analyzes.add(new AnalyzeByACAndPL(coreColumns));
|
||||
analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
|
||||
analyzes.add(new AnalyzeByNonInformative(coreColumns));
|
||||
|
||||
for ( int iteration = 0; iteration < 1; iteration++ ) {
|
||||
for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 )
|
||||
continue; // skip things that will take forever!
|
||||
|
||||
for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) {
|
||||
for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) {
|
||||
final ExactAFCalculationTestBuilder testBuilder
|
||||
= new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType);
|
||||
|
||||
for ( final Analysis analysis : analyzes ) {
|
||||
logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName())));
|
||||
final List<?> values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType);
|
||||
analysis.run(testBuilder, (List<Object>)values);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final GATKReport report = new GATKReport();
|
||||
for ( final Analysis analysis : analyzes )
|
||||
report.addTable(analysis.getTable());
|
||||
report.print(out);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,158 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ExactAFCalculationTestBuilder {
|
||||
final static Allele A = Allele.create("A", true);
|
||||
final static Allele C = Allele.create("C");
|
||||
final static Allele G = Allele.create("G");
|
||||
final static Allele T = Allele.create("T");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
|
||||
final int nSamples;
|
||||
final int numAltAlleles;
|
||||
final ModelType modelType;
|
||||
final PriorType priorType;
|
||||
|
||||
public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles,
|
||||
final ModelType modelType, final PriorType priorType) {
|
||||
this.nSamples = nSamples;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.modelType = modelType;
|
||||
this.priorType = priorType;
|
||||
}
|
||||
|
||||
public enum ModelType {
|
||||
DiploidExact,
|
||||
OptimizedDiploidExact,
|
||||
GeneralExact
|
||||
}
|
||||
|
||||
public enum PriorType {
|
||||
flat,
|
||||
human
|
||||
}
|
||||
|
||||
public int getnSamples() {
|
||||
return nSamples;
|
||||
}
|
||||
|
||||
public ExactAFCalculation makeModel() {
|
||||
switch (modelType) {
|
||||
case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4);
|
||||
case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4);
|
||||
case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
|
||||
default: throw new RuntimeException("Unexpected type " + modelType);
|
||||
}
|
||||
}
|
||||
|
||||
public double[] makePriors() {
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
|
||||
switch ( priorType ) {
|
||||
case flat:
|
||||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
}
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] ACs, final int nonTypePL) {
|
||||
final int nChrom = nSamples * 2;
|
||||
|
||||
final int[] nhet = new int[numAltAlleles];
|
||||
final int[] nhomvar = new int[numAltAlleles];
|
||||
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
final double p = ACs[i] / (1.0 * nChrom);
|
||||
nhomvar[i] = (int)Math.floor(nSamples * p * p);
|
||||
nhet[i] = ACs[i] - 2 * nhomvar[i];
|
||||
|
||||
if ( nhet[i] < 0 )
|
||||
throw new IllegalStateException("Bug!");
|
||||
}
|
||||
|
||||
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
|
||||
if ( calcAC != MathUtils.sum(ACs) )
|
||||
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
|
||||
|
||||
return makeACTest(nhet, nhomvar, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) {
|
||||
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
|
||||
|
||||
for ( int altI = 0; altI < nhet.length; altI++ ) {
|
||||
for ( int i = 0; i < nhet[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
|
||||
for ( int i = 0; i < nhomvar[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
|
||||
}
|
||||
|
||||
final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar));
|
||||
for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0));
|
||||
|
||||
samples = samples.subList(0, nSamples);
|
||||
|
||||
if ( samples.size() > nSamples )
|
||||
throw new IllegalStateException("too many samples");
|
||||
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
|
||||
vcb.genotypes(samples);
|
||||
return vcb.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
|
||||
switch (type) {
|
||||
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
|
||||
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
|
||||
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
|
||||
default: throw new IllegalArgumentException("Unexpected type " + type);
|
||||
}
|
||||
}
|
||||
|
||||
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private int numPLs() {
|
||||
return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
|
||||
}
|
||||
|
||||
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(getAlleles(type, altI));
|
||||
|
||||
final int[] pls = new int[numPLs()];
|
||||
Arrays.fill(pls, nonTypePL);
|
||||
|
||||
int index = 0;
|
||||
switch ( type ) {
|
||||
case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
|
||||
case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
|
||||
case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
|
||||
}
|
||||
pls[index] = 0;
|
||||
gb.PL(pls);
|
||||
|
||||
return gb.make();
|
||||
}
|
||||
}
|
||||
|
|
@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
public class GeneralPloidyExactAFCalculation extends ExactAFCalculation {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
ploidy = UAC.samplePloidy;
|
||||
this.UAC = UAC;
|
||||
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, false, null, null, null);
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(VariantContext vc) {
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
final List<Allele> alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
|
||||
|
||||
|
||||
GLs = subsetAlleles(vc, alleles, false, ploidy);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
||||
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
combinedPoolLikelihoods.add(set);
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
result.reset();
|
||||
result.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, result);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
|
|
@ -226,6 +230,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations();
|
||||
// compute log10Likelihoods
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);
|
||||
|
|
@ -491,15 +491,15 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
|
||||
// and we repeat until queue is empty
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue = new LinkedList<ExactAFCalculation.ExactACset>();
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
|
||||
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset = new HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset>(likelihoodDim);
|
||||
// add AC=0 to the queue
|
||||
final int[] zeroCounts = new int[nAlleles];
|
||||
zeroCounts[0] = numChromosomes;
|
||||
|
||||
AlleleFrequencyCalculationModel.ExactACset zeroSet =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
|
||||
ExactAFCalculation.ExactACset zeroSet =
|
||||
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
|
@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
|
||||
final ExactAFCalculation.ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
|
|
@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
int plIdx = 0;
|
||||
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
AlleleFrequencyCalculationModel.ExactACset ACset =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
|
||||
ExactAFCalculation.ExactACset ACset =
|
||||
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector()));
|
||||
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
|
||||
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
|
||||
|
|
@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
}
|
||||
|
||||
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
|
||||
private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final double maxLog10L,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
|
||||
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
|
||||
final HashMap<ExactAFCalculation.ExactACcounts,
|
||||
ExactAFCalculation.ExactACset> indexesToACset,
|
||||
final ReadBackedPileup pileup) {
|
||||
// compute likelihood of set
|
||||
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
|
||||
|
|
@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
* @param numObservations Number of observations for each allele
|
||||
* @param pileup Read backed pileup in case it's necessary
|
||||
*/
|
||||
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
|
||||
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
|
|
|
|||
|
|
@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
|
@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
|
|||
|
|
@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
|
|
|||
|
|
@ -0,0 +1,348 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
|
||||
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
|
||||
final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors
|
||||
final private static boolean INCLUDE_BIALLELIC = true;
|
||||
final private static boolean INCLUDE_TRIALLELIC = true;
|
||||
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
|
||||
|
||||
@BeforeSuite
public void before() {
    // Biallelic (A/C) genotypes: 3 PL entries (AA, AC, CC); 0 marks the called genotype,
    // 20 the alternatives.
    AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
    AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
    BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
    // all-zero (flat) PLs with no-call alleles carry no genotype information
    NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);

    // Triallelic (A/C/G) genotypes: 6 PL entries (AA, AC, CC, AG, CG, GG).
    AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
    AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
    BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
    AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
    BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
    CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
    NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
}
|
||||
|
||||
private Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
// One test case for the exact AF calculation: a model, a set of genotypes, and priors.
// NOTE(review): non-static inner class — each instance keeps a hidden reference to the
// enclosing test class; consider 'static' if TestDataProvider permits it.
private class GetGLsTest extends TestDataProvider {
    GenotypesContext GLs;           // genotypes fed to the calculation
    int numAltAlleles;              // number of ALT alleles in the synthetic variant
    final ExactAFCalculation calc;  // model under test
    final int[] expectedACs;        // true allele counts, indexed by allele (0 == ref)
    final double[] priors;          // log10 allele-frequency priors passed to the model
    final String priorName;         // human-readable prior label, used only in toString()

    private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
        super(GetGLsTest.class);
        GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
        this.numAltAlleles = numAltAlleles;
        this.calc = calculation;
        this.priors = priors;
        this.priorName = priorName;

        // Tally the true AC of every allele (ref included) by counting its occurrences
        // across the supplied genotypes; tests compare the model's output against these.
        expectedACs = new int[numAltAlleles+1];
        for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
            expectedACs[alleleI] = 0;
            final Allele allele = getAlleles().get(alleleI);
            for ( Genotype g : arg ) {
                expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
            }
        }
    }

    // Runs the AF calculation on the synthetic VariantContext with the configured priors.
    public AlleleFrequencyCalculationResult execute() {
        return getCalc().getLog10PNonRef(getVC(), getPriors());
    }

    public double[] getPriors() {
        return priors;
    }

    public ExactAFCalculation getCalc() {
        return calc;
    }

    // Builds a single-position VariantContext ("test" at 1:1) carrying the test genotypes.
    public VariantContext getVC() {
        VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
        builder.genotypes(GLs);
        return builder.make();
    }

    // Ref + first numAltAlleles alt alleles, in the fixed order A(ref), C, G, T.
    public List<Allele> getAlleles() {
        return Arrays.asList(Allele.create("A", true),
                Allele.create("C"),
                Allele.create("G"),
                Allele.create("T")).subList(0, numAltAlleles+1);
    }

    // Expected AC of the alleleI-th ALT allele (0-based among ALTs; +1 skips the ref slot).
    public int getExpectedAltAC(final int alleleI) {
        return expectedACs[alleleI+1];
    }

    public String toString() {
        // abbreviate large genotype sets to keep test names readable
        return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
                priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
    }
}
|
||||
|
||||
@DataProvider(name = "wellFormedGLs")
public Object[][] createSimpleGLsData() {
    // Cross product of: sample count x prior (flat/human) x model x every permutation
    // (with repetition) of the fixture genotypes, for bi- and tri-allelic sites.
    final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
    final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);

    for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
        final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
        final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
        final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);

        // Priors are indexed by allele count, so 2N+1 entries for N diploid samples
        final int nPriorValues = 2*nSamples+1;
        final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
        final double[] humanPriors = new double[nPriorValues];
        UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001);

        for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
            for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
                final String priorName = priors == humanPriors ? "human" : "flat";

                // bi-allelic
                if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
                    for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 1, genotypes, priors, priorName);

                // tri-allelic
                // The general-ploidy model with human priors is excluded until the known
                // bug is fixed (see the Guillermo_FIXME flag at the top of the class)
                if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || model != generalCalc || Guillermo_FIXME ) )
                    for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 2, genotypes, priors, priorName);
            }
        }
    }

    // GetGLsTest instances register themselves with TestDataProvider; collect them all here
    return GetGLsTest.getTests(GetGLsTest.class);
}
|
||||
|
||||
private static class NonInformativeData {
|
||||
final Genotype nonInformative;
|
||||
final List<Genotype> called;
|
||||
final int nAltAlleles;
|
||||
|
||||
private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
|
||||
this.called = called;
|
||||
this.nonInformative = nonInformative;
|
||||
this.nAltAlleles = nAltAlleles;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "GLsWithNonInformative")
public Object[][] makeGLsWithNonInformative() {
    // Each row pairs a configuration containing only informative genotypes with the same
    // configuration padded by non-informative genotypes (in every rotation of the sample
    // order), so tests can assert the padding changes nothing.
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));

    for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
        for ( final NonInformativeData testData : nonInformativeTests ) {
            // Informative genotypes followed by nNonInformative copies of the padding genotype
            final List<Genotype> samples = new ArrayList<Genotype>();
            samples.addAll(testData.called);
            samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));

            final int nSamples = samples.size();
            final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4);
            final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4);
            final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
            final double[] priors = new double[2*nSamples+1]; // flat priors

            for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) {
                final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");

                // Rotate the sample list so the non-informative genotypes appear at every
                // position, guarding against order-dependent handling
                for ( int rotation = 0; rotation < nSamples; rotation++ ) {
                    Collections.rotate(samples, 1);
                    final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
                    tests.add(new Object[]{onlyInformative, withNonInformative});
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "wellFormedGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs")
|
||||
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
|
||||
final AlleleFrequencyCalculationResult expected = onlyInformative.execute();
|
||||
final AlleleFrequencyCalculationResult actual = withNonInformative.execute();
|
||||
|
||||
testResultSimple(withNonInformative);
|
||||
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10LikelihoodOfAFzero());
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero());
|
||||
Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero());
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP());
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE());
|
||||
Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP());
|
||||
Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE());
|
||||
Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping());
|
||||
}
|
||||
|
||||
/**
 * Shared invariant checker: runs the configuration and asserts basic sanity properties
 * of the result (posteriors normalize, enough evaluations were performed, alleles are a
 * subset of the input alleles, and the MLE allele counts match the expected tallies).
 */
private void testResultSimple(final GetGLsTest cfg) {
    final AlleleFrequencyCalculationResult result = cfg.execute();

    // P(AF==0) + P(AF>0) must sum to 1 (within numerical tolerance)
    Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4);

    // The exact calculation must evaluate at least one AF value per called chromosome
    final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount();
    Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations,
            "Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations);
    Assert.assertNotNull(result.getAllelesUsedInGenotyping());
    Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");

    // The MLE allele count for every alternate allele must equal the count tallied
    // directly from the input genotypes
    for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
        int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
        int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI];

        final Allele allele = cfg.getAlleles().get(altAlleleI+1);
        Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
    }

    // TODO
    // TODO -- enable when we understand the contract between AC_MAP and pNonRef
    // TODO
    // final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP());
    // if ( AC_MAP > 0 ) {
    //     Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero());
    // } else {
    //     Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero());
    // }
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "Models")
|
||||
public void testLargeGLs(final ExactAFCalculation calc) {
|
||||
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "Models")
|
||||
public void testMismatchedGLs(final ExactAFCalculation calc) {
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000);
|
||||
final Genotype AC = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AlleleFrequencyCalculationResult result = cfg.execute();
|
||||
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
|
||||
}
|
||||
|
||||
@DataProvider(name = "Models")
|
||||
public Object[][] makeModels() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{new DiploidExactAFCalculation(2, 4)});
|
||||
tests.add(new Object[]{new OptimizedDiploidExactAFCalculation(2, 4)});
|
||||
tests.add(new Object[]{new GeneralPloidyExactAFCalculation(2, 4, 2)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
 * Sweeps the non-reference prior against a fixed het sample and checks that the
 * calculation flips from AC=0 to AC=1 exactly where prior-weighted likelihoods say
 * it should, and that the AF>0 posterior agrees with that decision.
 */
@Test(enabled = true, dataProvider = "Models")
public void testBiallelicPriors(final ExactAFCalculation model) {
    final int REF_PL = 10;
    final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);

    for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL; log10NonRefPrior += 1 ) {
        // NOTE(review): the loop variable is named log10NonRefPrior but is passed to
        // qualToErrorProb as a phred-style qual — presumably intentional, but confirm
        final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
        // Split the non-ref mass evenly over AC=1 and AC=2, then convert to log10
        final double[] priors = MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2});
        GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
        final AlleleFrequencyCalculationResult result = cfg.execute();
        final int actualAC = result.getAlleleCountsOfMAP()[0];

        // Hand-computed decision: call non-ref iff the prior-weighted het likelihood
        // beats the prior-weighted hom-ref likelihood
        final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
        final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
        final boolean expectNonRef = pRefWithPrior <= pHetWithPrior;

        if ( expectNonRef )
            Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5);
        else
            Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5);

        final int expectedAC = expectNonRef ? 1 : 0;
        Assert.assertEquals(actualAC, expectedAC,
                "actual AC with priors " + log10NonRefPrior + " not expected "
                        + expectedAC + " priors " + Utils.join(",", priors));
    }
}
|
||||
|
||||
/**
 * Tri-allelic analogue of testBiallelicPriors: sweeps the non-reference prior against
 * two samples het for different alternate alleles and checks the MAP AC for each allele
 * flips where the prior-weighted likelihoods say it should.
 *
 * Disabled: appears to expose a bug in the multi-allelic exact model (see TODOs below).
 */
@Test(enabled = false, dataProvider = "Models")
public void testTriallelicPriors(final ExactAFCalculation model) {
    // TODO
    // TODO
    // TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE
    // TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR. TALK TO ERIC AND CONFIRM
    // TODO
    // TODO
    final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB
    final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000);
    final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000);

    for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) {
        final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
        final double nonRefPrior = (1-refPrior) / 2;
        final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior});
        GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior);
        final AlleleFrequencyCalculationResult result = cfg.execute();
        final int actualAC_AB = result.getAlleleCountsOfMAP()[0];

        // Expected decision for the first alt allele (C), from sample AB's likelihoods
        final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
        final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
        final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0;
        Assert.assertEquals(actualAC_AB, expectedAC_AB,
                "actual AC with priors " + log10NonRefPrior + " not expected "
                        + expectedAC_AB + " priors " + Utils.join(",", priors));

        // Expected decision for the second alt allele (G), using a squared non-ref prior
        final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2);
        final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele;
        final int actualAC_AC = result.getAlleleCountsOfMAP()[1];
        // NOTE(review): pRef here reads AB's likelihood vector while pHet reads AC's —
        // looks like AB should be AC; confirm before re-enabling this test
        final double pRefACWithPrior = AB.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele);
        final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele);
        final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0;
        Assert.assertEquals(actualAC_AC, expectedAC_AC,
                "actual AC with priors " + log10NonRefPrior + " not expected "
                        + expectedAC_AC + " priors " + Utils.join(",", priors));
    }
}
|
||||
}
|
||||
|
|
@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
|
|||
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
|
|
|
|||
|
|
@ -1,13 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -59,4 +58,8 @@ public class StandardCallerArgumentCollection {
|
|||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
|
||||
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
|
||||
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
|
||||
// multiple contigs
|
||||
filePointer.setIsMonolithic(true);
|
||||
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
|
||||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
|
|
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
|
|
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
|
|||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
|
|
|
|||
|
|
@ -50,6 +50,14 @@ public class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*/
|
||||
private boolean isMonolithic = false;
|
||||
|
||||
public FilePointer( List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
this.isRegionUnmapped = checkUnmappedStatus();
|
||||
|
|
@ -81,7 +89,8 @@ public class FilePointer {
|
|||
}
|
||||
|
||||
private void validateLocations() {
|
||||
if ( isRegionUnmapped ) {
|
||||
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
|
||||
if ( isRegionUnmapped || isMonolithic ) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -123,6 +132,29 @@ public class FilePointer {
|
|||
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*
|
||||
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
|
||||
*/
|
||||
public boolean isMonolithic() {
|
||||
return isMonolithic;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
|
||||
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
|
||||
* FP may contain intervals from more than one contig.
|
||||
*
|
||||
* @param isMonolithic set this FP's monolithic status to this value
|
||||
*/
|
||||
public void setIsMonolithic( boolean isMonolithic ) {
|
||||
this.isMonolithic = isMonolithic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if(!(other instanceof FilePointer))
|
||||
|
|
|
|||
|
|
@ -215,19 +215,29 @@ public class ReadShard extends Shard {
|
|||
int start = Integer.MAX_VALUE;
|
||||
int stop = Integer.MIN_VALUE;
|
||||
String contig = null;
|
||||
boolean foundMapped = false;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
if ( contig != null && ! read.getReferenceName().equals(contig) )
|
||||
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
|
||||
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
|
||||
contig = read.getReferenceName();
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
|
||||
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
|
||||
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
|
||||
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
|
||||
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
|
||||
// with unmapped mates.
|
||||
if ( ! read.getReadUnmappedFlag() ) {
|
||||
foundMapped = true;
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
}
|
||||
}
|
||||
|
||||
assert contig != null;
|
||||
|
||||
if ( contig.equals("*") ) // all reads are unmapped
|
||||
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
|
||||
return GenomeLoc.UNMAPPED;
|
||||
else
|
||||
return parser.createGenomeLoc(contig, start, stop);
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
|
|||
int numReadGroups = 0;
|
||||
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
|
||||
numReadGroups += header.getReadGroups().size();
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups);
|
||||
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG);
|
||||
|
||||
recalibrationEngine = initializeRecalibrationEngine();
|
||||
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
|
||||
|
|
|
|||
|
|
@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection {
|
|||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
|
||||
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
|
||||
|
||||
public File existingRecalibrationReport = null;
|
||||
|
||||
public GATKReportTable generateReportTable(final String covariateNames) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculation implements Cloneable {
|
||||
private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class);
|
||||
|
||||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT("ExactAFCalculation");
|
||||
|
||||
final String implementationName;
|
||||
|
||||
private Model(String implementationName) {
|
||||
this.implementationName = implementationName;
|
||||
}
|
||||
}
|
||||
|
||||
protected int nSamples;
|
||||
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
private SimpleTimer callTimer = new SimpleTimer();
|
||||
private PrintStream callReport = null;
|
||||
|
||||
protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
|
||||
this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter);
|
||||
}
|
||||
|
||||
/**
 * Core constructor for all exact-AF implementations.
 *
 * @param nSamples            number of samples; zero is permitted, negative is rejected
 * @param maxAltAlleles       maximum number of alternate alleles to genotype; must be >= 1
 * @param capMaxAltsForIndels whether to cap alternate alleles for indels
 * @param exactCallsLog       if non-null, a file to which every exact call is logged
 * @param logger              logger to use; falls back to the class default when null
 * @param verboseWriter       optional stream for verbose output (may be null)
 */
protected AlleleFrequencyCalculation(final int nSamples,
                                     final int maxAltAlleles,
                                     final boolean capMaxAltsForIndels,
                                     final File exactCallsLog,
                                     final Logger logger,
                                     final PrintStream verboseWriter) {
    // BUG FIX: message previously said "greater than zero" while the check accepts zero;
    // the message now matches the actual (permissive) validation
    if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than or equal to zero, saw " + nSamples);
    if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);

    this.nSamples = nSamples;
    this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles;
    this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = capMaxAltsForIndels;
    this.logger = logger == null ? defaultLogger : logger;
    this.verboseWriter = verboseWriter;
    if ( exactCallsLog != null )
        initializeOutputFile(exactCallsLog);
}
|
||||
|
||||
/**
|
||||
* @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult)
|
||||
*
|
||||
* Allocates a new results object. Useful for testing but slow in practice.
|
||||
*/
|
||||
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
|
||||
}
|
||||
|
||||
/**
 * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
 *
 * @param vc the VariantContext holding the alleles and sample information
 * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
 * @param result a pre-allocated (for efficiency) object to hold the result of the calculation
 * @return result (for programming convenience)
 * @throws IllegalArgumentException if vc, the priors vector, or result is null
 */
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
                                                              final double[] log10AlleleFrequencyPriors,
                                                              final AlleleFrequencyCalculationResult result) {
    if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
    if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
    if ( result == null ) throw new IllegalArgumentException("Results object cannot be null");

    // reset the result, so we can store our new result there
    result.reset();

    // Let the concrete implementation simplify the VC first (e.g. drop excess alt alleles)
    final VariantContext vcWorking = reduceScope(vc);

    // Time the core computation so per-call timings can be logged below
    callTimer.start();
    computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result);
    final long nanoTime = callTimer.getElapsedTimeNano();

    // Optional per-call logging, enabled via the exactCallsLog constructor argument
    if ( callReport != null )
        printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());

    // Record which alleles were actually genotyped (possibly fewer than vc's, after reduceScope)
    result.setAllelesUsedInGenotyping(vcWorking.getAlleles());
    return result;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Abstract methods that should be implemented by concrete implementations
|
||||
// to actually calculate the AF
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
|
||||
*
|
||||
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
|
||||
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
|
||||
* may decide to only genotype the best 2 of them.
|
||||
*
|
||||
* @param vc the initial VC provided by the caller to this AFcalculation
|
||||
* @return a potentially simpler VC that's more tractable to genotype
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
@Ensures("result != null")
|
||||
protected abstract VariantContext reduceScope(final VariantContext vc);
|
||||
|
||||
/**
|
||||
* Actually carry out the log10PNonRef calculation on vc, storing results in results
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store results
|
||||
*/
|
||||
// TODO -- add consistent requires among args
|
||||
protected abstract void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
//
|
||||
// Print information about the call to the calls log
|
||||
//
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
private void initializeOutputFile(final File outputFile) {
|
||||
try {
|
||||
if (outputFile != null) {
|
||||
callReport = new PrintStream( new FileOutputStream(outputFile) );
|
||||
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
|
||||
}
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void printCallInfo(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final long runtimeNano,
|
||||
final double log10PosteriorOfAFzero) {
|
||||
printCallElement(vc, "type", "ignore", vc.getType());
|
||||
|
||||
int allelei = 0;
|
||||
for ( final Allele a : vc.getAlleles() )
|
||||
printCallElement(vc, "allele", allelei++, a.getDisplayString());
|
||||
|
||||
for ( final Genotype g : vc.getGenotypes() )
|
||||
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
|
||||
|
||||
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
|
||||
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
|
||||
|
||||
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
|
||||
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
|
||||
|
||||
callReport.flush();
|
||||
}
|
||||
|
||||
private void printCallElement(final VariantContext vc,
|
||||
final Object variable,
|
||||
final Object key,
|
||||
final Object value) {
|
||||
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
|
||||
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,9 +25,12 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -35,9 +38,10 @@ import java.util.Arrays;
|
|||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*
|
||||
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
|
||||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
|
||||
private double log10MLE;
|
||||
private double log10MAP;
|
||||
|
|
@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult {
|
|||
private double log10LikelihoodOfAFzero;
|
||||
private double log10PosteriorOfAFzero;
|
||||
|
||||
int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
|
||||
* Create a results object capability of storing results for calls with up to maxAltAlleles
|
||||
*
|
||||
* @param maxAltAlleles an integer >= 1
|
||||
*/
|
||||
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
|
||||
|
||||
alleleCountsOfMLE = new int[maxAltAlleles];
|
||||
alleleCountsOfMAP = new int[maxAltAlleles];
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 value of the probability mass at the MLE
|
||||
*
|
||||
* @return a log10 prob
|
||||
*/
|
||||
@Ensures("goodLog10Value(result)")
|
||||
public double getLog10MLE() {
|
||||
return log10MLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the log10 value of the probability mass at the max. a posterior (MAP)
|
||||
*
|
||||
* @return a log10 prob
|
||||
*/
|
||||
@Ensures("goodLog10Value(result)")
|
||||
public double getLog10MAP() {
|
||||
return log10MAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*
|
||||
* @return a vector with allele counts, not all of which may be meaningful
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MAP
|
||||
*
|
||||
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
|
||||
*
|
||||
* @return a non-null vector of ints
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
|
||||
*
|
||||
* @return the number of evaluations required to produce the answer for this AF calculation
|
||||
*/
|
||||
public int getnEvaluations() {
|
||||
return nEvaluations;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
|
||||
if ( log10PosteriorMatrixSum == null ) {
|
||||
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
|
||||
|
|
@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult {
|
|||
return log10PosteriorMatrixSum;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
/**
|
||||
* Get the list of alleles actually used in genotyping.
|
||||
*
|
||||
* Due to computational / implementation constraints this may be smaller than
|
||||
* the actual list of alleles requested
|
||||
*
|
||||
* @return a non-empty list of alleles used during genotyping
|
||||
*/
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
public List<Allele> getAllelesUsedInGenotyping() {
|
||||
if ( allelesUsedInGenotyping == null )
|
||||
throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set");
|
||||
|
||||
return allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the normalized -- across all AFs -- of AC == 0, NOT LOG10
|
||||
* @return
|
||||
*/
|
||||
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
|
||||
// TODO -- we should own these values in a more meaningful way and return good values in the case
|
||||
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
|
||||
// @Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
public double getNormalizedPosteriorOfAFzero() {
|
||||
return getNormalizedPosteriors()[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the normalized -- across all AFs -- of AC > 0, NOT LOG10
|
||||
* @return
|
||||
*/
|
||||
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
|
||||
// TODO -- we should own these values in a more meaningful way and return good values in the case
|
||||
// TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful
|
||||
//@Ensures({"result >= 0.0", "result <= 1.0"})
|
||||
public double getNormalizedPosteriorOfAFGTZero() {
|
||||
return getNormalizedPosteriors()[1];
|
||||
}
|
||||
|
||||
private double[] getNormalizedPosteriors() {
|
||||
final double[] posteriors = new double[]{ getLog10PosteriorOfAFzero(), getLog10PosteriorsMatrixSumWithoutAFzero() };
|
||||
return MathUtils.normalizeFromLog10(posteriors);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Protected mutational methods only for use within the calculation models themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
|
||||
*
|
||||
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
|
||||
*/
|
||||
protected void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED;
|
||||
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
|
||||
alleleCountsOfMLE[i] = 0;
|
||||
alleleCountsOfMAP[i] = 0;
|
||||
}
|
||||
currentPosteriorsCacheIndex = 0;
|
||||
log10PosteriorMatrixSum = null;
|
||||
allelesUsedInGenotyping = null;
|
||||
}
|
||||
|
||||
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
/**
|
||||
* Tell this result we used one more evaluation cycle
|
||||
*/
|
||||
protected void incNEvaluations() {
|
||||
nEvaluations++;
|
||||
}
|
||||
|
||||
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
|
|
@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToPosteriorsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MAP ) {
|
||||
|
|
@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
|
|
@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult {
|
|||
}
|
||||
}
|
||||
|
||||
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
|
||||
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
|
||||
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
private static boolean goodLog10Value(final double result) {
|
||||
return result <= 0.0 || Double.isInfinite(result) || Double.isNaN(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
|
||||
public class DiploidExactAFCalculation extends ExactAFCalculation {
|
||||
// private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
|
||||
super(nSamples, maxAltAlleles, false, null, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dynamically found in UnifiedGenotyperEngine
|
||||
*
|
||||
* @param UAC
|
||||
* @param N
|
||||
* @param logger
|
||||
* @param verboseWriter
|
||||
*/
|
||||
public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(final VariantContext vc) {
|
||||
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
|
||||
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
|
|
@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations(); // keep track of the number of evaluations
|
||||
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
|
@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The model representing how we calculate a genotype given the priors and a pile
|
||||
* of bases and quality scores
|
||||
* Uses the Exact calculation of Heng Li
|
||||
*/
|
||||
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
||||
|
||||
public enum Model {
|
||||
/** The default model with the best performance in all cases */
|
||||
EXACT
|
||||
abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
|
||||
protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
|
||||
super(UAC, nSamples, logger, verboseWriter);
|
||||
}
|
||||
|
||||
protected int N;
|
||||
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
|
||||
protected Logger logger;
|
||||
protected PrintStream verboseWriter;
|
||||
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
|
||||
this.N = N;
|
||||
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
|
||||
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.logger = logger;
|
||||
this.verboseWriter = verboseWriter;
|
||||
protected ExactAFCalculation(final int nSamples, int maxAltAlleles, boolean capMaxAltsForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) {
|
||||
super(nSamples, maxAltAlleles, capMaxAltsForIndels, exactCallsLog, logger, verboseWriter);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param result (pre-allocated) object to store likelihoods results
|
||||
* @return the alleles used for genotyping
|
||||
*/
|
||||
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// protected classes used to store exact model matrix columns
|
||||
|
|
@ -0,0 +1,496 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation {
|
||||
// private final static boolean DEBUG = false;
|
||||
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
|
||||
super(nSamples, maxAltAlleles, false, null, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Dynamically found in UnifiedGenotyperEngine
|
||||
*
|
||||
* @param UAC
|
||||
* @param N
|
||||
* @param logger
|
||||
* @param verboseWriter
|
||||
*/
|
||||
public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(final VariantContext vc) {
|
||||
final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
}
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
|
||||
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
|
||||
if ( alleles.alleleIndex1 != 0 )
|
||||
likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
// don't double-count it
|
||||
if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
|
||||
likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Multi-allelic implementation.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);
|
||||
|
||||
// add AC=0 to the queue
|
||||
int[] zeroCounts = new int[numAlternateAlleles];
|
||||
ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
result.incNEvaluations(); // keep track of the number of evaluations
|
||||
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
|
||||
maxLikelihoodSeen.update(log10LofKs, set.ACcounts);
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.ACcounts);
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class DependentSet {
|
||||
public final int[] ACcounts;
|
||||
public final int PLindex;
|
||||
|
||||
public DependentSet(final int[] ACcounts, final int PLindex) {
|
||||
this.ACcounts = ACcounts;
|
||||
this.PLindex = PLindex;
|
||||
}
|
||||
}
|
||||
|
||||
    /**
     * Computes the log10 likelihood of a single allele-count conformation and, unless we can
     * abort early, enqueues all of its dependent higher-frequency conformations (k+1 and k+2)
     * onto the processing queue.
     *
     * @param set                         the conformation to evaluate; its log10Likelihoods are filled in here
     * @param genotypeLikelihoods         per-sample genotype likelihood vectors (index j presumably sample j, 1-based — TODO confirm)
     * @param maxLikelihoodSeen           running maximum log10 likelihood, used for the early-stop test
     * @param numChr                      total number of chromosomes (2N for diploids)
     * @param ACqueue                     queue of conformations still to be evaluated; k+1/k+2 dependents are added here
     * @param indexesToACset              map from allele counts to their ExactACset, used to create/find dependents
     * @param log10AlleleFrequencyPriors  log10 priors indexed by allele count
     * @param result                      accumulator for MLE/MAP and AF=0 results (updated inside computeLofK)
     * @return the log10 likelihood of this conformation (last entry of set.log10Likelihoods)
     */
    private static double calculateAlleleCountConformation(final ExactACset set,
                                                           final ArrayList<double[]> genotypeLikelihoods,
                                                           final MaxLikelihoodSeen maxLikelihoodSeen,
                                                           final int numChr,
                                                           final LinkedList<ExactACset> ACqueue,
                                                           final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                           final double[] log10AlleleFrequencyPriors,
                                                           final AlleleFrequencyCalculationResult result) {

        //if ( DEBUG )
        //    System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);

        // compute the log10Likelihoods for this conformation (fills set.log10Likelihoods)
        computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);

        // the likelihood over all samples is the last entry of the per-sample DP vector
        final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // can we abort early because the log10Likelihoods are so small?
        // only allowed when this conformation's AC is below the max seen (isLowerAC),
        // otherwise higher-AC conformations might still matter — TODO confirm intent of isLowerAC
        if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
            //if ( DEBUG )
            //    System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            return log10LofK;
        }

        // iterate over higher frequencies if possible
        final int ACwiggle = numChr - set.getACsum();
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        final int numAltAlleles = set.ACcounts.getCounts().length;

        // add conformations for the k+1 case: bump each alternate allele count by one
        for ( int allele = 0; allele < numAltAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            // to get to this conformation, a sample would need to be AB (remember that ref=0)
            final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
            updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
        if ( ACwiggle > 1 ) {
            final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
            final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

            for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
                for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                    final int[] ACcountsClone = set.ACcounts.getCounts().clone();
                    ACcountsClone[allele_i]++;
                    ACcountsClone[allele_j]++;

                    // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
                    final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
                    if ( allele_i == allele_j )
                        sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
                    else
                        differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
                }
            }

            // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
            for ( DependentSet dependent : differentAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
            for ( DependentSet dependent : sameAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        return log10LofK;
    }
|
||||
|
||||
    /**
     * Adds the ExactACset represented by the given allele counts to the ACqueue if not already
     * there (creating it if needed) and pushes the dependent set's likelihood contribution into it.
     *
     * @param newSetCounts        allele counts identifying the (possibly new) conformation
     * @param numChr              total number of chromosomes; numChr/2 + 1 sizes the per-sample
     *                            likelihood vector (presumably one slot per sample count 0..N — TODO confirm)
     * @param dependentSet        the already-evaluated conformation whose data is pushed forward
     * @param PLsetIndex          PL cache index of the genotype connecting the two conformations
     * @param ACqueue             queue of conformations still to be processed
     * @param indexesToACset      registry of live conformations keyed by their allele counts
     * @param genotypeLikelihoods per-sample genotype likelihood vectors
     */
    private static void updateACset(final int[] newSetCounts,
                                    final int numChr,
                                    final ExactACset dependentSet,
                                    final int PLsetIndex,
                                    final Queue<ExactACset> ACqueue,
                                    final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                    final ArrayList<double[]> genotypeLikelihoods) {
        final ExactACcounts index = new ExactACcounts(newSetCounts);
        // lazily create and enqueue the conformation the first time it is referenced
        if ( !indexesToACset.containsKey(index) ) {
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
        }

        // push data from the dependency to the new set
        //if ( DEBUG )
        //    System.out.println(" *** pushing data from " + index + " to " + dependentSet.ACcounts);
        pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
    }
|
||||
|
||||
    /**
     * Computes the log10 likelihood L(j,k) of the conformation, finishing the per-sample
     * dynamic-programming recursion for this column and recording MLE/MAP candidates.
     *
     * Non-hom-ref contributions were already accumulated into set.log10Likelihoods by
     * pushData() calls from dependent conformations; this method adds the hom-ref (AA)
     * term and the per-sample normalization.
     *
     * @param set                         the conformation being evaluated; log10Likelihoods is updated in place
     * @param genotypeLikelihoods         per-sample genotype likelihood vectors (index j presumably sample j, 1-based — TODO confirm)
     * @param log10AlleleFrequencyPriors  log10 priors indexed by allele count
     * @param result                      accumulator for AF=0, MLE, and MAP values
     */
    private static void computeLofK(final ExactACset set,
                                    final ArrayList<double[]> genotypeLikelihoods,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {

        set.log10Likelihoods[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

        // special case for k = 0 over all k: every sample is hom-ref, so the likelihood
        // is just the running product (sum in log space) of the hom-ref GLs
        if ( totalK == 0 ) {
            for ( int j = 1; j < set.log10Likelihoods.length; j++ )
                set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];

            final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
            result.setLog10LikelihoodOfAFzero(log10Lof0);
            result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
            return;
        }

        // if we got here, then k > 0 for at least one k.
        // the non-AA possible conformations were already dealt with by pushes from dependent sets;
        // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
        for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {

            // AA term only exists when there are enough ref chromosomes: 2j - totalK >= 2
            if ( totalK < 2*j-1 ) {
                final double[] gl = genotypeLikelihoods.get(j);
                // coefficient (2j - totalK)(2j - totalK - 1) from the closed-form recursion, in log10 space
                final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
                set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
            }

            // normalize by 2j * (2j - 1), the denominator of the recursion
            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
            set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
        }

        double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // update the MLE if necessary (prior-free, so done before applying priors)
        result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);

        // apply the priors over each alternate allele
        for ( final int ACcount : set.ACcounts.getCounts() ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }
        // MAP uses the posterior (likelihood + priors)
        result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
    }
|
||||
|
||||
    /**
     * Accumulates the dependent conformation's contribution into the target conformation's
     * likelihood vector, one term of the exact-model recursion per sample.
     *
     * @param targetSet           conformation being built; log10Likelihoods updated in place
     * @param dependentSet        already-evaluated conformation one genotype "step" away
     * @param PLsetIndex          PL cache index of the genotype connecting the two conformations
     * @param genotypeLikelihoods per-sample genotype likelihood vectors
     */
    private static void pushData(final ExactACset targetSet,
                                 final ExactACset dependentSet,
                                 final int PLsetIndex,
                                 final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();

        for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {

            if ( totalK <= 2*j ) { // skip impossible conformations (more alt alleles than chromosomes in j samples)
                final double[] gl = genotypeLikelihoods.get(j);
                // closed-form coefficient for this genotype times the dependent's L(j-1) times this sample's GL
                final double conformationValue =
                        determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
                targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
            }
        }
    }
|
||||
|
||||
    /**
     * Returns the log10 combinatorial coefficient of the exact-model recursion for the genotype
     * identified by the given PL index, generalized to multiple alternate alleles.
     *
     * @param PLindex  PL cache index identifying the genotype's two alleles
     * @param j        number of samples considered so far (DP row)
     * @param ACcounts per-alternate-allele counts of the target conformation
     * @param totalK   sum of ACcounts
     * @return the coefficient in log10 space
     */
    private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {

        // the closed form representation generalized for multiple alleles is as follows:
        // AA: (2j - totalK) * (2j - totalK - 1)
        // AB: 2k_b * (2j - totalK)
        // AC: 2k_c * (2j - totalK)
        // BB: k_b * (k_b - 1)
        // BC: 2 * k_b * k_c
        // CC: k_c * (k_c - 1)

        // find the 2 alleles that are represented by this PL index
        GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);

        // *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
        // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does.   ***

        // the AX het case: coefficient 2*k_x * (2j - totalK)
        if ( alleles.alleleIndex1 == 0 )
            return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];

        final int k_i = ACcounts[alleles.alleleIndex1-1];

        // the hom var case (e.g. BB, CC, DD): coefficient k_i * (k_i - 1)
        final double coeff;
        if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
            coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
        }
        // the het non-ref case (e.g. BC, BD, CD): coefficient 2 * k_i * k_j
        else {
            final int k_j = ACcounts[alleles.alleleIndex2-1];
            coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
        }

        return coeff;
    }
|
||||
|
||||
public GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy) {
|
||||
return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
|
||||
* for the exact model calculation
|
||||
*/
|
||||
/*
|
||||
private final static class ExactACCache {
|
||||
double[] kMinus2, kMinus1, kMinus0;
|
||||
|
||||
private final static double[] create(int n) {
|
||||
return new double[n];
|
||||
}
|
||||
|
||||
public ExactACCache(int n) {
|
||||
kMinus2 = create(n);
|
||||
kMinus1 = create(n);
|
||||
kMinus0 = create(n);
|
||||
}
|
||||
|
||||
final public void rotate() {
|
||||
double[] tmp = kMinus2;
|
||||
kMinus2 = kMinus1;
|
||||
kMinus1 = kMinus0;
|
||||
kMinus0 = tmp;
|
||||
}
|
||||
|
||||
final public double[] getkMinus2() {
|
||||
return kMinus2;
|
||||
}
|
||||
|
||||
final public double[] getkMinus1() {
|
||||
return kMinus1;
|
||||
}
|
||||
|
||||
final public double[] getkMinus0() {
|
||||
return kMinus0;
|
||||
}
|
||||
}
|
||||
|
||||
public int linearExact(GenotypesContext GLs,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[][] log10AlleleFrequencyLikelihoods,
|
||||
double[][] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
final ExactACCache logY = new ExactACCache(numSamples+1);
|
||||
logY.getkMinus0()[0] = 0.0; // the zero case
|
||||
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
boolean done = false;
|
||||
int lastK = -1;
|
||||
|
||||
for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
final double[] kMinus0 = logY.getkMinus0();
|
||||
|
||||
if ( k == 0 ) { // special case for k = 0
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
|
||||
}
|
||||
} else { // k > 0
|
||||
final double[] kMinus1 = logY.getkMinus1();
|
||||
final double[] kMinus2 = logY.getkMinus2();
|
||||
|
||||
for ( int j=1; j <= numSamples; j++ ) {
|
||||
final double[] gl = genotypeLikelihoods.get(j);
|
||||
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
|
||||
|
||||
double aa = Double.NEGATIVE_INFINITY;
|
||||
double ab = Double.NEGATIVE_INFINITY;
|
||||
if (k < 2*j-1)
|
||||
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];
|
||||
|
||||
if (k < 2*j)
|
||||
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];
|
||||
|
||||
double log10Max;
|
||||
if (k > 1) {
|
||||
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
|
||||
log10Max = approximateLog10SumLog10(aa, ab, bb);
|
||||
} else {
|
||||
// we know we aren't considering the BB case, so we can use an optimized log10 function
|
||||
log10Max = approximateLog10SumLog10(aa, ab);
|
||||
}
|
||||
|
||||
// finally, update the L(j,k) value
|
||||
kMinus0[j] = log10Max - logDenominator;
|
||||
}
|
||||
}
|
||||
|
||||
// update the posteriors vector
|
||||
final double log10LofK = kMinus0[numSamples];
|
||||
log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
|
||||
log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];
|
||||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
done = true;
|
||||
}
|
||||
|
||||
logY.rotate();
|
||||
}
|
||||
|
||||
return lastK;
|
||||
}
|
||||
|
||||
final static double approximateLog10SumLog10(double a, double b, double c) {
|
||||
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
|
||||
protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT;
|
||||
|
||||
/**
|
||||
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
|
||||
|
|
@ -186,7 +186,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
|
|
@ -224,6 +223,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
uac.exactCallsLog = exactCallsLog;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
|
@ -242,5 +242,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
|
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
|
||||
}
|
||||
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL)
|
||||
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine {
|
|||
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
|
||||
|
||||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
private ThreadLocal<AlleleFrequencyCalculation> afcm = new ThreadLocal<AlleleFrequencyCalculation>();
|
||||
|
||||
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
|
||||
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
|
||||
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private final double[] log10AlleleFrequencyPriorsSNPs;
|
||||
|
|
@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine {
|
|||
if ( afcm.get() == null ) {
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
|
||||
posteriorsArray.set(new double[2]);
|
||||
}
|
||||
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
|
||||
|
||||
|
|
@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine {
|
|||
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
|
||||
}
|
||||
|
||||
AFresult.reset();
|
||||
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||
boolean bestGuessIsRef = true;
|
||||
|
|
@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine {
|
|||
myAlleles.add(vc.getReference());
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
final Allele alternateAllele = vc.getAlternateAllele(i);
|
||||
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
|
||||
final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele);
|
||||
// the genotyping model may have stripped it out
|
||||
if ( indexOfAllele == -1 )
|
||||
continue;
|
||||
|
|
@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// calculate p(f>0):
|
||||
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
|
||||
final double PofF = 1.0 - normalizedPosteriors[0];
|
||||
final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero();
|
||||
final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero();
|
||||
|
||||
double phredScaledConfidence;
|
||||
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
|
|
@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine {
|
|||
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
|
||||
// technically, at this point our confidence in a reference call isn't accurately estimated
|
||||
// because it didn't take into account samples with no data, so let's get a better estimate
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
|
||||
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
|
||||
}
|
||||
|
||||
// start constructing the resulting VC
|
||||
|
|
@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
|
||||
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
|
||||
|
||||
// *** note that calculating strand bias involves overwriting data structures, so we do that last
|
||||
final HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||
|
|
@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
|
|
@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
|
||||
AFresult.reset();
|
||||
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
|
||||
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
|
||||
|
|
@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine {
|
|||
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
|
||||
}
|
||||
|
||||
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
|
||||
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
|
||||
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
|
||||
return MathUtils.normalizeFromLog10(normalizedPosteriors);
|
||||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
|
||||
}
|
||||
|
||||
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
|
@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine {
|
|||
return glcm;
|
||||
}
|
||||
|
||||
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
|
||||
|
||||
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
|
||||
List<Class<? extends AlleleFrequencyCalculation>> afClasses = new PluginManager<AlleleFrequencyCalculation>(AlleleFrequencyCalculation.class).getPlugins();
|
||||
|
||||
// user-specified name
|
||||
String afModelName = UAC.AFmodel.name();
|
||||
String afModelName = UAC.AFmodel.implementationName;
|
||||
|
||||
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
|
||||
afModelName = GPSTRING + afModelName;
|
||||
else
|
||||
afModelName = "Diploid" + afModelName;
|
||||
|
||||
for (int i = 0; i < afClasses.size(); i++) {
|
||||
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
|
||||
Class<? extends AlleleFrequencyCalculation> afClass = afClasses.get(i);
|
||||
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
|
||||
if (afModelName.equalsIgnoreCase(key)) {
|
||||
try {
|
||||
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
|
||||
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
|
||||
|
||||
return (AlleleFrequencyCalculationModel)c.newInstance(args);
|
||||
return (AlleleFrequencyCalculation)c.newInstance(args);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
|
||||
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
|
@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector {
|
|||
flatPriors = new double[1+2*samples.size()];
|
||||
}
|
||||
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
|
||||
DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result);
|
||||
// do we want to let this qual go up or down?
|
||||
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -30,6 +32,7 @@ import java.util.*;
|
|||
* produces a binary ped file in individual major mode.
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=0,stop=100))
|
||||
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
|
@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||
boolean majorAlleleFirst = false;
|
||||
|
||||
private ValidateVariants vv = new ValidateVariants();
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
||||
private static final byte HOM_REF = 0x0;
|
||||
|
|
@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
||||
|
||||
private static final String PLINK_DELETION_MARKER = "-";
|
||||
|
||||
// note that HET and NO_CALL are flipped from the documentation: that's because
|
||||
// plink actually reads these in backwards; and we want to use a shift operator
|
||||
// to put these in the appropriate location
|
||||
|
|
@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
private List<String> famOrder = new ArrayList<String>();
|
||||
|
||||
public void initialize() {
|
||||
initializeValidator();
|
||||
writeBedHeader();
|
||||
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
|
||||
// create temporary output streams and buffers
|
||||
|
|
@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
||||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
||||
if ( tracker == null ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
|
||||
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
vv.map(tracker,ref,context);
|
||||
} catch (UserException e) {
|
||||
validateVariantSite(vc,ref,context);
|
||||
} catch (TribbleException e) {
|
||||
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||
"Please run ValidateVariants for more detailed information.");
|
||||
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||
String refOut;
|
||||
String altOut;
|
||||
String vcRef = getReferenceAllele(vc);
|
||||
String vcAlt = getAlternateAllele(vc);
|
||||
boolean altMajor;
|
||||
if ( majorAlleleFirst ) {
|
||||
// want to use the major allele as ref
|
||||
|
|
@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
||||
}
|
||||
if ( getAF(ats.get("AF")) > 0.5 ) {
|
||||
refOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altOut = vc.getReference().getBaseString();
|
||||
refOut = vcAlt;
|
||||
altOut = vcRef;
|
||||
altMajor = true;
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
refOut = vcRef;
|
||||
altOut = vcAlt;
|
||||
altMajor = false;
|
||||
}
|
||||
// write an entry into the map file
|
||||
|
|
@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
private byte getStandardEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
|
||||
b = NO_CALL;
|
||||
if ( ! checkGQIsGood(g) ) {
|
||||
b = NO_CALL;
|
||||
} else if ( g.isHomRef() ) {
|
||||
b = HOM_REF;
|
||||
} else if ( g.isHomVar() ) {
|
||||
|
|
@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
if ( genotype.hasGQ() ) {
|
||||
return genotype.getGQ() >= minGenotypeQuality;
|
||||
} else if ( genotype.hasLikelihoods() ) {
|
||||
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
|
||||
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
|
||||
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void initializeValidator() {
|
||||
vv.variantCollection = variantCollection;
|
||||
vv.dbsnp = dbsnp;
|
||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||
vv.type = ValidateVariants.ValidationType.REF;
|
||||
}
|
||||
|
||||
private void writeBedHeader() {
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
|
|
@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
return metaValues;
|
||||
}
|
||||
|
||||
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
|
||||
final Allele reportedRefAllele = vc.getReference();
|
||||
final int refLength = reportedRefAllele.length();
|
||||
if ( refLength > 100 ) {
|
||||
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
|
||||
return;
|
||||
}
|
||||
|
||||
final byte[] observedRefBases = new byte[refLength];
|
||||
System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
|
||||
final Allele observedRefAllele = Allele.create(observedRefBases);
|
||||
vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
|
||||
vc.validateAlternateAlleles();
|
||||
}
|
||||
|
||||
private String getReferenceAllele(VariantContext vc) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
|
||||
return "1";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getReference().getBaseString().substring(1);
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getReference().getBaseString();
|
||||
}
|
||||
|
||||
private String getAlternateAllele(VariantContext vc ) {
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
// bi-allelic. Want to take the standard representation and strip off the leading base.
|
||||
return vc.getAlternateAllele(0).getBaseString().substring(1);
|
||||
}
|
||||
if ( vc.isSymbolic() ) {
|
||||
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
|
||||
return "2";
|
||||
}
|
||||
if ( vc.isSimpleDeletion() ) {
|
||||
// bi-allelic, so we just have "-" for ped output
|
||||
return PLINK_DELETION_MARKER;
|
||||
}
|
||||
// snp or mnp
|
||||
return vc.getAlternateAllele(0).getBaseString();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -236,6 +236,33 @@ public class Utils {
|
|||
}
|
||||
}
|
||||
|
||||
public static <T> List<T> append(final List<T> left, T ... elts) {
|
||||
final List<T> l = new LinkedList<T>(left);
|
||||
l.addAll(Arrays.asList(elts));
|
||||
return l;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the values in joined by separator, such as A,B,C
|
||||
*
|
||||
* @param separator
|
||||
* @param doubles
|
||||
* @return
|
||||
*/
|
||||
public static String join(String separator, double[] doubles) {
|
||||
if ( doubles == null || doubles.length == 0)
|
||||
return "";
|
||||
else {
|
||||
StringBuilder ret = new StringBuilder();
|
||||
ret.append(doubles[0]);
|
||||
for (int i = 1; i < doubles.length; ++i) {
|
||||
ret.append(separator);
|
||||
ret.append(doubles[i]);
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
|
||||
* elti objects (note there's no actual space between sep and the elti elements). Returns
|
||||
|
|
|
|||
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.collections;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put())
|
||||
* to the provided output stream. For testing/debugging purposes.
|
||||
*
|
||||
* Log entries are of the following form (fields are tab-separated):
|
||||
* LABEL VALUE KEY1 KEY2 ... KEY_N
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {
|
||||
|
||||
private PrintStream log;
|
||||
private String logEntryLabel;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param log output stream to which to log update operations
|
||||
* @param logEntryLabel String that should be prefixed to each log entry
|
||||
* @param dimensions
|
||||
*/
|
||||
public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) {
|
||||
super(dimensions);
|
||||
|
||||
if ( log == null ) {
|
||||
throw new ReviewedStingException("Log output stream must not be null");
|
||||
}
|
||||
this.log = log;
|
||||
this.logEntryLabel = logEntryLabel != null ? logEntryLabel : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void put( final T value, final int... keys ) {
|
||||
super.put(value, keys);
|
||||
|
||||
StringBuilder logEntry = new StringBuilder();
|
||||
|
||||
logEntry.append(logEntryLabel);
|
||||
logEntry.append("\t");
|
||||
logEntry.append(value);
|
||||
for ( int key : keys ) {
|
||||
logEntry.append("\t");
|
||||
logEntry.append(key);
|
||||
}
|
||||
|
||||
// PrintStream methods all use synchronized blocks internally, so our logging is thread-safe
|
||||
log.println(logEntry.toString());
|
||||
}
|
||||
}
|
||||
|
|
@ -25,9 +25,12 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
||||
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Utility class to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
|
|
@ -52,19 +55,31 @@ public class RecalibrationTables {
|
|||
private final NestedIntegerArray[] tables;
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates) {
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1);
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final PrintStream log) {
|
||||
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) {
|
||||
this(covariates, numReadGroups, null);
|
||||
}
|
||||
|
||||
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) {
|
||||
tables = new NestedIntegerArray[covariates.length];
|
||||
|
||||
final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1;
|
||||
final int eventDimension = EventType.values().length;
|
||||
|
||||
tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension);
|
||||
tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension);
|
||||
tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, "READ_GROUP_TABLE", numReadGroups, eventDimension);
|
||||
tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension);
|
||||
for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++)
|
||||
tables[i] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
|
||||
tables[i] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) :
|
||||
new LoggingNestedIntegerArray<RecalDatum>(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1),
|
||||
numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
|
||||
}
|
||||
|
||||
public NestedIntegerArray<RecalDatum> getReadGroupTable() {
|
||||
|
|
|
|||
|
|
@ -1,127 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
public class ExactAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static final int numSamples = 3;
|
||||
static double[] priors = new double[2*numSamples+1]; // flat priors
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
AA1 = new double[]{0.0, -20.0, -20.0};
|
||||
AB1 = new double[]{-20.0, 0.0, -20.0};
|
||||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
|
||||
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls) {
|
||||
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic case
|
||||
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
|
||||
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
|
||||
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
|
||||
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
|
||||
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
|
||||
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
|
||||
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
|
||||
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
|
||||
|
||||
// tri-allelic case
|
||||
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
|
||||
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
|
||||
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
|
||||
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
|
||||
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeGLs() {
|
||||
|
||||
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMismatchedGLs() {
|
||||
|
||||
final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
|
||||
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
|
||||
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
|
||||
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
|
||||
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
|
||||
}
|
||||
}
|
||||
|
|
@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterAllConfident() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
|
||||
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
|
|||
|
|
@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878HighGQ() {
|
||||
String testName = "testNA12878HighGQ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVCFMismatchReference() {
|
||||
String testName = "testVCFMismatchReference";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
|
||||
3,
|
||||
UserException.class
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000GWithIndels() {
|
||||
String testName = "test1000GWithIndels";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000G_Symbolic() {
|
||||
String testName = "test1000G_Symbolic";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
|
||||
3,
|
||||
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCEUTrio() {
|
||||
String testName = "testCEUTrio";
|
||||
|
|
@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
|
|||
private def createQueueHeader() : Seq[String] = {
|
||||
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
|
||||
"Copyright (c) 2012 The Broad Institute",
|
||||
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
"For support and documentation go to http://www.broadinstitute.org/gatk")
|
||||
}
|
||||
|
||||
private def getQueueVersion : String = {
|
||||
|
|
|
|||
Loading…
Reference in New Issue