Merge branch 'master' of ssh://gsa2/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-10-04 10:54:23 -04:00
commit e13e61673b
33 changed files with 2086 additions and 329 deletions

View File

@ -0,0 +1,228 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
/**
 * Standalone performance harness for the exact allele-frequency (AF) calculation models.
 * Runs several analyses (by AC conformation, by singleton position, by number of
 * non-informative genotypes) over varying sample counts, alt-allele counts, priors,
 * and model implementations, and writes per-invocation timing results as a GATKReport
 * to the file named by the first command-line argument.
 */
public class ExactAFCalculationPerformanceTest {
    final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class);

    /**
     * One named performance analysis.  Each analysis owns a single-table GATKReport
     * and appends one row of measurements per exact-AF-calculation invocation.
     */
    private static abstract class Analysis {
        final GATKReport report;

        public Analysis(final String name, final List<String> columns) {
            report = GATKReport.newSimpleReport(name, columns);
        }

        /**
         * Execute this analysis, appending measurement rows to the report.
         *
         * @param testBuilder supplies the AF model, priors, and synthetic VariantContexts
         * @param coreColumns values for the shared leading columns of every report row
         */
        public abstract void run(final ExactAFCalculationTestBuilder testBuilder,
                                 final List<Object> coreColumns);

        public String getName() {
            return getTable().getTableName();
        }

        public GATKReportTable getTable() {
            // newSimpleReport creates exactly one table
            return report.getTables().iterator().next();
        }
    }

    /**
     * Measures runtime as a function of the AC conformation and the magnitude of
     * the non-genotype PL values.
     */
    private static class AnalyzeByACAndPL extends Analysis {
        public AnalyzeByACAndPL(final List<String> columns) {
            super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
        }

        public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) {
                final ExactAFCalculation calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                for ( final int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples * 2) ) {
                    final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL);

                    timer.start();
                    final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors);
                    final long runtime = timer.getElapsedTimeNano();

                    // summarize the conformation: # of segregating alt alleles, and the
                    // total AC carried by the non-primary alt alleles
                    int otherAC = 0;
                    int nAltSeg = 0;
                    for ( int i = 0; i < ACs.length; i++ ) {
                        if ( ACs[i] > 0 ) nAltSeg++;
                        if ( i > 0 ) otherAC += ACs[i];
                    }

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
                    report.addRowList(columns);
                }
            }
        }

        /**
         * Enumerate the AC conformations to test for 1 or 2 alt alleles.
         *
         * @throws IllegalArgumentException if nAltAlleles > 2
         * @throws IllegalStateException    if nAltAlleles < 1
         */
        private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
            if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");

            final List<int[]> ACs = new LinkedList<int[]>();

            if ( nAltAlleles == 1 ) {
                // braces added for clarity: the original dangled this loop directly as the if body
                for ( int i = 0; i < nChrom; i++ )
                    ACs.add(new int[]{i});
            } else if ( nAltAlleles == 2 ) {
                for ( int i = 0; i < nChrom; i++ ) {
                    for ( final int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) {
                        if ( j < nChrom - i )
                            ACs.add(new int[]{i, j});
                    }
                }
            } else {
                // reachable only for nAltAlleles < 1
                throw new IllegalStateException("cannot get here");
            }

            return ACs;
        }
    }

    /**
     * Measures runtime as a function of where the single non-ref genotype sits in
     * the sample list.
     */
    private static class AnalyzeBySingletonPosition extends Analysis {
        public AnalyzeBySingletonPosition(final List<String> columns) {
            super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
        }

        public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(100) ) {
                final ExactAFCalculation calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                // a singleton het for the first alt allele; everyone else is hom-ref
                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1;
                final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);

                for ( int position = 0; position < vc.getNSamples(); position++ ) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(vc);
                    final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
                    Collections.rotate(genotypes, position); // move the singleton to index 'position'
                    vcb.genotypes(genotypes);

                    timer.start();
                    final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position));
                    report.addRowList(columns);
                }
            }
        }
    }

    /**
     * Measures runtime as a function of how many genotypes are completely
     * non-informative (all-zero PLs).
     */
    private static class AnalyzeByNonInformative extends Analysis {
        public AnalyzeByNonInformative(final List<String> columns) {
            super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
        }

        public void run(final ExactAFCalculationTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(100) ) {
                final ExactAFCalculation calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1;
                final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL);
                final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);

                for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(vc);

                    // Replace the LAST nNonInformative genotypes with non-informative copies.
                    // The original used subList(0, nNonInformative + 1) plus
                    // (nSamples - nNonInformative) copies, which produced nSamples + 1
                    // genotypes and an inverted count; fixed so exactly nNonInformative
                    // of the nSamples genotypes are non-informative.
                    final List<Genotype> genotypes = new ArrayList<Genotype>();
                    genotypes.addAll(vc.getGenotypes().subList(0, vc.getNSamples() - nNonInformative));
                    genotypes.addAll(Collections.nCopies(nNonInformative, nonInformative));
                    vcb.genotypes(genotypes);

                    timer.start();
                    final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative));
                    report.addRowList(columns);
                }
            }
        }
    }

    /**
     * Entry point.  args[0] is the path of the GATKReport output file.
     */
    public static void main(final String[] args) throws Exception {
        logger.addAppender(new ConsoleAppender(new SimpleLayout()));

        final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
                "exact.model", "prior.type", "runtime", "n.evaluations");

        final PrintStream out = new PrintStream(new FileOutputStream(args[0]));
        try {
            final boolean USE_GENERAL = false;
            final List<ExactAFCalculationTestBuilder.ModelType> modelTypes = USE_GENERAL
                    ? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values())
                    : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact);

            // NOTE: the original ternary was inverted (the flag selected ALL prior types
            // when true); branches swapped and the flag flipped to true so the effective
            // behavior -- human priors only -- is unchanged
            final boolean ONLY_HUMAN_PRIORS = true;
            final List<ExactAFCalculationTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
                    ? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human)
                    : Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values());

            // multi-allelic runs are much more expensive; cap the sample count for them
            final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100;

            final List<Analysis> analyzes = new ArrayList<Analysis>();
            analyzes.add(new AnalyzeByACAndPL(coreColumns));
            analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
            analyzes.add(new AnalyzeByNonInformative(coreColumns));

            for ( int iteration = 0; iteration < 1; iteration++ ) {
                for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
                    for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
                        if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 )
                            continue; // skip things that will take forever!

                        for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) {
                            for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) {
                                final ExactAFCalculationTestBuilder testBuilder
                                        = new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType);

                                for ( final Analysis analysis : analyzes ) {
                                    logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName())));
                                    // explicit type witness avoids the original unchecked List<Object> cast
                                    final List<Object> values = Arrays.<Object>asList(iteration, nAltAlleles, nSamples, modelType, priorType);
                                    analysis.run(testBuilder, values);
                                }
                            }
                        }
                    }
                }
            }

            // merge all per-analysis tables into a single report
            final GATKReport report = new GATKReport();
            for ( final Analysis analysis : analyzes )
                report.addTable(analysis.getTable());
            report.print(out);
        } finally {
            out.close(); // close the output even if an analysis throws
        }
    }
}

View File

@ -0,0 +1,158 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class ExactAFCalculationTestBuilder {
    final static Allele A = Allele.create("A", true);
    final static Allele C = Allele.create("C");
    final static Allele G = Allele.create("G");
    final static Allele T = Allele.create("T");

    // global counter so every generated genotype gets a unique sample name
    static int sampleNameCounter = 0;

    final int nSamples;
    final int numAltAlleles;
    final ModelType modelType;
    final PriorType priorType;

    /**
     * Create a builder of synthetic exact-AF-calculation test inputs.
     *
     * @param nSamples      number of diploid samples in the generated VariantContexts
     * @param numAltAlleles number of alternate alleles (at most 3, since only A/C/G/T exist here)
     * @param modelType     which exact AF calculation implementation to instantiate
     * @param priorType     which allele frequency prior to generate
     */
    public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles,
                                         final ModelType modelType, final PriorType priorType) {
        this.nSamples = nSamples;
        this.numAltAlleles = numAltAlleles;
        this.modelType = modelType;
        this.priorType = priorType;
    }

    public enum ModelType {
        DiploidExact,
        OptimizedDiploidExact,
        GeneralExact
    }

    public enum PriorType {
        flat,
        human
    }

    public int getnSamples() {
        return nSamples;
    }

    /**
     * Instantiate the exact AF calculation model selected by this builder's modelType.
     */
    public ExactAFCalculation makeModel() {
        switch (modelType) {
            case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4);
            case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4);
            case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
            default: throw new RuntimeException("Unexpected type " + modelType);
        }
    }

    /**
     * Build the log10 allele frequency priors (2 * nSamples + 1 values) for this
     * builder's priorType: flat (uniform) or human (theta = 0.001).
     */
    public double[] makePriors() {
        final int nPriorValues = 2 * nSamples + 1;

        switch ( priorType ) {
            case flat:
                return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
            case human:
                final double[] humanPriors = new double[nPriorValues];
                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
                return humanPriors;
            default:
                throw new RuntimeException("Unexpected type " + priorType);
        }
    }

    /**
     * Create a VariantContext whose genotypes carry exactly the requested alt allele
     * counts, splitting each AC into hets and hom-vars in Hardy-Weinberg-like proportions.
     *
     * @param ACs       desired allele count for each alt allele
     * @param nonTypePL the PL value assigned to every non-true genotype
     */
    public VariantContext makeACTest(final int[] ACs, final int nonTypePL) {
        final int nChrom = nSamples * 2;

        final int[] nhet = new int[numAltAlleles];
        final int[] nhomvar = new int[numAltAlleles];

        for ( int i = 0; i < ACs.length; i++ ) {
            final double p = ACs[i] / (1.0 * nChrom);
            nhomvar[i] = (int)Math.floor(nSamples * p * p); // expected hom-var count under HWE
            nhet[i] = ACs[i] - 2 * nhomvar[i];              // remaining allele copies carried by hets

            if ( nhet[i] < 0 )
                throw new IllegalStateException("Bug!");
        }

        // sanity check: hets + 2 * hom-vars must reproduce the requested ACs exactly
        final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
        if ( calcAC != MathUtils.sum(ACs) )
            throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));

        return makeACTest(nhet, nhomvar, nonTypePL);
    }

    /**
     * Create a VariantContext with the given number of het and hom-var genotypes per
     * alt allele, padded with hom-ref genotypes up to nSamples.
     *
     * @throws IllegalStateException if the requested hets/hom-vars exceed nSamples
     */
    public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) {
        final List<Genotype> samples = new ArrayList<Genotype>(nSamples);

        for ( int altI = 0; altI < nhet.length; altI++ ) {
            for ( int i = 0; i < nhet[altI]; i++ )
                samples.add(makePL(GenotypeType.HET, nonTypePL, altI + 1));
            for ( int i = 0; i < nhomvar[altI]; i++ )
                samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI + 1));
        }

        final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar));
        for ( int i = 0; i < nRef; i++ )
            samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0));

        // Validate BEFORE any truncation.  The original checked size() only after
        // subList(0, nSamples), which silently discarded the excess and made the
        // check unreachable dead code.
        if ( samples.size() > nSamples )
            throw new IllegalStateException("too many samples");

        final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
        vcb.genotypes(samples);
        return vcb.make();
    }

    /**
     * @return the reference allele followed by the first numAltAlleles alternates.
     */
    public List<Allele> getAlleles() {
        return Arrays.asList(A, C, G, T).subList(0, numAltAlleles + 1);
    }

    /**
     * @return the diploid allele pair for a genotype of the given type, where altI
     *         is the 1-based index of the alt allele involved.
     */
    public List<Allele> getAlleles(final GenotypeType type, final int altI) {
        switch (type) {
            case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
            case HET:     return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
            case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
            default:      throw new IllegalArgumentException("Unexpected type " + type);
        }
    }

    /**
     * Create a genotype with a unique sample name, the given alleles, and an explicit
     * PL vector.
     */
    public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
        gb.alleles(expectedGT);
        gb.PL(pls);
        return gb.make();
    }

    // number of diploid genotype likelihoods for numAltAlleles + 1 alleles
    private int numPLs() {
        return GenotypeLikelihoods.numLikelihoods(numAltAlleles + 1, 2);
    }

    /**
     * Create a genotype of the given type whose true-genotype PL is 0 and whose
     * every other PL entry is nonTypePL.
     */
    public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
        gb.alleles(getAlleles(type, altI));

        final int[] pls = new int[numPLs()];
        Arrays.fill(pls, nonTypePL);

        int index = 0;
        switch ( type ) {
            case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
            case HET:     index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
            case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
        }
        pls[index] = 0;

        gb.PL(pls);
        return gb.make();
    }
}

View File

@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class GeneralPloidyExactAFCalculation extends ExactAFCalculation {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
private final int ploidy;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
ploidy = UAC.samplePloidy;
this.UAC = UAC;
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, false, null, null, null);
this.ploidy = ploidy;
}
@Override
protected VariantContext reduceScope(VariantContext vc) {
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
final List<Allele> alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
GLs = subsetAlleles(vc, alleles, false, ploidy);
VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
return builder.make();
} else {
return vc;
}
}
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
return alleles;
@Override
public void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result);
}
@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
combinedPoolLikelihoods.add(set);
for (int p=1; p<genotypeLikelihoods.size(); p++) {
result.reset();
result.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, result);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
@ -226,6 +230,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
while ( !ACqueue.isEmpty() ) {
result.incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);

View File

@ -491,15 +491,15 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
// and we repeat until queue is empty
// queue of AC conformations to process
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
final LinkedList<ExactAFCalculation.ExactACset> ACqueue = new LinkedList<ExactAFCalculation.ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset = new HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset>(likelihoodDim);
// add AC=0 to the queue
final int[] zeroCounts = new int[nAlleles];
zeroCounts[0] = numChromosomes;
AlleleFrequencyCalculationModel.ExactACset zeroSet =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
ExactAFCalculation.ExactACset zeroSet =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
final ExactAFCalculation.ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
// adjust max likelihood seen if needed
@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
int plIdx = 0;
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
while (iterator.hasNext()) {
AlleleFrequencyCalculationModel.ExactACset ACset =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
ExactAFCalculation.ExactACset ACset =
new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector()));
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
}
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final double maxLog10L,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts,
ExactAFCalculation.ExactACset> indexesToACset,
final ReadBackedPileup pileup) {
// compute likelihood of set
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* @param numObservations Number of observations for each allele
* @param pileup Read backed pileup in case it's necessary
*/
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// Static methods
public static void updateACset(final int[] newSetCounts,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
final LinkedList<ExactAFCalculation.ExactACset> ACqueue,
final HashMap<ExactAFCalculation.ExactACcounts, ExactAFCalculation.ExactACset> indexesToACset) {
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index);
indexesToACset.put(index, newSet);
ACqueue.add(newSet);
if (VERBOSE)

View File

@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,

View File

@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = UAC.clone();
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());

View File

@ -0,0 +1,348 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class ExactAFCalculationModelUnitTest extends BaseTest {
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
static int sampleNameCounter = 0;
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors
final private static boolean INCLUDE_BIALLELIC = true;
final private static boolean INCLUDE_TRIALLELIC = true;
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
@BeforeSuite
public void before() {
// Bi-allelic genotypes over (A=ref, C=alt): the PL vector has 3 entries in
// AA, AC, CC order; the true genotype gets PL 0, everything else PL 20.
AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
// all-zero PLs: carries no information about the genotype
NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
// Tri-allelic genotypes over (A=ref, C, G): 6 PL entries in
// AA, AC, CC, AG, CG, GG order; again the true genotype gets PL 0.
AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
// all-zero PLs for the tri-allelic case
NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
}
/**
 * Build a genotype with a unique, auto-numbered sample name, the given alleles,
 * and the given PL vector.
 */
private Genotype makePL(final List<Allele> expectedGT, int ... pls) {
    final GenotypeBuilder builder = new GenotypeBuilder("sample" + sampleNameCounter++);
    builder.alleles(expectedGT);
    builder.PL(pls);
    return builder.make();
}
private class GetGLsTest extends TestDataProvider {
GenotypesContext GLs;
int numAltAlleles;
final ExactAFCalculation calc;
final int[] expectedACs;
final double[] priors;
final String priorName;
private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
super(GetGLsTest.class);
GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
this.numAltAlleles = numAltAlleles;
this.calc = calculation;
this.priors = priors;
this.priorName = priorName;
expectedACs = new int[numAltAlleles+1];
for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
expectedACs[alleleI] = 0;
final Allele allele = getAlleles().get(alleleI);
for ( Genotype g : arg ) {
expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
}
}
}
public AlleleFrequencyCalculationResult execute() {
return getCalc().getLog10PNonRef(getVC(), getPriors());
}
public double[] getPriors() {
return priors;
}
public ExactAFCalculation getCalc() {
return calc;
}
public VariantContext getVC() {
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
builder.genotypes(GLs);
return builder.make();
}
public List<Allele> getAlleles() {
return Arrays.asList(Allele.create("A", true),
Allele.create("C"),
Allele.create("G"),
Allele.create("T")).subList(0, numAltAlleles+1);
}
public int getExpectedAltAC(final int alleleI) {
return expectedACs[alleleI+1];
}
public String toString() {
return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
}
}
/**
 * Data provider: every permutation of well-formed bi- and tri-allelic genotypes,
 * crossed with each model implementation and prior type.
 */
@DataProvider(name = "wellFormedGLs")
public Object[][] createSimpleGLsData() {
    // pools of pre-built genotypes to draw sample permutations from
    final List<Genotype> twoAllelePool = Arrays.asList(AA1, AB1, BB1);
    final List<Genotype> threeAllelePool = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);

    for ( final int sampleCount : Arrays.asList(1, 2, 3, 4) ) {
        final ExactAFCalculation diploid = new DiploidExactAFCalculation(sampleCount, 4);
        final ExactAFCalculation optimizedDiploid = new OptimizedDiploidExactAFCalculation(sampleCount, 4);
        final ExactAFCalculation general = new GeneralPloidyExactAFCalculation(sampleCount, 4, 2);

        final int priorSize = 2 * sampleCount + 1;
        final double[] flatPriors = MathUtils.normalizeFromLog10(new double[priorSize], true); // flat priors
        final double[] humanPriors = new double[priorSize];
        UnifiedGenotyperEngine.computeAlleleFrequencyPriors(priorSize - 1, humanPriors, 0.001);

        for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) {
            // identity comparison is intentional: label which prior array this is
            final String priorName = priors == humanPriors ? "human" : "flat";

            for ( final ExactAFCalculation model : Arrays.asList(diploid, optimizedDiploid, general) ) {
                // bi-allelic cases
                if ( INCLUDE_BIALLELIC && sampleCount <= twoAllelePool.size() ) {
                    for ( final List<Genotype> genotypes : Utils.makePermutations(twoAllelePool, sampleCount, true) )
                        new GetGLsTest(model, 1, genotypes, priors, priorName);
                }

                // tri-allelic cases (human priors + general model disabled pending fix)
                if ( INCLUDE_TRIALLELIC && ( !priorName.equals("human") || model != general || Guillermo_FIXME ) ) {
                    for ( final List<Genotype> genotypes : Utils.makePermutations(threeAllelePool, sampleCount, true) )
                        new GetGLsTest(model, 2, genotypes, priors, priorName);
                }
            }
        }
    }

    return GetGLsTest.getTests(GetGLsTest.class);
}
/**
 * Bundle describing a set of informative ("called") genotypes together with the
 * non-informative genotype (all-zero PLs) to mix in, and the number of alt alleles
 * the genotypes were built against.
 */
private static class NonInformativeData {
    final Genotype nonInformative;   // genotype with all-zero PLs
    final List<Genotype> called;     // the informative genotypes
    final int nAltAlleles;           // alt allele count for these genotypes

    private NonInformativeData(List<Genotype> calledGenotypes, Genotype noInfoGenotype, int altAlleleCount) {
        this.called = calledGenotypes;
        this.nonInformative = noInfoGenotype;
        this.nAltAlleles = altAlleleCount;
    }
}
/**
 * Data provider: pairs of (informative-only, informative + N non-informative)
 * test cases, with the non-informative genotypes rotated through every position.
 */
@DataProvider(name = "GLsWithNonInformative")
public Object[][] makeGLsWithNonInformative() {
    final List<Object[]> tests = new ArrayList<Object[]>();

    final List<NonInformativeData> testConfigs = new LinkedList<NonInformativeData>();
    testConfigs.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
    testConfigs.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
    testConfigs.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));

    for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
        for ( final NonInformativeData config : testConfigs ) {
            // mix the called genotypes with nNonInformative copies of the no-info genotype
            final List<Genotype> mixed = new ArrayList<Genotype>();
            mixed.addAll(config.called);
            mixed.addAll(Collections.nCopies(nNonInformative, config.nonInformative));
            final int nSamples = mixed.size();

            final ExactAFCalculation diploid = new DiploidExactAFCalculation(nSamples, 4);
            final ExactAFCalculation optimizedDiploid = new OptimizedDiploidExactAFCalculation(nSamples, 4);
            final ExactAFCalculation general = new GeneralPloidyExactAFCalculation(nSamples, 4, 2);
            final double[] priors = new double[2 * nSamples + 1]; // flat priors

            for ( final ExactAFCalculation model : Arrays.asList(diploid, optimizedDiploid, general) ) {
                final GetGLsTest informativeOnly = new GetGLsTest(model, config.nAltAlleles, config.called, priors, "flat");

                // pair against every cumulative rotation of the mixed list, so the
                // non-informative genotypes appear at each position
                // (GetGLsTest copies its input, so rotating between constructions is safe)
                for ( int rotation = 0; rotation < nSamples; rotation++ ) {
                    Collections.rotate(mixed, 1);
                    final GetGLsTest withNonInformative = new GetGLsTest(model, config.nAltAlleles, mixed, priors, "flat");
                    tests.add(new Object[]{informativeOnly, withNonInformative});
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Runs the generic sanity checks (testResultSimple) over every well-formed
 * GL configuration produced by the "wellFormedGLs" data provider.
 */
@Test(enabled = true, dataProvider = "wellFormedGLs")
public void testGLs(GetGLsTest cfg) {
testResultSimple(cfg);
}
/**
 * Verifies that padding a calculation with non-informative genotypes does not
 * change any part of the result relative to running on only the informative
 * genotypes: likelihoods, posteriors, MLE/MAP values and ACs, and the alleles
 * used in genotyping must all be identical.
 */
@Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs")
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
    final AlleleFrequencyCalculationResult expected = onlyInformative.execute();
    final AlleleFrequencyCalculationResult actual = withNonInformative.execute();

    testResultSimple(withNonInformative);

    // Bug fix: the posterior of AF==0 must be compared against the expected
    // *posterior*, not the expected likelihood (the likelihood-vs-likelihood
    // comparison is the next assertion).
    Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10PosteriorOfAFzero());
    Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero());
    Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero());
    Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP());
    Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE());
    Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP());
    Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE());
    Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping());
}
/**
 * Generic sanity checks applied to any AF calculation result:
 * posteriors of AF==0 and AF>0 normalize to 1, the evaluation count is at
 * least the called chromosome count, the alleles used in genotyping are a
 * subset of the configured alleles, and each alt allele's MLE AC matches
 * the count expected from the configured genotypes.
 */
private void testResultSimple(final GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = cfg.execute();
// P(AF==0) + P(AF>0) must be a (normalized) probability partition
Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4);
final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount();
Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations,
"Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations);
Assert.assertNotNull(result.getAllelesUsedInGenotyping());
Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
// check each alt allele's MLE AC against the expectation derived from the input genotypes
for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI];
// alleles list is [ref, alt1, alt2, ...] so alt i lives at index i+1
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
}
// TODO
// TODO -- enable when we understand the contract between AC_MAP and pNonRef
// TODO
// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP());
// if ( AC_MAP > 0 ) {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// } else {
// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero());
// }
}
/**
 * Numeric-stability check with extremely large PL values: three hom-var samples
 * with PLs of 20,000,000 for the non-var genotypes must still produce a MAP AC
 * of 6 (all chromosomes carry the alt allele).
 */
@Test(enabled = true, dataProvider = "Models")
public void testLargeGLs(final ExactAFCalculation calc) {
    final Genotype homVar = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
    final GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(homVar, homVar, homVar), FLAT_3SAMPLE_PRIORS, "flat");
    final int mapAlleleCount = cfg.execute().getAlleleCountsOfMAP()[0];
    Assert.assertEquals(mapAlleleCount, 6);
}
/**
 * Two samples heterozygous for *different* alt alleles, with very different PL
 * magnitudes: the MAP allele counts must still be 1 for each alt allele.
 */
@Test(enabled = true, dataProvider = "Models")
public void testMismatchedGLs(final ExactAFCalculation calc) {
    final Genotype hetFirstAlt = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000);
    final Genotype hetSecondAlt = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100);
    final GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(hetFirstAlt, hetSecondAlt), FLAT_3SAMPLE_PRIORS, "flat");

    final int[] mapAlleleCounts = cfg.execute().getAlleleCountsOfMAP();
    Assert.assertEquals(mapAlleleCounts[0], 1);
    Assert.assertEquals(mapAlleleCounts[1], 1);
}
/**
 * Data provider listing the three exact AF calculation implementations under test,
 * each configured for 2 samples and up to 4 alt alleles.
 */
@DataProvider(name = "Models")
public Object[][] makeModels() {
    return new Object[][]{
            {new DiploidExactAFCalculation(2, 4)},
            {new OptimizedDiploidExactAFCalculation(2, 4)},
            {new GeneralPloidyExactAFCalculation(2, 4, 2)},
    };
}
/**
 * Sweeps a phred-scaled non-ref prior from 1 to just under 100x the het sample's
 * ref PL and checks that the bi-allelic call flips from AC=0 to AC=1 exactly when
 * the prior-weighted het probability overtakes the prior-weighted ref probability,
 * with the posterior of AF>0 crossing 0.5 at the same point.
 */
@Test(enabled = true, dataProvider = "Models")
public void testBiallelicPriors(final ExactAFCalculation model) {
    final int REF_PL = 10;
    final Genotype het = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);

    // note: the loop variable is a phred-scaled quality fed to qualToErrorProb
    for ( int phredScaledPrior = 1; phredScaledPrior < 100 * REF_PL; phredScaledPrior++ ) {
        final double refPrior = 1 - QualityUtils.qualToErrorProb(phredScaledPrior);
        final double hetPrior = (1 - refPrior) / 2;
        final double[] priors = MathUtils.toLog10(new double[]{refPrior, hetPrior, hetPrior});

        final GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(het), priors, "pNonRef" + phredScaledPrior);
        final AlleleFrequencyCalculationResult result = cfg.execute();

        final double[] log10Likelihoods = het.getLikelihoods().getAsVector();
        final double pRefWithPrior = log10Likelihoods[0] + priors[0];
        final double pHetWithPrior = log10Likelihoods[1] + priors[1];
        final boolean expectNonRef = pRefWithPrior <= pHetWithPrior;

        if ( expectNonRef )
            Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5);
        else
            Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5);

        final int actualAC = result.getAlleleCountsOfMAP()[0];
        final int expectedAC = expectNonRef ? 1 : 0;
        Assert.assertEquals(actualAC, expectedAC,
                "actual AC with priors " + phredScaledPrior + " not expected "
                        + expectedAC + " priors " + Utils.join(",", priors));
    }
}
/**
 * Sweeps a phred-scaled non-ref prior over a tri-allelic site (two samples, each
 * het for a different alt allele) and checks each alt allele's MAP AC against a
 * direct prior-weighted likelihood comparison. The second allele's effective
 * prior is modeled as the non-ref prior squared.
 *
 * Disabled pending investigation: see TODO below.
 */
@Test(enabled = false, dataProvider = "Models")
public void testTriallelicPriors(final ExactAFCalculation model) {
    // TODO
    // TODO
    // TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE
    // TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR.  TALK TO ERIC AND CONFIRM
    // TODO
    // TODO
    final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB
    final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000);
    final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000);

    for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) {
        final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
        final double nonRefPrior = (1-refPrior) / 2;
        final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior});
        final GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior);
        final AlleleFrequencyCalculationResult result = cfg.execute();

        // first alt allele (carried by sample AB): compare ref vs het with the plain prior
        final int actualAC_AB = result.getAlleleCountsOfMAP()[0];
        final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
        final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1];
        final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0;
        Assert.assertEquals(actualAC_AB, expectedAC_AB,
                "actual AC with priors " + log10NonRefPrior + " not expected "
                        + expectedAC_AB + " priors " + Utils.join(",", priors));

        // second alt allele (carried by sample AC): its effective prior is squared
        final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2);
        final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele;

        final int actualAC_AC = result.getAlleleCountsOfMAP()[1];
        // Bug fix: the ref probability for the second allele must come from the AC
        // sample's likelihoods, not AB's (the original line read AB.getLikelihoods()).
        final double pRefACWithPrior = AC.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele);
        final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele);
        final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0;
        Assert.assertEquals(actualAC_AC, expectedAC_AC,
                "actual AC with priors " + log10NonRefPrior + " not expected "
                        + expectedAC_AC + " priors " + Utils.join(",", priors));
    }
}
}

View File

@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));

View File

@ -1,13 +1,12 @@
package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
@ -59,4 +58,8 @@ public class StandardCallerArgumentCollection {
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 3;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
}

View File

@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
*/
private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer();
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
// multiple contigs
filePointer.setIsMonolithic(true);
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
*/
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
/**
* How many FilePointers have we pulled from the filePointers iterator?
*/
private int totalFilePointersConsumed = 0;
/**
* Have we encountered a monolithic FilePointer?
*/
private boolean encounteredMonolithicFilePointer = false;
{
createNextContigFilePointer();
advance();
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
logger.info("Loading BAM index data for next contig");
while ( filePointers.hasNext() ) {
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
// it is the ONLY FilePointer we ever encounter
if ( encounteredMonolithicFilePointer ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
}
if ( filePointers.peek().isMonolithic() ) {
if ( totalFilePointersConsumed > 0 ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
}
encounteredMonolithicFilePointer = true;
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
}
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
if ( nextContigFilePointers.isEmpty() ||
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
nextContigFilePointers.add(filePointers.next());
totalFilePointersConsumed++;
}
else {
break; // next FilePointer is on a different contig or has different mapped/unmapped status,

View File

@ -50,6 +50,14 @@ public class FilePointer {
*/
protected final boolean isRegionUnmapped;
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*/
private boolean isMonolithic = false;
public FilePointer( List<GenomeLoc> locations ) {
this.locations.addAll(locations);
this.isRegionUnmapped = checkUnmappedStatus();
@ -81,7 +89,8 @@ public class FilePointer {
}
private void validateLocations() {
if ( isRegionUnmapped ) {
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
if ( isRegionUnmapped || isMonolithic ) {
return;
}
@ -123,6 +132,29 @@ public class FilePointer {
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
}
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
*/
public boolean isMonolithic() {
return isMonolithic;
}
/**
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
* FP may contain intervals from more than one contig.
*
* @param isMonolithic set this FP's monolithic status to this value
*/
public void setIsMonolithic( boolean isMonolithic ) {
this.isMonolithic = isMonolithic;
}
@Override
public boolean equals(final Object other) {
if(!(other instanceof FilePointer))

View File

@ -215,19 +215,29 @@ public class ReadShard extends Shard {
int start = Integer.MAX_VALUE;
int stop = Integer.MIN_VALUE;
String contig = null;
boolean foundMapped = false;
for ( final SAMRecord read : reads ) {
if ( contig != null && ! read.getReferenceName().equals(contig) )
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
contig = read.getReferenceName();
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
// with unmapped mates.
if ( ! read.getReadUnmappedFlag() ) {
foundMapped = true;
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
}
}
assert contig != null;
if ( contig.equals("*") ) // all reads are unmapped
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
return GenomeLoc.UNMAPPED;
else
return parser.createGenomeLoc(contig, start, stop);

View File

@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
int numReadGroups = 0;
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
numReadGroups += header.getReadGroups().size();
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups);
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG);
recalibrationEngine = initializeRecalibrationEngine();
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);

View File

@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
public File existingRecalibrationReport = null;
public GATKReportTable generateReportTable(final String covariateNames) {

View File

@ -0,0 +1,233 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
*/
public abstract class AlleleFrequencyCalculation implements Cloneable {
private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class);
public enum Model {
/** The default model with the best performance in all cases */
EXACT("ExactAFCalculation");
final String implementationName;
private Model(String implementationName) {
this.implementationName = implementationName;
}
}
protected int nSamples;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
private SimpleTimer callTimer = new SimpleTimer();
private PrintStream callReport = null;
protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter);
}
protected AlleleFrequencyCalculation(final int nSamples,
final int maxAltAlleles,
final boolean capMaxAltsForIndels,
final File exactCallsLog,
final Logger logger,
final PrintStream verboseWriter) {
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
this.nSamples = nSamples;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles;
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = capMaxAltsForIndels;
this.logger = logger == null ? defaultLogger : logger;
this.verboseWriter = verboseWriter;
if ( exactCallsLog != null )
initializeOutputFile(exactCallsLog);
}
/**
* @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult)
*
* Allocates a new results object. Useful for testing but slow in practice.
*/
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE));
}
/**
* Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
*
* @param vc the VariantContext holding the alleles and sample information
* @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
* @param result a pre-allocated (for efficiency) object to hold the result of the calculation
* @return result (for programming convenience)
*/
public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
if ( result == null ) throw new IllegalArgumentException("Results object cannot be null");
// reset the result, so we can store our new result there
result.reset();
final VariantContext vcWorking = reduceScope(vc);
callTimer.start();
computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result);
final long nanoTime = callTimer.getElapsedTimeNano();
if ( callReport != null )
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero());
result.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return result;
}
// ---------------------------------------------------------------------------
//
// Abstract methods that should be implemented by concrete implementations
// to actually calculate the AF
//
// ---------------------------------------------------------------------------
/**
* Look at VC and perhaps return a new one of reduced complexity, if that's necessary
*
* Used before the call to computeLog10PNonRef to simply the calculation job at hand,
* if vc exceeds bounds. For example, if VC has 100 alt alleles this function
* may decide to only genotype the best 2 of them.
*
* @param vc the initial VC provided by the caller to this AFcalculation
* @return a potentially simpler VC that's more tractable to genotype
*/
@Requires("vc != null")
@Ensures("result != null")
protected abstract VariantContext reduceScope(final VariantContext vc);
/**
* Actually carry out the log10PNonRef calculation on vc, storing results in results
*
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store results
*/
// TODO -- add consistent requires among args
protected abstract void computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
*
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// ---------------------------------------------------------------------------
//
// Print information about the call to the calls log
//
// ---------------------------------------------------------------------------
private void initializeOutputFile(final File outputFile) {
try {
if (outputFile != null) {
callReport = new PrintStream( new FileOutputStream(outputFile) );
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
}
private void printCallInfo(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final double log10PosteriorOfAFzero) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for ( final Allele a : vc.getAlleles() )
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for ( final Genotype g : vc.getGenotypes() )
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
callReport.flush();
}
private void printCallElement(final VariantContext vc,
final Object variable,
final Object key,
final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
}

View File

@ -25,9 +25,12 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.List;
/**
* Created by IntelliJ IDEA.
@ -35,9 +38,10 @@ import java.util.Arrays;
* Date: Dec 14, 2011
*
* Useful helper class to communicate the results of the allele frequency calculation
*
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
*/
public class AlleleFrequencyCalculationResult {
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult {
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capability of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
* Get the log10 value of the probability mass at the MLE
*
* @return a log10 prob
*/
@Ensures("goodLog10Value(result)")
public double getLog10MLE() {
return log10MLE;
}
/**
* Get the log10 value of the probability mass at the max. a posterior (MAP)
*
* @return a log10 prob
*/
@Ensures("goodLog10Value(result)")
public double getLog10MAP() {
return log10MAP;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*
* @return a vector with allele counts, not all of which may be meaningful
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MAP
*
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
*
* @return a non-null vector of ints
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
*
* @return the number of evaluations required to produce the answer for this AF calculation
*/
public int getnEvaluations() {
return nEvaluations;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
if ( log10PosteriorMatrixSum == null ) {
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult {
return log10PosteriorMatrixSum;
}
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero;
}
public void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
/**
 * Get the list of alleles actually used in genotyping.
 *
 * Due to computational / implementation constraints this may be smaller than
 * the actual list of alleles requested.
 *
 * @return a non-empty list of alleles used during genotyping
 * @throws IllegalStateException if the alleles have not been recorded yet
 */
@Ensures({"result != null", "! result.isEmpty()"})
public List<Allele> getAllelesUsedInGenotyping() {
    if ( allelesUsedInGenotyping != null )
        return allelesUsedInGenotyping;
    throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set");
}
/**
 * Get the normalized probability -- across all AFs -- of AC == 0, NOT LOG10.
 *
 * @return the linear-scale posterior probability that the site is non-variant
 */
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
// TODO -- we should own these values in a more meaningful way and return good values in the case
// TODO -- where this happens, or instead throw an error and have a function to say "was this calculation successful"
// @Ensures({"result >= 0.0", "result <= 1.0"})
public double getNormalizedPosteriorOfAFzero() {
    return getNormalizedPosteriors()[0];
}
/**
 * Get the normalized probability -- across all AFs -- of AC > 0, NOT LOG10.
 *
 * @return the linear-scale posterior probability that the site is variant
 */
// TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc.
// TODO -- we should own these values in a more meaningful way and return good values in the case
// TODO -- where this happens, or instead throw an error and have a function to say "was this calculation successful"
//@Ensures({"result >= 0.0", "result <= 1.0"})
public double getNormalizedPosteriorOfAFGTZero() {
    return getNormalizedPosteriors()[1];
}
/**
 * Normalize the pair { log10 P(AF == 0), log10 sum of P(AF > 0) } into linear-scale
 * probabilities that sum to one.
 */
private double[] getNormalizedPosteriors() {
    final double log10AFzero = getLog10PosteriorOfAFzero();
    final double log10AFgreaterThanZero = getLog10PosteriorsMatrixSumWithoutAFzero();
    return MathUtils.normalizeFromLog10(new double[] { log10AFzero, log10AFgreaterThanZero });
}
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
 * Reset the data in this results object, so that it can be used in a subsequent AF calculation
 *
 * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
 */
protected void reset() {
    log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED;
    Arrays.fill(alleleCountsOfMLE, 0);
    Arrays.fill(alleleCountsOfMAP, 0);
    // bug fix: the evaluation counter must be cleared too, otherwise getnEvaluations()
    // reports a running total across all calculations instead of the count for this one
    nEvaluations = 0;
    currentPosteriorsCacheIndex = 0;
    log10PosteriorMatrixSum = null;
    allelesUsedInGenotyping = null;
}
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
/**
 * Record that one more evaluation cycle was spent producing this result.
 */
protected void incNEvaluations() {
    this.nEvaluations += 1;
}
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult {
}
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToPosteriorsCache(log10LofK);
if ( log10LofK > log10MAP ) {
@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult {
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult {
}
}
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
/**
 * Record the log10 posterior of the AF == 0 conformation, updating the MAP if needed.
 *
 * If this posterior beats the best (MAP) value seen so far, AF == 0 becomes the MAP and
 * the MAP allele counts are zeroed to match (AC == 0 for all alternate alleles).
 *
 * @param log10PosteriorOfAFzero the log10 posterior probability of AC == 0
 */
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
    this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
    if ( log10PosteriorOfAFzero > log10MAP ) {
        log10MAP = log10PosteriorOfAFzero;
        Arrays.fill(alleleCountsOfMAP, 0);
    }
}
/**
 * Record the alleles that were actually used during genotyping.
 *
 * @param allelesUsedInGenotyping a non-null, non-empty list of alleles
 */
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
    if ( allelesUsedInGenotyping != null && ! allelesUsedInGenotyping.isEmpty() )
        this.allelesUsedInGenotyping = allelesUsedInGenotyping;
    else
        throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
}
/**
 * Is result a sane log10 probability value?
 *
 * A valid log10 probability must be non-positive and not NaN.  -Infinity is accepted
 * (log10 of probability 0, and also the VALUE_NOT_CALCULATED sentinel in this package).
 *
 * Bug fix: the previous expression (result <= 0.0 || isInfinite || isNaN) returned true
 * for NaN and for +Infinity, neither of which can ever be a good log10 value.
 */
private static boolean goodLog10Value(final double result) {
    return result <= 0.0 && ! Double.isNaN(result);
}
}

View File

@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class DiploidExactAFCalculation extends ExactAFCalculation {
// private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
/**
 * Create a DiploidExactAFCalculation with no argument collection, indel capping,
 * call log, logging, or verbose output.
 *
 * @param nSamples      number of samples
 * @param maxAltAlleles maximum number of alternate alleles to genotype
 */
public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
    super(nSamples, maxAltAlleles, false, null, null, null);
}
/**
 * Dynamically found in UnifiedGenotyperEngine
 *
 * NOTE(review): "dynamically found" suggests this constructor is looked up reflectively
 * by UnifiedGenotyperEngine -- confirm, and do not change its signature without
 * updating that lookup.
 *
 * @param UAC           the UnifiedGenotyper argument collection
 * @param N             number of samples
 * @param logger        logger for warnings
 * @param verboseWriter optional stream for verbose output
 */
public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    super(UAC, N, logger, verboseWriter);
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
/**
 * Compute log10 P(non-ref) for vc, delegating to the multi-allelic linear exact model
 * over vc's genotypes and its vc.getNAlleles() - 1 alternate alleles.
 *
 * @param vc                         the variant context carrying genotype likelihoods
 * @param log10AlleleFrequencyPriors the log10 priors indexed by allele count
 * @param result                     pre-allocated object receiving all results
 */
@Override
public void computeLog10PNonRef(final VariantContext vc,
                                final double[] log10AlleleFrequencyPriors,
                                final AlleleFrequencyCalculationResult result) {
    linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
}
/**
 * Limit the number of alternate alleles we will genotype, keeping only the most likely ones.
 *
 * Bug fix: removed residual pre-refactor statements left behind by a merge -- an
 * assignment to an undeclared `alleles`, an assignment to an undeclared `GLs`, and a
 * dead `linearExactMultiAllelic(...); return alleles;` after the if/else -- which made
 * the method uncompilable.  The cleaned body matches the sibling implementation in
 * OptimizedDiploidExactAFCalculation.
 *
 * @param vc the incoming variant context
 * @return vc itself when within limits, otherwise a rebuilt context restricted to the
 *         reference plus the most likely alternate alleles, with genotypes subset to match
 */
@Override
protected VariantContext reduceScope(final VariantContext vc) {
    // indels are capped at 2 alt alleles when CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS is set
    final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;

    // don't try to genotype too many alternate alleles
    if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
        logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
        VariantContextBuilder builder = new VariantContextBuilder(vc);
        List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
        alleles.add(vc.getReference());
        alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
        builder.alleles(alleles);
        builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
        return builder.make();
    } else {
        return vc;
    }
}
private static final int PL_INDEX_OF_HOM_REF = 0;
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
while ( !ACqueue.isEmpty() ) {
result.incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);

View File

@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* The model representing how we calculate a genotype given the priors and a pile
* of bases and quality scores
* Uses the Exact calculation of Heng Li
*/
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
/** The default model with the best performance in all cases */
EXACT
abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
/**
 * Create an ExactAFCalculation configured from a UnifiedArgumentCollection.
 *
 * @param UAC           the UnifiedGenotyper argument collection
 * @param nSamples      number of samples
 * @param logger        logger for warnings
 * @param verboseWriter optional stream for verbose output
 */
protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) {
    super(UAC, nSamples, logger, verboseWriter);
}
protected int N;
protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
protected Logger logger;
protected PrintStream verboseWriter;
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
this.N = N;
this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.logger = logger;
this.verboseWriter = verboseWriter;
/**
 * Create an ExactAFCalculation from explicit configuration values rather than a
 * UnifiedArgumentCollection.
 *
 * @param nSamples            number of samples
 * @param maxAltAlleles       maximum number of alternate alleles to genotype
 * @param capMaxAltsForIndels if true, indels are capped more aggressively
 * @param exactCallsLog       optional file receiving a log of exact calls, or null
 * @param logger              logger for warnings, or null
 * @param verboseWriter       optional stream for verbose output, or null
 */
protected ExactAFCalculation(final int nSamples, int maxAltAlleles, boolean capMaxAltsForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) {
    super(nSamples, maxAltAlleles, capMaxAltsForIndels, exactCallsLog, logger, verboseWriter);
}
/**
@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
return genotypeLikelihoods;
}
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param log10AlleleFrequencyPriors priors
* @param result (pre-allocated) object to store likelihoods results
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
// -------------------------------------------------------------------------------------
//
// protected classes used to store exact model matrix columns

View File

@ -0,0 +1,496 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
/**
 * Optimized diploid implementation of the exact model (after Heng Li) for computing the
 * probability that a site is polymorphic, generalized to multiple alternate alleles.
 *
 * Allele-count (AC) conformations are explored through a work queue starting from AC == 0
 * and expanding toward higher-frequency conformations, aborting early once likelihoods
 * fall more than MAX_LOG10_ERROR_TO_STOP_EARLY below the maximum seen.
 */
public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation {

    // private final static boolean DEBUG = false;

    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6

    /**
     * Create a calculation with no argument collection, indel capping, call log, logging,
     * or verbose output.
     */
    public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) {
        super(nSamples, maxAltAlleles, false, null, null, null);
    }

    /**
     * Dynamically found in UnifiedGenotyperEngine
     *
     * NOTE(review): appears to be looked up reflectively -- do not change the signature
     * without updating that lookup.
     *
     * @param UAC           the UnifiedGenotyper argument collection
     * @param N             number of samples
     * @param logger        logger for warnings
     * @param verboseWriter optional stream for verbose output
     */
    public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
        super(UAC, N, logger, verboseWriter);
    }

    /**
     * Compute log10 P(non-ref) for vc, delegating to the multi-allelic linear exact model.
     */
    @Override
    public void computeLog10PNonRef(final VariantContext vc,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
        linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result);
    }

    /**
     * Limit the number of alternate alleles we will genotype, keeping the most likely ones.
     */
    @Override
    protected VariantContext reduceScope(final VariantContext vc) {
        // indels are capped at 2 alt alleles when CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS is set
        final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;

        // don't try to genotype too many alternate alleles
        if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
            logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
            VariantContextBuilder builder = new VariantContextBuilder(vc);
            List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
            alleles.add(vc.getReference());
            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
            builder.alleles(alleles);
            builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
            return builder.make();
        } else {
            return vc;
        }
    }

    private static final int PL_INDEX_OF_HOM_REF = 0;

    /**
     * Choose the numAllelesToChoose alternate alleles carrying the most likelihood mass,
     * returned in the same relative order they appear in vc.
     */
    private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
        for ( int i = 0; i < numOriginalAltAlleles; i++ )
            likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

        // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
        final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
        for ( final double[] likelihoods : GLs ) {
            final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
            if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
                GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
                if ( alleles.alleleIndex1 != 0 )
                    likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
                // don't double-count it
                if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
                    likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
            }
        }

        // sort them by probability mass and choose the best ones
        // note: Arrays.asList returns a view backed by the array, so this sorts likelihoodSums in place
        Collections.sort(Arrays.asList(likelihoodSums));
        final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( int i = 0; i < numAllelesToChoose; i++ )
            bestAlleles.add(likelihoodSums[i].allele);

        // re-emit the chosen alleles in the original VariantContext order
        final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( bestAlleles.contains(allele) )
                orderedBestAlleles.add(allele);
        }

        return orderedBestAlleles;
    }

    // -------------------------------------------------------------------------------------
    //
    // Multi-allelic implementation.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Run the linear exact model over all AC conformations, writing likelihoods,
     * posteriors, MLE, and MAP into result.
     *
     * @param GLs                        per-sample genotype likelihoods
     * @param numAlternateAlleles        number of alternate alleles under consideration
     * @param log10AlleleFrequencyPriors log10 priors indexed by allele count
     * @param result                     pre-allocated results container
     */
    public static void linearExactMultiAllelic(final GenotypesContext GLs,
                                               final int numAlternateAlleles,
                                               final double[] log10AlleleFrequencyPriors,
                                               final AlleleFrequencyCalculationResult result) {
        // sample GL vectors appear to be 1-indexed (index 0 unused; all loops below start
        // at j = 1), hence the size() - 1 here -- TODO confirm against getGLs()
        final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples;

        // queue of AC conformations to process
        final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();

        // mapping of ExactACset indexes to the objects
        final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);

        // add AC=0 to the queue
        int[] zeroCounts = new int[numAlternateAlleles];
        ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
        ACqueue.add(zeroSet);
        indexesToACset.put(zeroSet.ACcounts, zeroSet);

        // keep processing while we have AC conformations that need to be calculated
        MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
        while ( !ACqueue.isEmpty() ) {
            result.incNEvaluations(); // keep track of the number of evaluations

            // compute log10Likelihoods
            final ExactACset set = ACqueue.remove();
            final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);

            // adjust max likelihood seen if needed
            if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
                maxLikelihoodSeen.update(log10LofKs, set.ACcounts);

            // clean up memory
            indexesToACset.remove(set.ACcounts);
            //if ( DEBUG )
            //    System.out.printf("  *** removing used set=%s%n", set.ACcounts);
        }
    }

    /**
     * A future AC conformation paired with the PL index through which likelihood mass is
     * pushed into it from the current conformation.
     */
    private static final class DependentSet {
        public final int[] ACcounts;
        public final int PLindex;

        public DependentSet(final int[] ACcounts, final int PLindex) {
            this.ACcounts = ACcounts;
            this.PLindex = PLindex;
        }
    }

    /**
     * Compute L(j,k) for one AC conformation and enqueue the higher-frequency (k+1 and
     * k+2) conformations that depend on it, unless we can abort early.
     *
     * @return the log10 likelihood of this conformation
     */
    private static double calculateAlleleCountConformation(final ExactACset set,
                                                           final ArrayList<double[]> genotypeLikelihoods,
                                                           final MaxLikelihoodSeen maxLikelihoodSeen,
                                                           final int numChr,
                                                           final LinkedList<ExactACset> ACqueue,
                                                           final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                           final double[] log10AlleleFrequencyPriors,
                                                           final AlleleFrequencyCalculationResult result) {
        //if ( DEBUG )
        //    System.out.printf("  *** computing LofK for set=%s%n", set.ACcounts);

        // compute the log10Likelihoods
        computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);

        final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // can we abort early because the log10Likelihoods are so small?
        if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
            //if ( DEBUG )
            //    System.out.printf("  *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            return log10LofK;
        }

        // iterate over higher frequencies if possible
        final int ACwiggle = numChr - set.getACsum();
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        final int numAltAlleles = set.ACcounts.getCounts().length;

        // add conformations for the k+1 case
        for ( int allele = 0; allele < numAltAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            // to get to this conformation, a sample would need to be AB (remember that ref=0)
            final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
            updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
        if ( ACwiggle > 1 ) {
            final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
            final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

            for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
                for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                    final int[] ACcountsClone = set.ACcounts.getCounts().clone();
                    ACcountsClone[allele_i]++;
                    ACcountsClone[allele_j]++;

                    // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
                    final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
                    if ( allele_i == allele_j )
                        sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
                    else
                        differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
                }
            }

            // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
            for ( DependentSet dependent : differentAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
            for ( DependentSet dependent : sameAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        return log10LofK;
    }

    // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
    // also pushes its value to the given callingSetIndex.
    private static void updateACset(final int[] newSetCounts,
                                    final int numChr,
                                    final ExactACset dependentSet,
                                    final int PLsetIndex,
                                    final Queue<ExactACset> ACqueue,
                                    final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                    final ArrayList<double[]> genotypeLikelihoods) {
        final ExactACcounts index = new ExactACcounts(newSetCounts);
        if ( !indexesToACset.containsKey(index) ) {
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
        }

        // push data from the dependency to the new set
        //if ( DEBUG )
        //    System.out.println(" *** pushing data from " + index + " to " + dependentSet.ACcounts);
        pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
    }

    /**
     * Fill in the likelihood column L(j, .) for this conformation and update the MLE/MAP
     * in result.  For AC == 0 this also records the AF == 0 likelihood and posterior.
     */
    private static void computeLofK(final ExactACset set,
                                    final ArrayList<double[]> genotypeLikelihoods,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
        set.log10Likelihoods[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

        // special case for k = 0 over all k
        if ( totalK == 0 ) {
            for ( int j = 1; j < set.log10Likelihoods.length; j++ )
                set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];

            final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
            result.setLog10LikelihoodOfAFzero(log10Lof0);
            result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
            return;
        }

        // if we got here, then k > 0 for at least one k.
        // the non-AA possible conformations were already dealt with by pushes from dependent sets;
        // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
        for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
            if ( totalK < 2*j-1 ) {
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
                set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
            }

            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
            set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
        }

        double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // update the MLE if necessary
        result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);

        // apply the priors over each alternate allele
        for ( final int ACcount : set.ACcounts.getCounts() ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }

        result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
    }

    /**
     * Push likelihood mass from a dependent (lower-AC) conformation into targetSet
     * through the genotype identified by PLsetIndex.
     */
    private static void pushData(final ExactACset targetSet,
                                 final ExactACset dependentSet,
                                 final int PLsetIndex,
                                 final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();

        for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
            if ( totalK <= 2*j ) { // skip impossible conformations
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue =
                        determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
                targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
            }
        }
    }

    /**
     * Compute the log10 combinatorial coefficient for the genotype at PLindex given the
     * current AC conformation (see the closed-form cases enumerated below).
     */
    private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
        // the closed form representation generalized for multiple alleles is as follows:
        // AA: (2j - totalK) * (2j - totalK - 1)
        // AB: 2k_b * (2j - totalK)
        // AC: 2k_c * (2j - totalK)
        // BB: k_b * (k_b - 1)
        // BC: 2 * k_b * k_c
        // CC: k_c * (k_c - 1)

        // find the 2 alleles that are represented by this PL index
        GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);

        // *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
        // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does.   ***

        // the AX het case
        if ( alleles.alleleIndex1 == 0 )
            return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];

        final int k_i = ACcounts[alleles.alleleIndex1-1];

        // the hom var case (e.g. BB, CC, DD)
        final double coeff;
        if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
            coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
        }
        // the het non-ref case (e.g. BC, BD, CD)
        else {
            final int k_j = ACcounts[alleles.alleleIndex2-1];
            coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
        }

        return coeff;
    }

    /**
     * Subset vc's genotypes to allelesToUse.  The ploidy argument is ignored here: this
     * implementation is diploid-only and delegates to subsetDiploidAlleles.
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
    }

    // -------------------------------------------------------------------------------------
    //
    // Deprecated bi-allelic ~O(N) implementation.  Kept here for posterity.
    //
    // -------------------------------------------------------------------------------------

    /**
     * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
     * for the exact model calculation
     */
    /*
    private final static class ExactACCache {
        double[] kMinus2, kMinus1, kMinus0;

        private final static double[] create(int n) {
            return new double[n];
        }

        public ExactACCache(int n) {
            kMinus2 = create(n);
            kMinus1 = create(n);
            kMinus0 = create(n);
        }

        final public void rotate() {
            double[] tmp = kMinus2;
            kMinus2 = kMinus1;
            kMinus1 = kMinus0;
            kMinus0 = tmp;
        }

        final public double[] getkMinus2() {
            return kMinus2;
        }

        final public double[] getkMinus1() {
            return kMinus1;
        }

        final public double[] getkMinus0() {
            return kMinus0;
        }
    }

    public int linearExact(GenotypesContext GLs,
                           double[] log10AlleleFrequencyPriors,
                           double[][] log10AlleleFrequencyLikelihoods,
                           double[][] log10AlleleFrequencyPosteriors) {
        final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples;

        final ExactACCache logY = new ExactACCache(numSamples+1);
        logY.getkMinus0()[0] = 0.0; // the zero case

        double maxLog10L = Double.NEGATIVE_INFINITY;
        boolean done = false;
        int lastK = -1;

        for (int k=0; k <= numChr && ! done; k++ ) {
            final double[] kMinus0 = logY.getkMinus0();

            if ( k == 0 ) { // special case for k = 0
                for ( int j=1; j <= numSamples; j++ ) {
                    kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
                }
            } else { // k > 0
                final double[] kMinus1 = logY.getkMinus1();
                final double[] kMinus2 = logY.getkMinus2();

                for ( int j=1; j <= numSamples; j++ ) {
                    final double[] gl = genotypeLikelihoods.get(j);
                    final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];

                    double aa = Double.NEGATIVE_INFINITY;
                    double ab = Double.NEGATIVE_INFINITY;
                    if (k < 2*j-1)
                        aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];

                    if (k < 2*j)
                        ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];

                    double log10Max;
                    if (k > 1) {
                        final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
                        log10Max = approximateLog10SumLog10(aa, ab, bb);
                    } else {
                        // we know we aren't considering the BB case, so we can use an optimized log10 function
                        log10Max = approximateLog10SumLog10(aa, ab);
                    }

                    // finally, update the L(j,k) value
                    kMinus0[j] = log10Max - logDenominator;
                }
            }

            // update the posteriors vector
            final double log10LofK = kMinus0[numSamples];
            log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
            log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];

            // can we abort early?
            lastK = k;
            maxLog10L = Math.max(maxLog10L, log10LofK);
            if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
                //if ( DEBUG ) System.out.printf("  *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
                done = true;
            }

            logY.rotate();
        }

        return lastK;
    }

    final static double approximateLog10SumLog10(double a, double b, double c) {
        return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
    }
    */
}

View File

@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@ -186,7 +186,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
@ -224,6 +223,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.minReferenceDepth = minReferenceDepth;
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
uac.exactCallsLog = exactCallsLog;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
@ -242,5 +242,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.exactCallsLog = SCAC.exactCallsLog;
}
}

View File

@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
}
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL)
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
}

View File

@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
private ThreadLocal<AlleleFrequencyCalculation> afcm = new ThreadLocal<AlleleFrequencyCalculation>();
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine {
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[2]);
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine {
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
final Allele alternateAllele = vc.getAlternateAllele(i);
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele);
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;
@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine {
}
// calculate p(f>0):
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0];
final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero();
final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero();
double phredScaledConfidence;
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
}
// start constructing the resulting VC
@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine {
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine {
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine {
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
}
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine {
return glcm;
}
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
List<Class<? extends AlleleFrequencyCalculation>> afClasses = new PluginManager<AlleleFrequencyCalculation>(AlleleFrequencyCalculation.class).getPlugins();
// user-specified name
String afModelName = UAC.AFmodel.name();
String afModelName = UAC.AFmodel.implementationName;
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
afModelName = GPSTRING + afModelName;
else
afModelName = "Diploid" + afModelName;
for (int i = 0; i < afClasses.size(); i++) {
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
Class<? extends AlleleFrequencyCalculation> afClass = afClasses.get(i);
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
if (afModelName.equalsIgnoreCase(key)) {
try {
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
return (AlleleFrequencyCalculationModel)c.newInstance(args);
return (AlleleFrequencyCalculation)c.newInstance(args);
}
catch (Exception e) {
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
}
}
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel);
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {

View File

@ -24,7 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector {
flatPriors = new double[1+2*samples.size()];
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true;

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
import java.util.*;
@ -30,6 +32,7 @@ import java.util.*;
* produces a binary ped file in individual major mode.
*/
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100))
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
boolean majorAlleleFirst = false;
private ValidateVariants vv = new ValidateVariants();
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
private static final byte HOM_REF = 0x0;
@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
private static final String PLINK_DELETION_MARKER = "-";
// note that HET and NO_CALL are flipped from the documentation: that's because
// plink actually reads these in backwards; and we want to use a shift operator
// to put these in the appropriate location
@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private List<String> famOrder = new ArrayList<String>();
public void initialize() {
initializeValidator();
writeBedHeader();
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
// create temporary output streams and buffers
@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
if ( tracker == null ) {
return 0;
}
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
return 0;
}
try {
vv.map(tracker,ref,context);
} catch (UserException e) {
validateVariantSite(vc,ref,context);
} catch (TribbleException e) {
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
"Please run ValidateVariants for more detailed information.");
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
}
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
String refOut;
String altOut;
String vcRef = getReferenceAllele(vc);
String vcAlt = getAlternateAllele(vc);
boolean altMajor;
if ( majorAlleleFirst ) {
// want to use the major allele as ref
@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
}
if ( getAF(ats.get("AF")) > 0.5 ) {
refOut = vc.getAlternateAllele(0).getBaseString();
altOut = vc.getReference().getBaseString();
refOut = vcAlt;
altOut = vcRef;
altMajor = true;
} else {
refOut = vc.getReference().getBaseString();
altOut = vc.getAlternateAllele(0).getBaseString();
refOut = vcRef;
altOut = vcAlt;
altMajor = false;
}
} else {
refOut = vc.getReference().getBaseString();
altOut = vc.getAlternateAllele(0).getBaseString();
refOut = vcRef;
altOut = vcAlt;
altMajor = false;
}
// write an entry into the map file
@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) {
byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
if ( ! checkGQIsGood(g) ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_REF;
} else if ( g.isHomVar() ) {
@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
if ( genotype.hasGQ() ) {
return genotype.getGQ() >= minGenotypeQuality;
} else if ( genotype.hasLikelihoods() ) {
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
}
return false;
@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
}
}
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() {
// write magic bits into the ped file
try {
@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return metaValues;
}
    /**
     * Validates the VCF record at this site against the observed reference sequence.
     *
     * Compares the record's declared reference allele to the bases actually present in the
     * reference context, then checks the alternate alleles for internal consistency.
     * Validation failures propagate as exceptions from the vc.validate* calls
     * (the caller wraps this in a try/catch and reports a UserException).
     *
     * @param vc      the variant record to validate
     * @param ref     reference context overlapping the record's start position
     * @param context alignment context (currently unused by this check)
     */
    private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
        final Allele reportedRefAllele = vc.getReference();
        final int refLength = reportedRefAllele.length();
        // Only a limited reference window is guaranteed (the walker declares a 100bp
        // @Reference window), so overlong ref alleles are logged and skipped, not validated.
        if ( refLength > 100 ) {
            logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
            return;
        }
        // Extract the first refLength bases of the reference window and compare them
        // to the allele the VCF claims to be the reference.
        final byte[] observedRefBases = new byte[refLength];
        System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);
        final Allele observedRefAllele = Allele.create(observedRefBases);
        vc.validateReferenceBases(reportedRefAllele, observedRefAllele);
        vc.validateAlternateAlleles();
    }
private String getReferenceAllele(VariantContext vc) {
if ( vc.isSimpleInsertion() ) {
// bi-allelic, so we just have "-" for ped output
return PLINK_DELETION_MARKER;
}
if ( vc.isSymbolic() ) {
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1.
return "1";
}
if ( vc.isSimpleDeletion() ) {
// bi-allelic. Want to take the standard representation and strip off the leading base.
return vc.getReference().getBaseString().substring(1);
}
// snp or mnp
return vc.getReference().getBaseString();
}
private String getAlternateAllele(VariantContext vc ) {
if ( vc.isSimpleInsertion() ) {
// bi-allelic. Want to take the standard representation and strip off the leading base.
return vc.getAlternateAllele(0).getBaseString().substring(1);
}
if ( vc.isSymbolic() ) {
// either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2.
return "2";
}
if ( vc.isSimpleDeletion() ) {
// bi-allelic, so we just have "-" for ped output
return PLINK_DELETION_MARKER;
}
// snp or mnp
return vc.getAlternateAllele(0).getBaseString();
}
}

View File

@ -236,6 +236,33 @@ public class Utils {
}
}
public static <T> List<T> append(final List<T> left, T ... elts) {
final List<T> l = new LinkedList<T>(left);
l.addAll(Arrays.asList(elts));
return l;
}
/**
* Returns a string of the values in joined by separator, such as A,B,C
*
* @param separator
* @param doubles
* @return
*/
public static String join(String separator, double[] doubles) {
if ( doubles == null || doubles.length == 0)
return "";
else {
StringBuilder ret = new StringBuilder();
ret.append(doubles[0]);
for (int i = 1; i < doubles.length; ++i) {
ret.append(separator);
ret.append(doubles[i]);
}
return ret.toString();
}
}
/**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.collections;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.PrintStream;
/**
 * Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put())
 * to the provided output stream. For testing/debugging purposes.
 *
 * Log entries are of the following form (fields are tab-separated):
 * LABEL    VALUE    KEY1    KEY2 ... KEY_N
 *
 * @author David Roazen
 */
public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {

    // Destination stream for log entries; guaranteed non-null after construction.
    private final PrintStream log;

    // Prefix written at the start of every log entry ("" when none was supplied).
    private final String logEntryLabel;

    /**
     * @param log output stream to which to log update operations; must not be null
     * @param logEntryLabel String that should be prefixed to each log entry; null is treated as ""
     * @param dimensions sizes of each dimension of the underlying array
     */
    public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) {
        super(dimensions);

        if ( log == null ) {
            throw new ReviewedStingException("Log output stream must not be null");
        }

        this.log = log;
        this.logEntryLabel = logEntryLabel != null ? logEntryLabel : "";
    }

    /**
     * Stores value at the given keys via the superclass, then appends one tab-separated
     * line (LABEL, VALUE, KEY1..KEY_N) describing the update to the log stream.
     */
    @Override
    public void put( final T value, final int... keys ) {
        super.put(value, keys);

        final StringBuilder logEntry = new StringBuilder();
        logEntry.append(logEntryLabel);
        logEntry.append("\t");
        logEntry.append(value);
        for ( int key : keys ) {
            logEntry.append("\t");
            logEntry.append(key);
        }

        // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe
        log.println(logEntry.toString());
    }
}

View File

@ -25,9 +25,12 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
import java.io.PrintStream;
/**
* Utility class to facilitate on-the-fly base quality score recalibration.
*
@ -52,19 +55,31 @@ public class RecalibrationTables {
private final NestedIntegerArray[] tables;
public RecalibrationTables(final Covariate[] covariates) {
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1);
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null);
}
public RecalibrationTables(final Covariate[] covariates, final PrintStream log) {
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) {
this(covariates, numReadGroups, null);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) {
tables = new NestedIntegerArray[covariates.length];
final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1;
final int eventDimension = EventType.values().length;
tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension);
tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension);
tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, "READ_GROUP_TABLE", numReadGroups, eventDimension);
tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension);
for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++)
tables[i] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
tables[i] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1),
numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
}
public NestedIntegerArray<RecalDatum> getReadGroupTable() {

View File

@ -1,127 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ExactAFCalculationModelUnitTest extends BaseTest {

    // Per-sample log10 genotype likelihood vectors for the bi-allelic case (alleles A,B).
    // Layout is {AA, AB, BB}; 0.0 marks the favored genotype, -20.0 everything else.
    static double[] AA1, AB1, BB1;
    // Per-sample log10 genotype likelihood vectors for the tri-allelic case (alleles A,B,C).
    // Layout is {AA, AB, BB, AC, BC, CC}; 0.0 marks the favored genotype.
    static double[] AA2, AB2, AC2, BB2, BC2, CC2;
    static final int numSamples = 3;
    static double[] priors = new double[2*numSamples+1]; // flat priors

    @BeforeSuite
    public void before() {
        AA1 = new double[]{0.0, -20.0, -20.0};
        AB1 = new double[]{-20.0, 0.0, -20.0};
        BB1 = new double[]{-20.0, -20.0, 0.0};
        AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
        AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
        BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
        BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
        CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
    }

    // Test fixture: a GenotypesContext plus the number of alt alleles it segregates.
    // The test name encodes the expected MAP allele counts (e.g. "B1C1" => AC(B)=1, AC(C)=1).
    private class GetGLsTest extends TestDataProvider {
        GenotypesContext GLs;
        int numAltAlleles;
        String name;

        private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
            super(GetGLsTest.class, name);
            GLs = GenotypesContext.create(arg);
            this.name = name;
            this.numAltAlleles = numAltAlleles;
        }

        public String toString() {
            return String.format("%s input=%s", super.toString(), GLs);
        }
    }

    // Builds a diploid no-call genotype carrying the given PL vector.
    private static Genotype createGenotype(String name, double[] gls) {
        return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
    }

    @DataProvider(name = "getGLs")
    public Object[][] createGLsData() {
        // bi-allelic case: names are "B<expected AC of B>"
        new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
        new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
        new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
        new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
        new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
        new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
        new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
        new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));

        // tri-allelic case: names are "B<expected AC of B>C<expected AC of C>"
        new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
        new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
        new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
        new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
        new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
        new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
        new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));

        return GetGLsTest.getTests(GetGLsTest.class);
    }

    @Test(dataProvider = "getGLs")
    public void testGLs(GetGLsTest cfg) {
        final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
        ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);

        // Expected ACs live at alternating character positions of the test name:
        // index 1 for the first alt allele, index 3 for the second, etc.
        int nameIndex = 1;
        for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
            int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
            int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
            Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
        }
    }

    @Test
    public void testLargeGLs() {
        // Very large-magnitude likelihoods: the exact model should still find AC=6
        // (all three samples hom-var) without numerical trouble.
        final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
        GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));

        final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
        ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);

        int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
        Assert.assertEquals(calculatedAlleleCount, 6);
    }

    @Test
    public void testMismatchedGLs() {
        // Samples whose PL vectors have different magnitudes (one AB het, one AC het)
        // should still combine into AC=1 for each alt allele.
        final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
        final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
        GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));

        final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
        ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);

        Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
        Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
    }
}

View File

@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095");
}
private void testOutputParameters(final String args, final String md5) {

View File

@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
@Test
public void testNA12878HighGQ() {
String testName = "testNA12878HighGQ";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
);
executeTest(testName, spec);
}
@Test
public void testVCFMismatchReference() {
String testName = "testVCFMismatchReference";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void test1000GWithIndels() {
String testName = "test1000GWithIndels";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
);
}
@Test
public void test1000G_Symbolic() {
String testName = "test1000G_Symbolic";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
);
}
@Test
public void testCEUTrio() {
String testName = "testCEUTrio";
@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
}

View File

@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
private def createQueueHeader() : Seq[String] = {
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
"Copyright (c) 2012 The Broad Institute",
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
"For support and documentation go to http://www.broadinstitute.org/gatk")
}
private def getQueueVersion : String = {