diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java new file mode 100644 index 000000000..b4d041061 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java @@ -0,0 +1,228 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Logger; +import org.apache.log4j.SimpleLayout; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 10/2/12 + * Time: 10:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ExactAFCalculationPerformanceTest { + final static Logger logger = Logger.getLogger(ExactAFCalculationPerformanceTest.class); + + private static abstract class Analysis { + final GATKReport report; + + public Analysis(final String name, final List columns) { + report = GATKReport.newSimpleReport(name, columns); + } + + public abstract void run(final ExactAFCalculationTestBuilder testBuilder, + final List coreColumns); + + public String getName() { + return getTable().getTableName(); + } + + public GATKReportTable getTable() { + return report.getTables().iterator().next(); + } + } + + private static class AnalyzeByACAndPL extends Analysis { + public AnalyzeByACAndPL(final List columns) { + super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) { + final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors); + final long runtime = timer.getElapsedTimeNano(); + + int otherAC = 0; + int nAltSeg = 0; + for ( int i = 0; i < ACs.length; i++ ) { + nAltSeg += ACs[i] > 0 ? 1 : 0; + if ( i > 0 ) otherAC += ACs[i]; + } + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC)); + report.addRowList(columns); + } + } + } + + private List makeACs(final int nAltAlleles, final int nChrom) { + if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3"); + + final List ACs = new LinkedList(); + + if ( nAltAlleles == 1 ) + for ( int i = 0; i < nChrom; i++ ) { + ACs.add(new int[]{i}); + } else if ( nAltAlleles == 2 ) { + for ( int i = 0; i < nChrom; i++ ) { + for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) { + if ( j < nChrom - i ) + ACs.add(new int[]{i, j}); + } + } + } else { + throw new IllegalStateException("cannot get here"); + } + + return ACs; + } + } + + private static class AnalyzeBySingletonPosition extends Analysis { + public AnalyzeBySingletonPosition(final List columns) { + super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); + + for ( int position = 0; position < vc.getNSamples(); position++ ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final List genotypes = new ArrayList(vc.getGenotypes()); + Collections.rotate(genotypes, position); + vcb.genotypes(genotypes); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors); + final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, position)); + report.addRowList(columns); + } + } + } + } + + private static class AnalyzeByNonInformative extends Analysis { + public AnalyzeByNonInformative(final List columns) { + super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative")); + } + + public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final ExactAFCalculation calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); + final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); + + for ( int nNonInformative = 0; nNonInformative < vc.getNSamples(); nNonInformative++ ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + + final List genotypes = new ArrayList(); + genotypes.addAll(vc.getGenotypes().subList(0, nNonInformative + 1)); + genotypes.addAll(Collections.nCopies(vc.getNSamples() - nNonInformative, nonInformative)); + vcb.genotypes(genotypes); + + timer.start(); + final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vcb.make(), priors); + final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, nNonInformative)); + report.addRowList(columns); + } + } + } + } + + public static void main(final String[] args) throws Exception { + logger.addAppender(new ConsoleAppender(new SimpleLayout())); + + final List coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples", + "exact.model", "prior.type", "runtime", "n.evaluations"); + + final PrintStream out = new PrintStream(new FileOutputStream(args[0])); + + final boolean USE_GENERAL = false; + final List modelTypes = USE_GENERAL + ? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values()) + : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact); + + final boolean ONLY_HUMAN_PRIORS = false; + final List priorTypes = ONLY_HUMAN_PRIORS + ? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values()) + : Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human); + + final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100; + + final List analyzes = new ArrayList(); + analyzes.add(new AnalyzeByACAndPL(coreColumns)); + analyzes.add(new AnalyzeBySingletonPosition(coreColumns)); + analyzes.add(new AnalyzeByNonInformative(coreColumns)); + + for ( int iteration = 0; iteration < 1; iteration++ ) { + for ( final int nAltAlleles : Arrays.asList(1, 2) ) { + for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) { + if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 ) + continue; // skip things that will take forever! + + for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) { + for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) { + final ExactAFCalculationTestBuilder testBuilder + = new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType); + + for ( final Analysis analysis : analyzes ) { + logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName()))); + final List values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType); + analysis.run(testBuilder, (List)values); + } + } + } + } + } + } + + final GATKReport report = new GATKReport(); + for ( final Analysis analysis : analyzes ) + report.addTable(analysis.getTable()); + report.print(out); + out.close(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java new file mode 100644 index 000000000..f472a1140 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java @@ -0,0 +1,158 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class ExactAFCalculationTestBuilder { + final static Allele A = Allele.create("A", true); + final static Allele C = Allele.create("C"); + final static Allele G = Allele.create("G"); + final static Allele T = Allele.create("T"); + + static int sampleNameCounter = 0; + + final int nSamples; + final int numAltAlleles; + final ModelType modelType; + final PriorType priorType; + + public ExactAFCalculationTestBuilder(final int nSamples, final int numAltAlleles, + final ModelType modelType, final PriorType priorType) { + this.nSamples = nSamples; + this.numAltAlleles = numAltAlleles; + this.modelType = modelType; + this.priorType = priorType; + } + + public enum ModelType { + DiploidExact, + OptimizedDiploidExact, + GeneralExact + } + + public enum PriorType { + flat, + human + } + + public int getnSamples() { + return nSamples; + } + + public ExactAFCalculation makeModel() { + switch (modelType) { + case DiploidExact: return new DiploidExactAFCalculation(nSamples, 4); + case OptimizedDiploidExact: return new OptimizedDiploidExactAFCalculation(nSamples, 4); + case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + default: throw new RuntimeException("Unexpected type " + modelType); + } + } + + public double[] makePriors() { + final int nPriorValues = 2*nSamples+1; + + switch ( priorType ) { + case flat: + return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + case human: + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + return humanPriors; + default: + throw new RuntimeException("Unexpected type " + priorType); + } + } + + public VariantContext makeACTest(final int[] ACs, final int nonTypePL) { + final int nChrom = nSamples * 2; + + final int[] nhet = new int[numAltAlleles]; + final int[] nhomvar = new int[numAltAlleles]; + + for ( int i = 0; i < ACs.length; i++ ) { + final double p = ACs[i] / (1.0 * nChrom); + nhomvar[i] = (int)Math.floor(nSamples * p * p); + nhet[i] = ACs[i] - 2 * nhomvar[i]; + + if ( nhet[i] < 0 ) + throw new IllegalStateException("Bug!"); + } + + final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar); + if ( calcAC != MathUtils.sum(ACs) ) + throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs)); + + return makeACTest(nhet, nhomvar, nonTypePL); + } + + public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) { + List samples = new ArrayList(nSamples); + + for ( int altI = 0; altI < nhet.length; altI++ ) { + for ( int i = 0; i < nhet[altI]; i++ ) + samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1)); + for ( int i = 0; i < nhomvar[altI]; i++ ) + samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1)); + } + + final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)); + for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0)); + + samples = samples.subList(0, nSamples); + + if ( samples.size() > nSamples ) + throw new IllegalStateException("too many samples"); + + VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles()); + vcb.genotypes(samples); + return vcb.make(); + } + + public List getAlleles() { + return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1); + } + + public List getAlleles(final GenotypeType type, final int altI) { + switch (type) { + case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0)); + case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI)); + case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI)); + default: throw new IllegalArgumentException("Unexpected type " + type); + } + } + + public Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private int numPLs() { + return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2); + } + + public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(getAlleles(type, altI)); + + final int[] pls = new int[numPLs()]; + Arrays.fill(pls, nonTypePL); + + int index = 0; + switch ( type ) { + case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break; + case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break; + case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break; + } + pls[index] = 0; + gb.PL(pls); + + return gb.make(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java similarity index 96% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java index 5662d82d6..903d553da 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java @@ -34,43 +34,47 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; -public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel { +public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them - final protected UnifiedArgumentCollection UAC; private final int ploidy; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + protected GeneralPloidyExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); ploidy = UAC.samplePloidy; - this.UAC = UAC; - } - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); + public GeneralPloidyExactAFCalculation(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, false, null, null, null); + this.ploidy = ploidy; + } + @Override + protected VariantContext reduceScope(VariantContext vc) { // don't try to genotype too many alternate alleles if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + final List alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy)); - - GLs = subsetAlleles(vc, alleles, false, ploidy); + VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); + return builder.make(); + } else { + return vc; } + } - combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result); - - return alleles; + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, result); } @@ -194,7 +198,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula combinedPoolLikelihoods.add(set); for (int p=1; p ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(likelihoodDim); + final HashMap indexesToACset = new HashMap(likelihoodDim); // add AC=0 to the queue final int[] zeroCounts = new int[nAlleles]; zeroCounts[0] = numChromosomes; - AlleleFrequencyCalculationModel.ExactACset zeroSet = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts)); + ExactAFCalculation.ExactACset zeroSet = + new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts)); ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); @@ -508,7 +508,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods - final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove(); + final ExactAFCalculation.ExactACset ACset = ACqueue.remove(); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); // adjust max likelihood seen if needed @@ -525,8 +525,8 @@ public abstract class GeneralPloidyGenotypeLikelihoods { int plIdx = 0; SumIterator iterator = new SumIterator(nAlleles, numChromosomes); while (iterator.hasNext()) { - AlleleFrequencyCalculationModel.ExactACset ACset = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector())); + ExactAFCalculation.ExactACset ACset = + new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector())); // for observed base X, add Q(jX,k) to likelihood vector for all k in error model //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); @@ -540,14 +540,14 @@ public abstract class GeneralPloidyGenotypeLikelihoods { } - private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set, + private double calculateACConformationAndUpdateQueue(final DiploidExactAFCalculation.ExactACset set, final ErrorModel errorModel, final List alleleList, final List numObservations, final double maxLog10L, - final LinkedList ACqueue, - final HashMap indexesToACset, + final LinkedList ACqueue, + final HashMap indexesToACset, final ReadBackedPileup pileup) { // compute likelihood of set getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); @@ -597,7 +597,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * @param numObservations Number of observations for each allele * @param pileup Read backed pileup in case it's necessary */ - public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, @@ -608,12 +608,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // Static methods public static void updateACset(final int[] newSetCounts, - final LinkedList ACqueue, - final HashMap indexesToACset) { + final LinkedList ACqueue, + final HashMap indexesToACset) { - final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts); + final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { - AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index); + ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index); indexesToACset.put(index, newSet); ACqueue.add(newSet); if (VERBOSE) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index ac212cfb5..d038934ba 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -188,7 +188,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index 944372907..fc9910cc0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -12,7 +12,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -218,7 +221,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index f4d8a88e0..71e4f5f8a 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested + UnifiedArgumentCollection simpleUAC = UAC.clone(); + simpleUAC.exactCallsLog = null; + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java new file mode 100644 index 000000000..602009654 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -0,0 +1,348 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class ExactAFCalculationModelUnitTest extends BaseTest { + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + static int sampleNameCounter = 0; + static Genotype AA1, AB1, BB1, NON_INFORMATIVE1; + static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2; + final double[] FLAT_3SAMPLE_PRIORS = new double[2*3+1]; // flat priors + final private static boolean INCLUDE_BIALLELIC = true; + final private static boolean INCLUDE_TRIALLELIC = true; + final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug + + @BeforeSuite + public void before() { + AA1 = makePL(Arrays.asList(A, A), 0, 20, 20); + AB1 = makePL(Arrays.asList(A, C), 20, 0, 20); + BB1 = makePL(Arrays.asList(C, C), 20, 20, 0); + NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); + + AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20); + AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20); + BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20); + AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20); + BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20); + CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0); + NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0); + } + + private Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + final ExactAFCalculation calc; + final int[] expectedACs; + final double[] priors; + final String priorName; + + private GetGLsTest(final ExactAFCalculation calculation, int numAltAlleles, List arg, final double[] priors, final String priorName) { + super(GetGLsTest.class); + GLs = GenotypesContext.create(new ArrayList(arg)); + this.numAltAlleles = numAltAlleles; + this.calc = calculation; + this.priors = priors; + this.priorName = priorName; + + expectedACs = new int[numAltAlleles+1]; + for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) { + expectedACs[alleleI] = 0; + final Allele allele = getAlleles().get(alleleI); + for ( Genotype g : arg ) { + expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele); + } + } + } + + public AlleleFrequencyCalculationResult execute() { + return getCalc().getLog10PNonRef(getVC(), getPriors()); + } + + public double[] getPriors() { + return priors; + } + + public ExactAFCalculation getCalc() { + return calc; + } + + public VariantContext getVC() { + VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles()); + builder.genotypes(GLs); + return builder.make(); + } + + public List getAlleles() { + return Arrays.asList(Allele.create("A", true), + Allele.create("C"), + Allele.create("G"), + Allele.create("T")).subList(0, numAltAlleles+1); + } + + public int getExpectedAltAC(final int alleleI) { + return expectedACs[alleleI+1]; + } + + public String toString() { + return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(), + priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs); + } + } + + @DataProvider(name = "wellFormedGLs") + public Object[][] createSimpleGLsData() { + final List biAllelicSamples = Arrays.asList(AA1, AB1, BB1); + final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); + + for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { + final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + + final int nPriorValues = 2*nSamples+1; + final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + + for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { + for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) { + final String priorName = priors == humanPriors ? "human" : "flat"; + + // bi-allelic + if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() ) + for ( List genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 1, genotypes, priors, priorName); + + // tri-allelic + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || model != generalCalc || Guillermo_FIXME ) ) + for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 2, genotypes, priors, priorName); + } + } + } + + return GetGLsTest.getTests(GetGLsTest.class); + } + + private static class NonInformativeData { + final Genotype nonInformative; + final List called; + final int nAltAlleles; + + private NonInformativeData(List called, Genotype nonInformative, int nAltAlleles) { + this.called = called; + this.nonInformative = nonInformative; + this.nAltAlleles = nAltAlleles; + } + } + + @DataProvider(name = "GLsWithNonInformative") + public Object[][] makeGLsWithNonInformative() { + List tests = new ArrayList(); + + final List nonInformativeTests = new LinkedList(); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2)); + + for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) { + for ( final NonInformativeData testData : nonInformativeTests ) { + final List samples = new ArrayList(); + samples.addAll(testData.called); + samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); + + final int nSamples = samples.size(); + final ExactAFCalculation diploidCalc = new DiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation optDiploidCalc = new OptimizedDiploidExactAFCalculation(nSamples, 4); + final ExactAFCalculation generalCalc = new GeneralPloidyExactAFCalculation(nSamples, 4, 2); + final double[] priors = new double[2*nSamples+1]; // flat priors + + for ( ExactAFCalculation model : Arrays.asList(diploidCalc, optDiploidCalc, generalCalc) ) { + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); + + for ( int rotation = 0; rotation < nSamples; rotation++ ) { + Collections.rotate(samples, 1); + final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat"); + tests.add(new Object[]{onlyInformative, withNonInformative}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "wellFormedGLs") + public void testGLs(GetGLsTest cfg) { + testResultSimple(cfg); + } + + @Test(enabled = true, dataProvider = "GLsWithNonInformative", dependsOnMethods = "testGLs") + public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) { + final AlleleFrequencyCalculationResult expected = onlyInformative.execute(); + final AlleleFrequencyCalculationResult actual = withNonInformative.execute(); + + testResultSimple(withNonInformative); + + Assert.assertEquals(actual.getLog10PosteriorOfAFzero(), expected.getLog10LikelihoodOfAFzero()); + Assert.assertEquals(actual.getLog10LikelihoodOfAFzero(), expected.getLog10LikelihoodOfAFzero()); + Assert.assertEquals(actual.getLog10PosteriorsMatrixSumWithoutAFzero(), expected.getLog10PosteriorsMatrixSumWithoutAFzero()); + Assert.assertEquals(actual.getAlleleCountsOfMAP(), expected.getAlleleCountsOfMAP()); + Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE()); + Assert.assertEquals(actual.getLog10MAP(), expected.getLog10MAP()); + Assert.assertEquals(actual.getLog10MLE(), expected.getLog10MLE()); + Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping()); + } + + private void testResultSimple(final GetGLsTest cfg) { + final AlleleFrequencyCalculationResult result = cfg.execute(); + + Assert.assertEquals(result.getNormalizedPosteriorOfAFzero() + result.getNormalizedPosteriorOfAFGTZero(), 1.0, 1e-4); + + final int minNumberOfEvaluations = cfg.getVC().getCalledChrCount(); + Assert.assertTrue(result.getnEvaluations() >= minNumberOfEvaluations, + "Number of evaluations " + result.getnEvaluations() + " must be at least " + minNumberOfEvaluations); + Assert.assertNotNull(result.getAllelesUsedInGenotyping()); + Assert.assertTrue(cfg.getAlleles().containsAll(result.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list"); + + for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) { + int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI); + int calcAC_MLE = result.getAlleleCountsOfMLE()[altAlleleI]; + + final Allele allele = cfg.getAlleles().get(altAlleleI+1); + Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele); + } + + // TODO + // TODO -- enable when we understand the contract between AC_MAP and pNonRef + // TODO +// final int AC_MAP = (int)MathUtils.sum(result.getAlleleCountsOfMAP()); +// if ( AC_MAP > 0 ) { +// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() < 0.50, "MAP AC " + AC_MAP + " > 0 but we had posterior AF = 0 > 0.5 of " + result.getNormalizedPosteriorOfAFzero()); +// } else { +// Assert.assertTrue(result.getNormalizedPosteriorOfAFzero() > 0.50, "MAP AC " + AC_MAP + " == 0 but we had posterior AF = 0 < 0.5 of " + result.getNormalizedPosteriorOfAFzero()); +// } + } + + @Test(enabled = true, dataProvider = "Models") + public void testLargeGLs(final ExactAFCalculation calc) { + final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0); + GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat"); + + final AlleleFrequencyCalculationResult result = cfg.execute(); + + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } + + @Test(enabled = true, dataProvider = "Models") + public void testMismatchedGLs(final ExactAFCalculation calc) { + final Genotype AB = makePL(Arrays.asList(A,C), 2000, 0, 2000, 2000, 2000, 2000); + final Genotype AC = makePL(Arrays.asList(A,G), 100, 100, 100, 0, 100, 100); + GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat"); + + final AlleleFrequencyCalculationResult result = cfg.execute(); + + Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1); + Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1); + } + + @DataProvider(name = "Models") + public Object[][] makeModels() { + List tests = new ArrayList(); + + tests.add(new Object[]{new DiploidExactAFCalculation(2, 4)}); + tests.add(new Object[]{new OptimizedDiploidExactAFCalculation(2, 4)}); + tests.add(new Object[]{new GeneralPloidyExactAFCalculation(2, 4, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "Models") + public void testBiallelicPriors(final ExactAFCalculation model) { + final int REF_PL = 10; + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000); + + for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL; log10NonRefPrior += 1 ) { + final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); + final double[] priors = MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}); + GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); + final AlleleFrequencyCalculationResult result = cfg.execute(); + final int actualAC = result.getAlleleCountsOfMAP()[0]; + + final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1]; + final boolean expectNonRef = pRefWithPrior <= pHetWithPrior; + + if ( expectNonRef ) + Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() > 0.5); + else + Assert.assertTrue(result.getNormalizedPosteriorOfAFGTZero() < 0.5); + + final int expectedAC = expectNonRef ? 1 : 0; + Assert.assertEquals(actualAC, expectedAC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC + " priors " + Utils.join(",", priors)); + } + } + + @Test(enabled = false, dataProvider = "Models") + public void testTriallelicPriors(final ExactAFCalculation model) { + // TODO + // TODO + // TODO THIS SEEMS TO ID A BUG IN THE EXACT MODEL FOR MULTI-ALLELICS, AS THE + // TODO SECOND ALLELE ISN'T HAVING A SQUARED PRIOR. TALK TO ERIC AND CONFIRM + // TODO + // TODO + final int REF_PL_AB = 10, REF_PL_AC = 20; // first AC goes, then AB + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL_AB, 0, 10000, 10000, 10000); + final Genotype AC = makePL(Arrays.asList(A, G), REF_PL_AC, 10000, 10000, 0, 10000, 10000); + + for ( int log10NonRefPrior = 1; log10NonRefPrior < 100*REF_PL_AC; log10NonRefPrior += 1 ) { + final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); + final double nonRefPrior = (1-refPrior) / 2; + final double[] priors = MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior, nonRefPrior}); + GetGLsTest cfg = new GetGLsTest(model, 2, Arrays.asList(AB, AC), priors, "pNonRef" + log10NonRefPrior); + final AlleleFrequencyCalculationResult result = cfg.execute(); + final int actualAC_AB = result.getAlleleCountsOfMAP()[0]; + + final double pRefABWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetABWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1]; + final int expectedAC_AB = pRefABWithPrior <= pHetABWithPrior ? 1 : 0; + Assert.assertEquals(actualAC_AB, expectedAC_AB, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC_AB + " priors " + Utils.join(",", priors)); + + final double nonRefPriorSecondAllele = Math.pow(nonRefPrior, 2); + final double refPriorSecondAllele = 1 - nonRefPriorSecondAllele; + final int actualAC_AC = result.getAlleleCountsOfMAP()[1]; + final double pRefACWithPrior = AB.getLikelihoods().getAsVector()[0] + Math.log10(refPriorSecondAllele); + final double pHetACWithPrior = AC.getLikelihoods().getAsVector()[3] + Math.log10(nonRefPriorSecondAllele); + final int expectedAC_AC = pRefACWithPrior <= pHetACWithPrior ? 1 : 0; + Assert.assertEquals(actualAC_AC, expectedAC_AC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedAC_AC + " priors " + Utils.join(",", priors)); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java index 983f562d2..a646e6f09 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java @@ -141,7 +141,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); + GeneralPloidyExactAFCalculation.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index f30fc0316..b2e1a12c6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -1,13 +1,12 @@ package org.broadinstitute.sting.gatk.arguments; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; + /** * Created with IntelliJ IDEA. * User: rpoplin @@ -59,4 +58,8 @@ public class StandardCallerArgumentCollection { @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 3; + + @Hidden + @Argument(shortName = "logExactCalls", doc="x", required=false) + public File exactCallsLog = null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index d0e310d3f..8ee7e0439 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator { */ private FilePointer generatePointerOverEntireFileset() { FilePointer filePointer = new FilePointer(); + + // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is + // the only FilePointer we will create. This allows us to have this FilePointer represent regions from + // multiple contigs + filePointer.setIsMonolithic(true); + Map currentPosition; // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java index 6c064cf86..0440c7eae 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { */ private PeekableIterator currentContigReadsIterator = null; + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? + */ + private boolean encounteredMonolithicFilePointer = false; + + { createNextContigFilePointer(); advance(); @@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { logger.info("Loading BAM index data for next contig"); while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge if ( nextContigFilePointers.isEmpty() || @@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; } else { break; // next FilePointer is on a different contig or has different mapped/unmapped status, diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index 50f4e0273..639887cf3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -50,6 +50,14 @@ public class FilePointer { */ protected final boolean isRegionUnmapped; + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + */ + private boolean isMonolithic = false; + public FilePointer( List locations ) { this.locations.addAll(locations); this.isRegionUnmapped = checkUnmappedStatus(); @@ -81,7 +89,8 @@ public class FilePointer { } private void validateLocations() { - if ( isRegionUnmapped ) { + // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction + if ( isRegionUnmapped || isMonolithic ) { return; } @@ -123,6 +132,29 @@ public class FilePointer { return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + * + * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false + */ + public boolean isMonolithic() { + return isMonolithic; + } + + /** + * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all + * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic + * FP may contain intervals from more than one contig. + * + * @param isMonolithic set this FP's monolithic status to this value + */ + public void setIsMonolithic( boolean isMonolithic ) { + this.isMonolithic = isMonolithic; + } + @Override public boolean equals(final Object other) { if(!(other instanceof FilePointer)) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 662c7526b..27e666f6f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -215,19 +215,29 @@ public class ReadShard extends Shard { int start = Integer.MAX_VALUE; int stop = Integer.MIN_VALUE; String contig = null; + boolean foundMapped = false; for ( final SAMRecord read : reads ) { if ( contig != null && ! read.getReferenceName().equals(contig) ) throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. " + "First contig is " + contig + " next read was " + read.getReferenceName() ); contig = read.getReferenceName(); - if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); - if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + + // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates + // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, + // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment + // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* + // with unmapped mates. + if ( ! read.getReadUnmappedFlag() ) { + foundMapped = true; + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } } assert contig != null; - if ( contig.equals("*") ) // all reads are unmapped + if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped return GenomeLoc.UNMAPPED; else return parser.createGenomeLoc(contig, start, stop); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e78b9b6fc..ee6a619fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups); + recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); recalibrationEngine.initialize(requestedCovariates, recalibrationTables); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index f1f0ce38e..fc7d8a8a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only") + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java new file mode 100755 index 000000000..4189dbd6d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; + + +/** + * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods + */ +public abstract class AlleleFrequencyCalculation implements Cloneable { + private final static Logger defaultLogger = Logger.getLogger(AlleleFrequencyCalculation.class); + + public enum Model { + /** The default model with the best performance in all cases */ + EXACT("ExactAFCalculation"); + + final String implementationName; + + private Model(String implementationName) { + this.implementationName = implementationName; + } + } + + protected int nSamples; + protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; + protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; + + protected Logger logger; + protected PrintStream verboseWriter; + + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + + private SimpleTimer callTimer = new SimpleTimer(); + private PrintStream callReport = null; + + protected AlleleFrequencyCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { + this(nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.exactCallsLog, logger, verboseWriter); + } + + protected AlleleFrequencyCalculation(final int nSamples, + final int maxAltAlleles, + final boolean capMaxAltsForIndels, + final File exactCallsLog, + final Logger logger, + final PrintStream verboseWriter) { + if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); + + this.nSamples = nSamples; + this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = maxAltAlleles; + this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = capMaxAltsForIndels; + this.logger = logger == null ? defaultLogger : logger; + this.verboseWriter = verboseWriter; + if ( exactCallsLog != null ) + initializeOutputFile(exactCallsLog); + } + + /** + * @see #getLog10PNonRef(org.broadinstitute.sting.utils.variantcontext.VariantContext, double[], AlleleFrequencyCalculationResult) + * + * Allocates a new results object. Useful for testing but slow in practice. + */ + public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(MAX_ALTERNATE_ALLELES_TO_GENOTYPE)); + } + + /** + * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc + * + * @param vc the VariantContext holding the alleles and sample information + * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i) + * @param result a pre-allocated (for efficiency) object to hold the result of the calculation + * @return result (for programming convenience) + */ + public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); + if ( result == null ) throw new IllegalArgumentException("Results object cannot be null"); + + // reset the result, so we can store our new result there + result.reset(); + + final VariantContext vcWorking = reduceScope(vc); + + callTimer.start(); + computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors, result); + final long nanoTime = callTimer.getElapsedTimeNano(); + + if ( callReport != null ) + printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result.getLog10PosteriorOfAFzero()); + + result.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return result; + } + + // --------------------------------------------------------------------------- + // + // Abstract methods that should be implemented by concrete implementations + // to actually calculate the AF + // + // --------------------------------------------------------------------------- + + /** + * Look at VC and perhaps return a new one of reduced complexity, if that's necessary + * + * Used before the call to computeLog10PNonRef to simply the calculation job at hand, + * if vc exceeds bounds. For example, if VC has 100 alt alleles this function + * may decide to only genotype the best 2 of them. + * + * @param vc the initial VC provided by the caller to this AFcalculation + * @return a potentially simpler VC that's more tractable to genotype + */ + @Requires("vc != null") + @Ensures("result != null") + protected abstract VariantContext reduceScope(final VariantContext vc); + + /** + * Actually carry out the log10PNonRef calculation on vc, storing results in results + * + * @param vc variant context with alleles and genotype likelihoods + * @param log10AlleleFrequencyPriors priors + * @param result (pre-allocated) object to store results + */ + // TODO -- add consistent requires among args + protected abstract void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result); + + /** + * Must be overridden by concrete subclasses + * + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + protected abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); + + // --------------------------------------------------------------------------- + // + // Print information about the call to the calls log + // + // --------------------------------------------------------------------------- + + private void initializeOutputFile(final File outputFile) { + try { + if (outputFile != null) { + callReport = new PrintStream( new FileOutputStream(outputFile) ); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + private void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final double log10PosteriorOfAFzero) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for ( final Allele a : vc.getAlleles() ) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for ( final Genotype g : vc.getGenotypes() ) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ ) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero); + + callReport.flush(); + } + + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index c93e780bf..aabca9bcb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import com.google.java.contract.Ensures; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -35,9 +38,10 @@ import java.util.Arrays; * Date: Dec 14, 2011 * * Useful helper class to communicate the results of the allele frequency calculation + * + * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? */ public class AlleleFrequencyCalculationResult { - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles private double log10MLE; private double log10MAP; @@ -54,21 +58,88 @@ public class AlleleFrequencyCalculationResult { private double log10LikelihoodOfAFzero; private double log10PosteriorOfAFzero; + int nEvaluations = 0; + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + alleleCountsOfMLE = new int[maxAltAlleles]; alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); } + /** + * Get the log10 value of the probability mass at the MLE + * + * @return a log10 prob + */ + @Ensures("goodLog10Value(result)") public double getLog10MLE() { return log10MLE; } + /** + * Get the log10 value of the probability mass at the max. a posterior (MAP) + * + * @return a log10 prob + */ + @Ensures("goodLog10Value(result)") public double getLog10MAP() { return log10MAP; } + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + * + * @return a vector with allele counts, not all of which may be meaningful + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MAP + * + * @see #getAlleleCountsOfMLE() for the encoding of results in this vector + * + * @return a non-null vector of ints + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } + + /** + * Returns the number of cycles used to evaluate the pNonRef for this AF calculation + * + * @return the number of evaluations required to produce the answer for this AF calculation + */ + public int getnEvaluations() { + return nEvaluations; + } + + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10PosteriorsMatrixSumWithoutAFzero() { if ( log10PosteriorMatrixSum == null ) { log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); @@ -76,33 +147,99 @@ public class AlleleFrequencyCalculationResult { return log10PosteriorMatrixSum; } - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10LikelihoodOfAFzero() { return log10LikelihoodOfAFzero; } + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } - public void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + /** + * Get the list of alleles actually used in genotyping. + * + * Due to computational / implementation constraints this may be smaller than + * the actual list of alleles requested + * + * @return a non-empty list of alleles used during genotyping + */ + @Ensures({"result != null", "! result.isEmpty()"}) + public List getAllelesUsedInGenotyping() { + if ( allelesUsedInGenotyping == null ) + throw new IllegalStateException("allelesUsedInGenotyping requested but not yet set"); + + return allelesUsedInGenotyping; + } + + /** + * Get the normalized -- across all AFs -- of AC == 0, NOT LOG10 + * @return + */ + // TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc. + // TODO -- we should own these values in a more meaningful way and return good values in the case + // TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful +// @Ensures({"result >= 0.0", "result <= 1.0"}) + public double getNormalizedPosteriorOfAFzero() { + return getNormalizedPosteriors()[0]; + } + + /** + * Get the normalized -- across all AFs -- of AC > 0, NOT LOG10 + * @return + */ + // TODO -- this ensure cannot be enabled right now because the log10 inputs can be infinity, etc. + // TODO -- we should own these values in a more meaningful way and return good values in the case + // TODO -- where this happens, or instead thrown an error and have a function to say "was this calculation successful + //@Ensures({"result >= 0.0", "result <= 1.0"}) + public double getNormalizedPosteriorOfAFGTZero() { + return getNormalizedPosteriors()[1]; + } + + private double[] getNormalizedPosteriors() { + final double[] posteriors = new double[]{ getLog10PosteriorOfAFzero(), getLog10PosteriorsMatrixSumWithoutAFzero() }; + return MathUtils.normalizeFromLog10(posteriors); + } + + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculation.VALUE_NOT_CALCULATED; for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { alleleCountsOfMLE[i] = 0; alleleCountsOfMAP[i] = 0; } currentPosteriorsCacheIndex = 0; log10PosteriorMatrixSum = null; + allelesUsedInGenotyping = null; } - public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; + } + + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { if ( log10LofK > log10MLE ) { log10MLE = log10LofK; for ( int i = 0; i < alleleCountsForK.length; i++ ) @@ -110,7 +247,7 @@ public class AlleleFrequencyCalculationResult { } } - public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToPosteriorsCache(log10LofK); if ( log10LofK > log10MAP ) { @@ -132,7 +269,7 @@ public class AlleleFrequencyCalculationResult { } } - public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { log10MLE = log10LikelihoodOfAFzero; @@ -140,11 +277,22 @@ public class AlleleFrequencyCalculationResult { } } - public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero; Arrays.fill(alleleCountsOfMAP, 0); } } + + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; + } + + private static boolean goodLog10Value(final double result) { + return result <= 0.0 || Double.isInfinite(result) || Double.isNaN(result); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java index ba7f0f622..4e449a8bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java @@ -32,41 +32,54 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; -public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - +public class DiploidExactAFCalculation extends ExactAFCalculation { // private final static boolean DEBUG = false; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + public DiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { + super(nSamples, maxAltAlleles, false, null, null, null); + } + + /** + * Dynamically found in UnifiedGenotyperEngine + * + * @param UAC + * @param N + * @param logger + * @param verboseWriter + */ + public DiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result); + } + @Override + protected VariantContext reduceScope(final VariantContext vc) { final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; // don't try to genotype too many alternate alleles if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); alleles.add(vc.getReference()); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); - GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; } - - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); - - return alleles; } - private static final int PL_INDEX_OF_HOM_REF = 0; private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); @@ -134,6 +147,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // keep processing while we have AC conformations that need to be calculated MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen(); while ( !ACqueue.isEmpty() ) { + result.incNEvaluations(); // keep track of the number of evaluations + // compute log10Likelihoods final ExactACset set = ACqueue.remove(); final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java similarity index 71% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java index 569cd7072..2dea9e951 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java @@ -30,40 +30,23 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.List; /** - * The model representing how we calculate a genotype given the priors and a pile - * of bases and quality scores + * Uses the Exact calculation of Heng Li */ -public abstract class AlleleFrequencyCalculationModel implements Cloneable { - - public enum Model { - /** The default model with the best performance in all cases */ - EXACT +abstract class ExactAFCalculation extends AlleleFrequencyCalculation { + protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { + super(UAC, nSamples, logger, verboseWriter); } - protected int N; - protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - - protected Logger logger; - protected PrintStream verboseWriter; - - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { - this.N = N; - this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; - this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.logger = logger; - this.verboseWriter = verboseWriter; + protected ExactAFCalculation(final int nSamples, int maxAltAlleles, boolean capMaxAltsForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) { + super(nSamples, maxAltAlleles, capMaxAltsForIndels, exactCallsLog, logger, verboseWriter); } /** @@ -102,31 +85,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { return genotypeLikelihoods; } - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param log10AlleleFrequencyPriors priors - * @param result (pre-allocated) object to store likelihoods results - * @return the alleles used for genotyping - */ - protected abstract List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result); - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes - * @param ploidy - * @return GenotypesContext object - */ - protected abstract GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy); - - // ------------------------------------------------------------------------------------- // // protected classes used to store exact model matrix columns diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java new file mode 100755 index 000000000..2b3b517ce --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OptimizedDiploidExactAFCalculation.java @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.PrintStream; +import java.util.*; + +public class OptimizedDiploidExactAFCalculation extends ExactAFCalculation { + // private final static boolean DEBUG = false; + + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + + public OptimizedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { + super(nSamples, maxAltAlleles, false, null, null, null); + } + + /** + * Dynamically found in UnifiedGenotyperEngine + * + * @param UAC + * @param N + * @param logger + * @param verboseWriter + */ + public OptimizedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + super(UAC, N, logger, verboseWriter); + } + + @Override + public void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + linearExactMultiAllelic(vc.getGenotypes(), vc.getNAlleles() - 1, log10AlleleFrequencyPriors, result); + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { + logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes()); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + if ( alleles.alleleIndex1 != 0 ) + likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + + + // ------------------------------------------------------------------------------------- + // + // Multi-allelic implementation. + // + // ------------------------------------------------------------------------------------- + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen(); + while ( !ACqueue.isEmpty() ) { + result.incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); + + // adjust max likelihood seen if needed + if ( log10LofKs > maxLikelihoodSeen.maxLog10L ) + maxLikelihoodSeen.update(log10LofKs, set.ACcounts); + + // clean up memory + indexesToACset.remove(set.ACcounts); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private static double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final MaxLikelihoodSeen maxLikelihoodSeen, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // can we abort early because the log10Likelihoods are so small? + if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.ACcounts.getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private static void updateACset(final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private static void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + set.log10Likelihoods[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) + set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + } + + double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // update the MLE if necessary + result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + + // apply the priors over each alternate allele + for ( final int ACcount : set.ACcounts.getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); + } + + private static void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; + targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); + } + } + } + + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + } + + // ------------------------------------------------------------------------------------- + // + // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. + // + // ------------------------------------------------------------------------------------- + + /** + * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors + * for the exact model calculation + */ +/* + private final static class ExactACCache { + double[] kMinus2, kMinus1, kMinus0; + + private final static double[] create(int n) { + return new double[n]; + } + + public ExactACCache(int n) { + kMinus2 = create(n); + kMinus1 = create(n); + kMinus0 = create(n); + } + + final public void rotate() { + double[] tmp = kMinus2; + kMinus2 = kMinus1; + kMinus1 = kMinus0; + kMinus0 = tmp; + } + + final public double[] getkMinus2() { + return kMinus2; + } + + final public double[] getkMinus1() { + return kMinus1; + } + + final public double[] getkMinus0() { + return kMinus0; + } + } + + public int linearExact(GenotypesContext GLs, + double[] log10AlleleFrequencyPriors, + double[][] log10AlleleFrequencyLikelihoods, + double[][] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + final ExactACCache logY = new ExactACCache(numSamples+1); + logY.getkMinus0()[0] = 0.0; // the zero case + + double maxLog10L = Double.NEGATIVE_INFINITY; + boolean done = false; + int lastK = -1; + + for (int k=0; k <= numChr && ! done; k++ ) { + final double[] kMinus0 = logY.getkMinus0(); + + if ( k == 0 ) { // special case for k = 0 + for ( int j=1; j <= numSamples; j++ ) { + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; + } + } else { // k > 0 + final double[] kMinus1 = logY.getkMinus1(); + final double[] kMinus2 = logY.getkMinus2(); + + for ( int j=1; j <= numSamples; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + double aa = Double.NEGATIVE_INFINITY; + double ab = Double.NEGATIVE_INFINITY; + if (k < 2*j-1) + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; + + if (k < 2*j) + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; + + double log10Max; + if (k > 1) { + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; + log10Max = approximateLog10SumLog10(aa, ab, bb); + } else { + // we know we aren't considering the BB case, so we can use an optimized log10 function + log10Max = approximateLog10SumLog10(aa, ab); + } + + // finally, update the L(j,k) value + kMinus0[j] = log10Max - logDenominator; + } + } + + // update the posteriors vector + final double log10LofK = kMinus0[numSamples]; + log10AlleleFrequencyLikelihoods[0][k] = log10LofK; + log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; + + // can we abort early? + lastK = k; + maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + done = true; + } + + logY.rotate(); + } + + return lastK; + } + + final static double approximateLog10SumLog10(double a, double b, double c) { + return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); + } +*/ + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 30c0f3e18..9b80d6266 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -41,7 +41,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + protected AlleleFrequencyCalculation.Model AFmodel = AlleleFrequencyCalculation.Model.EXACT; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -186,7 +186,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); @@ -224,6 +223,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.minReferenceDepth = minReferenceDepth; uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; + uac.exactCallsLog = exactCallsLog; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; @@ -242,5 +242,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.exactCallsLog = SCAC.exactCallsLog; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 0d1997252..30a1439e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2"); } - if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL) + if (UAC.AFmodel != AlleleFrequencyCalculation.Model.POOL) throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 469d63b8a..609d2d731 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -78,11 +78,10 @@ public class UnifiedGenotyperEngine { private ThreadLocal> glcm = new ThreadLocal>(); // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); + private ThreadLocal afcm = new ThreadLocal(); // the allele frequency likelihoods and posteriors (allocated once as an optimization) private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); - private ThreadLocal posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[] log10AlleleFrequencyPriorsSNPs; @@ -357,7 +356,6 @@ public class UnifiedGenotyperEngine { if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); - posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -370,8 +368,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - AFresult.reset(); - List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); + afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? boolean bestGuessIsRef = true; @@ -382,7 +379,7 @@ public class UnifiedGenotyperEngine { myAlleles.add(vc.getReference()); for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { final Allele alternateAllele = vc.getAlternateAllele(i); - final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); + final int indexOfAllele = AFresult.getAllelesUsedInGenotyping().indexOf(alternateAllele); // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; @@ -403,16 +400,16 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); - final double PofF = 1.0 - normalizedPosteriors[0]; + final double PoFEq0 = AFresult.getNormalizedPosteriorOfAFzero(); + final double PoFGT0 = AFresult.getNormalizedPosteriorOfAFGTZero(); double phredScaledConfidence; if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFEq0); if ( Double.isInfinite(phredScaledConfidence) ) phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PoFGT0); if ( Double.isInfinite(phredScaledConfidence) ) { final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); @@ -423,7 +420,7 @@ public class UnifiedGenotyperEngine { if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF); + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC @@ -439,7 +436,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -477,7 +474,6 @@ public class UnifiedGenotyperEngine { // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); @@ -486,7 +482,6 @@ public class UnifiedGenotyperEngine { // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); @@ -524,13 +519,7 @@ public class UnifiedGenotyperEngine { vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - } - - public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - return MathUtils.normalizeFromLog10(normalizedPosteriors); + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -754,32 +743,34 @@ public class UnifiedGenotyperEngine { return glcm; } - private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { + private static AlleleFrequencyCalculation getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { - List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); + List> afClasses = new PluginManager(AlleleFrequencyCalculation.class).getPlugins(); // user-specified name - String afModelName = UAC.AFmodel.name(); + String afModelName = UAC.AFmodel.implementationName; if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) afModelName = GPSTRING + afModelName; + else + afModelName = "Diploid" + afModelName; for (int i = 0; i < afClasses.size(); i++) { - Class afClass = afClasses.get(i); + Class afClass = afClasses.get(i); String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); if (afModelName.equalsIgnoreCase(key)) { try { Object args[] = new Object[]{UAC,N,logger,verboseWriter}; Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); - return (AlleleFrequencyCalculationModel)c.newInstance(args); + return (AlleleFrequencyCalculation)c.newInstance(args); } catch (Exception e) { - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel); } } } - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); + throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculation " + UAC.AFmodel); } public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 3e48520a7..cbc4c4401 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; -import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.TreeSet; @@ -51,7 +51,7 @@ public class GLBasedSampleSelector extends SampleSelector { flatPriors = new double[1+2*samples.size()]; } AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); + DiploidExactAFCalculation.linearExactMultiAllelic(subContext.getGenotypes(), vc.getAlternateAlleles().size(), flatPriors, result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 37fc96681..b7ef85a04 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -7,19 +9,19 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; @@ -30,6 +32,7 @@ import java.util.*; * produces a binary ped file in individual major mode. */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=100)) public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -78,8 +81,6 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; - private ValidateVariants vv = new ValidateVariants(); - private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static final byte HOM_REF = 0x0; @@ -89,6 +90,8 @@ public class VariantsToBinaryPed extends RodWalker { private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples + private static final String PLINK_DELETION_MARKER = "-"; + // note that HET and NO_CALL are flipped from the documentation: that's because // plink actually reads these in backwards; and we want to use a shift operator // to put these in the appropriate location @@ -101,7 +104,6 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - initializeValidator(); writeBedHeader(); Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers @@ -150,22 +152,25 @@ public class VariantsToBinaryPed extends RodWalker { } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ! tracker.hasValues(variantCollection.variants) || - tracker.getFirstValue(variantCollection.variants).isFiltered() || - ! tracker.getFirstValue(variantCollection.variants).isSNP() || - ! tracker.getFirstValue(variantCollection.variants).isBiallelic()) { + if ( tracker == null ) { + return 0; + } + + VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation()); + if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) { return 0; } try { - vv.map(tracker,ref,context); - } catch (UserException e) { + validateVariantSite(vc,ref,context); + } catch (TribbleException e) { throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ - "Please run ValidateVariants for more detailed information."); + "Please run ValidateVariants for more detailed information. This error is: "+e.getMessage()); } - VariantContext vc = tracker.getFirstValue(variantCollection.variants); String refOut; String altOut; + String vcRef = getReferenceAllele(vc); + String vcAlt = getAlternateAllele(vc); boolean altMajor; if ( majorAlleleFirst ) { // want to use the major allele as ref @@ -174,17 +179,17 @@ public class VariantsToBinaryPed extends RodWalker { VariantContextUtils.calculateChromosomeCounts(vc,ats,true); } if ( getAF(ats.get("AF")) > 0.5 ) { - refOut = vc.getAlternateAllele(0).getBaseString(); - altOut = vc.getReference().getBaseString(); + refOut = vcAlt; + altOut = vcRef; altMajor = true; } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } // write an entry into the map file @@ -286,8 +291,8 @@ public class VariantsToBinaryPed extends RodWalker { private byte getStandardEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { - b = NO_CALL; + if ( ! checkGQIsGood(g) ) { + b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_REF; } else if ( g.isHomVar() ) { @@ -322,7 +327,8 @@ public class VariantsToBinaryPed extends RodWalker { if ( genotype.hasGQ() ) { return genotype.getGQ() >= minGenotypeQuality; } else if ( genotype.hasLikelihoods() ) { - return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()); + return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality; } return false; @@ -346,13 +352,6 @@ public class VariantsToBinaryPed extends RodWalker { } } - private void initializeValidator() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; - } - private void writeBedHeader() { // write magic bits into the ped file try { @@ -410,4 +409,53 @@ public class VariantsToBinaryPed extends RodWalker { return metaValues; } + + private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) { + final Allele reportedRefAllele = vc.getReference(); + final int refLength = reportedRefAllele.length(); + if ( refLength > 100 ) { + logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart())); + return; + } + + final byte[] observedRefBases = new byte[refLength]; + System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength); + final Allele observedRefAllele = Allele.create(observedRefBases); + vc.validateReferenceBases(reportedRefAllele, observedRefAllele); + vc.validateAlternateAlleles(); + } + + private String getReferenceAllele(VariantContext vc) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1. + return "1"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getReference().getBaseString().substring(1); + } + // snp or mnp + return vc.getReference().getBaseString(); + } + + private String getAlternateAllele(VariantContext vc ) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getAlternateAllele(0).getBaseString().substring(1); + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2. + return "2"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + // snp or mnp + return vc.getAlternateAllele(0).getBaseString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 74b038032..f4a200af0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -236,6 +236,33 @@ public class Utils { } } + public static List append(final List left, T ... elts) { + final List l = new LinkedList(left); + l.addAll(Arrays.asList(elts)); + return l; + } + + /** + * Returns a string of the values in joined by separator, such as A,B,C + * + * @param separator + * @param doubles + * @return + */ + public static String join(String separator, double[] doubles) { + if ( doubles == null || doubles.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(doubles[0]); + for (int i = 1; i < doubles.length; ++i) { + ret.append(separator); + ret.append(doubles[i]); + } + return ret.toString(); + } + } + /** * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of * elti objects (note there's no actual space between sep and the elti elements). Returns diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java new file mode 100644 index 000000000..617391714 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.collections; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.PrintStream; + +/** + * Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put()) + * to the provided output stream. For testing/debugging purposes. + * + * Log entries are of the following form (fields are tab-separated): + * LABEL VALUE KEY1 KEY2 ... KEY_N + * + * @author David Roazen + */ +public class LoggingNestedIntegerArray extends NestedIntegerArray { + + private PrintStream log; + private String logEntryLabel; + + /** + * + * @param log output stream to which to log update operations + * @param logEntryLabel String that should be prefixed to each log entry + * @param dimensions + */ + public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) { + super(dimensions); + + if ( log == null ) { + throw new ReviewedStingException("Log output stream must not be null"); + } + this.log = log; + this.logEntryLabel = logEntryLabel != null ? logEntryLabel : ""; + } + + @Override + public void put( final T value, final int... keys ) { + super.put(value, keys); + + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(value); + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe + log.println(logEntry.toString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index afc8f5065..0dd510245 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import java.io.PrintStream; + /** * Utility class to facilitate on-the-fly base quality score recalibration. * @@ -52,19 +55,31 @@ public class RecalibrationTables { private final NestedIntegerArray[] tables; public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1); + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null); + } + + public RecalibrationTables(final Covariate[] covariates, final PrintStream log) { + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log); } public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { + this(covariates, numReadGroups, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { tables = new NestedIntegerArray[covariates.length]; final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; final int eventDimension = EventType.values().length; - tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray(numReadGroups, eventDimension); - tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray(numReadGroups, qualDimension, eventDimension); + tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension); + tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) : + new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++) - tables[i] = new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); + tables[i] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); } public NestedIntegerArray getReadGroupTable() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java deleted file mode 100644 index 0731d3fd8..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ /dev/null @@ -1,127 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - - -public class ExactAFCalculationModelUnitTest extends BaseTest { - - static double[] AA1, AB1, BB1; - static double[] AA2, AB2, AC2, BB2, BC2, CC2; - static final int numSamples = 3; - static double[] priors = new double[2*numSamples+1]; // flat priors - - @BeforeSuite - public void before() { - AA1 = new double[]{0.0, -20.0, -20.0}; - AB1 = new double[]{-20.0, 0.0, -20.0}; - BB1 = new double[]{-20.0, -20.0, 0.0}; - AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; - AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; - AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; - BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; - BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; - CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; - } - - private class GetGLsTest extends TestDataProvider { - GenotypesContext GLs; - int numAltAlleles; - String name; - - private GetGLsTest(String name, int numAltAlleles, Genotype... arg) { - super(GetGLsTest.class, name); - GLs = GenotypesContext.create(arg); - this.name = name; - this.numAltAlleles = numAltAlleles; - } - - public String toString() { - return String.format("%s input=%s", super.toString(), GLs); - } - } - - private static Genotype createGenotype(String name, double[] gls) { - return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make(); - } - - @DataProvider(name = "getGLs") - public Object[][] createGLsData() { - - // bi-allelic case - new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); - new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); - new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); - new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); - new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); - new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); - new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); - new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); - - // tri-allelic case - new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); - new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); - new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); - new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); - - return GetGLsTest.getTests(GetGLsTest.class); - } - - - @Test(dataProvider = "getGLs") - public void testGLs(GetGLsTest cfg) { - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int nameIndex = 1; - for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { - int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } - } - - @Test - public void testLargeGLs() { - - final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; - GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; - Assert.assertEquals(calculatedAlleleCount, 6); - } - - @Test - public void testMismatchedGLs() { - - final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0}; - final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0}; - GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1); - Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1f418f736..f3fe63e95 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "7bb6375fddc461c72d44f261f6d4b3c7"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "2104dac76fa2a58a92c72b331c7f2095"); } private void testOutputParameters(final String args, final String md5) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index a75da6cf9..3e59508bc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -52,6 +52,50 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + @Test + public void testNA12878HighGQ() { + String testName = "testNA12878HighGQ"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959") + ); + + executeTest(testName, spec); + } + + @Test + public void testVCFMismatchReference() { + String testName = "testVCFMismatchReference"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void test1000GWithIndels() { + String testName = "test1000GWithIndels"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af") + ); + } + + @Test + public void test1000G_Symbolic() { + String testName = "test1000G_Symbolic"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8") + ); + } + @Test public void testCEUTrio() { String testName = "testCEUTrio"; @@ -112,6 +156,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 0d0fab9d1..d0379d022 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging { private def createQueueHeader() : Seq[String] = { Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), "Copyright (c) 2012 The Broad Institute", - "Fro support and documentation go to http://www.broadinstitute.org/gatk") + "For support and documentation go to http://www.broadinstitute.org/gatk") } private def getQueueVersion : String = {