diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index b89f68e24..be8357679 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -28,26 +28,22 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.threading.ThreadLocalArray; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - // optimizations: don't reallocate an array each time - private byte[] tempQualArray; - private boolean[] tempErrorArray; - private double[] tempFractionalErrorArray; + // optimization: only allocate temp arrays once per thread + private final ThreadLocal threadLocalTempQualArray = new ThreadLocalArray(EventType.values().length, byte.class); + private final ThreadLocal threadLocalTempErrorArray = new ThreadLocalArray(EventType.values().length, boolean.class); + private final ThreadLocal threadLocalTempFractionalErrorArray = new ThreadLocalArray(EventType.values().length, double.class); public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { super.initialize(covariates, recalibrationTables); - tempQualArray = new byte[EventType.values().length]; - tempErrorArray = new boolean[EventType.values().length]; - tempFractionalErrorArray = new double[EventType.values().length]; } /** @@ -60,10 +56,13 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp * @param refBase The reference base at this locus */ @Override - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { + public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); + byte[] tempQualArray = threadLocalTempQualArray.get(); + boolean[] tempErrorArray = threadLocalTempErrorArray.get(); + tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual(); tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual(); @@ -77,40 +76,29 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final byte qual = tempQualArray[eventIndex]; final boolean isError = tempErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + // TODO: should this really be combine rather than increment? + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } @Override - public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( !skip[offset] ) { final ReadCovariates readCovariates = covariateKeySetFrom(read); + byte[] tempQualArray = threadLocalTempQualArray.get(); + double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; @@ -124,30 +112,15 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final byte qual = tempQualArray[eventIndex]; final double isError = tempFractionalErrorArray[eventIndex]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(1.0, isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(1.0, isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 63524ae82..32abe8ef6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -276,7 +276,10 @@ public class SlidingWindow { final int windowHeaderStartLocation = getStartLocation(windowHeader); final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; - final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion); + + // copy over as many bits as we can from the previous calculation. Note that we can't trust the + // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. + final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize); // update the iterator to the correct position diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index f76225134..8042c15d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -72,7 +72,7 @@ public class ErrorModel { haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 4c20700ac..2522fc16e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G // find the alternate allele(s) that we should be using final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); - if (alleles == null || alleles.isEmpty()) + if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference())) return null; // start making the VariantContext final GenomeLoc loc = ref.getLocus(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index fc0c526bc..f09a1ea3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); haplotypeMap = new LinkedHashMap(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index cfb67164d..6f3740ab3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -54,7 +54,7 @@ public class AFCalcTestBuilder { } public AFCalc makeModel() { - return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2); + return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2); } public double[] makePriors() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 0e97c090c..b248c8759 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -41,22 +40,20 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); this.ploidy = ploidy; } @Override protected VariantContext reduceScope(VariantContext vc) { - final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > maxAltAlleles) { - logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - final List alleles = new ArrayList(maxAltAlleles + 1); + final List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); VariantContextBuilder builder = new VariantContextBuilder(vc); builder.alleles(alleles); @@ -69,8 +66,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { @Override public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); - return resultFromTracker(vc, log10AlleleFrequencyPriors); + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } /** @@ -171,13 +168,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param resultTracker object to fill with output values */ - protected static void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { final ArrayList genotypeLikelihoods = getGLs(GLs, true); @@ -196,24 +191,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { if ( genotypeLikelihoods.size() <= 1 ) { // no meaningful GLs at all, just set the tracker to non poly values - resultTracker.reset(); // just mimic-ing call below - resultTracker.setLog10LikelihoodOfAFzero(0.0); + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); } else { for (int p=1; p ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(); @@ -230,16 +225,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - StateTracker stateTracker = new StateTracker(); while ( !ACqueue.isEmpty() ) { - resultTracker.incNEvaluations(); + getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset); - - // adjust max likelihood seen if needed - if ( log10LofKs > stateTracker.getMaxLog10L()) - stateTracker.update(log10LofKs, ACset.getACcounts()); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -260,39 +250,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param resultTracker AFResult object - * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood */ - private static double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final AFCalcResultTracker resultTracker, - final StateTracker stateTracker, - final LinkedList ACqueue, - final HashMap indexesToACset) { + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); // add to new pool if (!Double.isInfinite(log10LofK)) newPool.add(set); - // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { - if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); + // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { return log10LofK; } @@ -323,67 +306,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } - /** - * Naive combiner of two multiallelic pools - number of alt alleles must be the same. - * Math is generalization of biallelic combiner. - * - * For vector K representing an allele count conformation, - * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) - * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) - * @param originalPool First log-likelihood pool GL vector - * @param yy Second pool GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param numAlleles Number of alleles - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - */ - public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { -/* - final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); - final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); - - if (dim1 != originalPool.getLength() || dim2 != yy.length) - throw new ReviewedStingException("BUG: Inconsistent vector length"); - - if (ploidy2 == 0) - return; - - final int newPloidy = ploidy1 + ploidy2; - - // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) - // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); - final double[] x = originalPool.getLikelihoodsAsVector(true); - while(firstIterator.hasNext()) { - x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); - firstIterator.next(); - } - - GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - final double[] y = yy.clone(); - while(secondIterator.hasNext()) { - y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); - secondIterator.next(); - } - - // initialize output to -log10(choose(m1+m2,[k1 k2...]) - final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); - - - // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K - while(outputIterator.hasNext()) { - final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); - double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); - - originalPool.add(likelihood, set, outputIterator.getLinearIndex()); - outputIterator.next(); - } -*/ - } +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. +// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } /** * Compute likelihood of a particular AC conformation and update AFresult object @@ -394,15 +377,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alleles (including ref) * @param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param resultTracker AFResult object * @return log-likehood of requested conformation */ - private static double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int numAlleles, final int ploidy1, final int ploidy2, - final AFCalcResultTracker resultTracker) { + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { final int newPloidy = ploidy1 + ploidy2; @@ -420,8 +401,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; set.getLog10Likelihoods()[0] = log10Lof0; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -464,14 +445,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - resultTracker.updateMLEifNeeded(log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); return log10LofK; } @@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { return (sum == ploidy); } - /** - * Combines naively two biallelic pools (of arbitrary size). - * For two pools of size m1 and m2, we can compute the combined likelihood as: - * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) - * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i - * @param newPLVector Second GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - * @return Combined likelihood vector - */ - public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, - final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { - - final int newPloidy = ploidy1 + ploidy2; - - final double[] combinedLikelihoods = new double[1+newPloidy]; - - /** Pre-fill result array and incorporate weights into input vectors - * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) - * and L2(k) = Pr(D|AC2=k) * choose(m2,k) - * equation reduces to - * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) - * which is just plain convolution of L1 and L2 (with pre-existing vector) - */ - - // intialize result vector to -infinity - Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); - - final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); - for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) - x[k] = Double.NEGATIVE_INFINITY; - - final double[] y = newPLVector.clone(); - - - final double log10Lof0 = x[0]+y[0]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - - double maxElement = log10Lof0; - int maxElementIdx = 0; - int[] alleleCounts = new int[1]; - for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { - double[] acc = new double[k+1]; - Arrays.fill(acc,Double.NEGATIVE_INFINITY); - double innerMax = Double.NEGATIVE_INFINITY; - - for (int j=0; j <=k; j++) { - double x1,y1; - - - if (k-j>=0 && k-j < y.length) - y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); - else - continue; - - if (j < x.length) - x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); - else - continue; - - if (Double.isInfinite(x1) || Double.isInfinite(y1)) - continue; - acc[j] = x1 + y1; - if (acc[j] > innerMax) - innerMax = acc[j]; - else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) - break; - } - combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); - maxElementIdx = k; - double maxDiff = combinedLikelihoods[k] - maxElement; - if (maxDiff > 0) - maxElement = combinedLikelihoods[k]; - else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { - break; - } - - alleleCounts[0] = k; - resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); - - - } - - - return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); - } - - /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * including updating the PL's, and assign genotypes accordingly @@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * * @return genotype */ - private static void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { final int numNewAltAlleles = allelesToUse.size() - 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 8738def50..d91df82e2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -217,7 +217,7 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure + if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 71e4f5f8a..a08614deb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; @@ -234,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection simpleUAC = UAC.clone(); + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } @@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + // evaluate each sample's reads against all haplotypes final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 14c1cd59d..e90d89a8c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine { private final boolean DEBUG; private final PairHMM pairHMM; - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { - pairHMM = new PairHMM( noBanded ); + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) { + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + pairHMM = new CachingPairHMM(); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessCachingPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + } + this.constantGCP = constantGCP; DEBUG = debug; } @@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine { X_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2; - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); } } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); @@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, - haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); readCounts[jjj][iii] = readCount; } } @@ -125,12 +139,12 @@ public class LikelihoodCalculationEngine { } private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { - for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ + for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ) { if( b1[iii] != b2[iii] ) { return iii; } } - return b1.length; + return Math.min(b1.length, b2.length); } @Requires({"haplotypes.size() > 0"}) @@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype + bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype // set up the default 1-to-1 haplotype mapping object final ArrayList> haplotypeMapping = new ArrayList>(); for( final Haplotype h : haplotypes ) { @@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } + public static int findReferenceIndex( final List haplotypes ) { + for( final Haplotype h : haplotypes ) { + if( h.isReference() ) { return haplotypes.indexOf(h); } + } + throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); + } + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java new file mode 100644 index 000000000..282db45d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class CachingPairHMM extends OriginalPairHMM { + + double[][] constantMatrix = null; // The cache in the CachingPairHMM + double[][] distanceMatrix = null; // The cache in the CachingPairHMM + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProbLog10(DEFAULT_GCP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GOP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GCP), + 0.0, + 0.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + constantMatrix[l+1][4] = 0.0; + constantMatrix[l+1][5] = 0.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior + + MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0], + XMetricArray[indI - 1][indJ - 1] + constants[1], + YMetricArray[indI - 1][indJ - 1] + constants[1] ); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2], + XMetricArray[indI - 1][indJ] + constants[3]); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4], + YMetricArray[indI][indJ - 1] + constants[5]); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java new file mode 100644 index 000000000..d2aef5bb5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class LoglessCachingPairHMM extends CachingPairHMM { + + protected static final double SCALE_FACTOR_LOG10 = 300.0; + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProb(DEFAULT_GCP), + QualityUtils.qualToErrorProb(DEFAULT_GOP), + QualityUtils.qualToErrorProb(DEFAULT_GCP), + 1.0, + 1.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], 0.0); + Arrays.fill(XMetricArray[iii], 0.0); + Arrays.fill(YMetricArray[iii], 0.0); + } + + // the initial condition + matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0); + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + constantMatrix[l+1][4] = 1.0; + constantMatrix[l+1][5] = 1.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + + XMetricArray[indI - 1][indJ - 1] * constants[1] + + YMetricArray[indI - 1][indJ - 1] * constants[1] ); + XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; + YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 26ee78484..90d67d393 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -73,11 +73,6 @@ public class BQSRIntegrationTest extends WalkerTest { params.getCommandLine(), Arrays.asList(params.md5)); executeTest("testBQSR-"+params.args, spec).getFirst(); - - WalkerTestSpec specNT2 = new WalkerTestSpec( - params.getCommandLine() + " -nt 2", - Arrays.asList(params.md5)); - executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); } @Test diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 989f06ec5..ae3befe66 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -60,27 +60,27 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9acfe0019efdc91217ee070acb071228"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c1d4dd793f61710a1b1fc5d82803210f"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 1070642e9..cbe2eb268 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -56,6 +56,11 @@ public class AFCalcResultUnitTest extends BaseTest { tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); } + // test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost + for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) { + tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})}); + } + return tests.toArray(new Object[][]{}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 25df0f6d2..2d346e548 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest { new GetGLsTest(model, 1, genotypes, priors, priorName); // tri-allelic - if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) ) for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) new GetGLsTest(model, 2, genotypes, priors, priorName); } @@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest { return GetGLsTest.getTests(GetGLsTest.class); } - @DataProvider(name = "badGLs") - public Object[][] createBadGLs() { - final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); - final int nSamples = genotypes.size(); +// @DataProvider(name = "badGLs") +// public Object[][] createBadGLs() { +// final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); +// final int nSamples = genotypes.size(); +// +// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); +// +// final int nPriorValues = 2*nSamples+1; +// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors +// for ( AFCalc model : Arrays.asList(indCalc) ) { +// final String priorName = "flat"; +// new GetGLsTest(model, 2, genotypes, priors, priorName); +// } +// +// return GetGLsTest.getTests(GetGLsTest.class); +// } - final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); - - final int nPriorValues = 2*nSamples+1; - final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - for ( AFCalc model : Arrays.asList(indCalc) ) { - final String priorName = "flat"; - new GetGLsTest(model, 2, genotypes, priors, priorName); - } - - return GetGLsTest.getTests(GetGLsTest.class); - } +// +// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") +// public void testBadGLs(GetGLsTest cfg) { +// testResultSimple(cfg); +// } @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") public void testBiallelicGLs(GetGLsTest cfg) { @@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true, dataProvider = "badGLs") - public void testBadGLs(GetGLsTest cfg) { - testResultSimple(cfg); - } - private static class NonInformativeData { final Genotype nonInformative; final List called; @@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors for ( AFCalc model : calcs ) { + if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) + continue; + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); for ( int rotation = 0; rotation < nSamples; rotation++ ) { @@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest { "Actual pNonRef not within tolerance " + tolerance + " of expected"); } + @DataProvider(name = "PNonRefBiallelicSystematic") + public Object[][] makePNonRefBiallelicSystematic() { + List tests = new ArrayList(); + + final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000); + final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); + + for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) { + + if ( false ) { // for testing only + tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))}); + } else { + if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK + + // test all combinations of PLs for 1 sample + for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + + + final List> smallDiploidPLs = new LinkedList>(); + for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) { + for ( int i = 0; i < 2; i++ ) { + List pls = new ArrayList(Collections.nCopies(3, nonRefPL)); + pls.set(i, 0); + smallDiploidPLs.add(pls); + } + } + + for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + final List> removeBadPLs(List> listOfPLs) { + List> clean = new LinkedList>(); + + for ( final List PLs : listOfPLs ) { + int x = PLs.get(0); + boolean bad = false; + for ( int pl1 : PLs ) + if ( pl1 > x ) + bad = true; + else + x = pl1; + if ( ! bad ) clean.add(PLs); + } + + return clean; + } + + private List toGenotypes(final List> PLsPerSample) { + final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List genotypes = new ArrayList(PLsPerSample.size()); + + for ( final List PLs : PLsPerSample ) { + final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3])); + final int min = MathUtils.arrayMin(pls); + for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; + genotypes.add(makePL(nocall, pls)); + } + + return genotypes; + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic") + private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { + //logger.warn("Running " + modelType + " with " + genotypes); + final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human); + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human); + + final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C)); + vcb.genotypes(genotypes); + + final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + final double tolerance = 1e-3; + Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(), + "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE())); + } + // -------------------------------------------------------------------------------- // // Test priors @@ -495,7 +578,7 @@ public class AFCalcUnitTest extends BaseTest { // list of all high-quality models in the system final List models = Arrays.asList( - AFCalcFactory.Calculation.EXACT, + AFCalcFactory.Calculation.getDefaultModel(), AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 48f282901..3df2f7883 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - - final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy); + calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; - -// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele]; Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a8ea4b7da..71f03ea00 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75013fa6a884104f0b1797502b636698"); + HCTest(CEUTRIO_BAM, "", "1d826f852ec1457efbfa7ee828c5783a"); } @Test @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "3cd3363976b1937d801f9f82996f4abe"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "30598abeeb0b0ae5816ffdbf0c4044fd"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,13 +64,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @@ -94,6 +94,4 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f")); executeTest("HC calling on a ReducedRead BAM", spec); } - - } diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 56% rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 22bcb1bbf..6281054b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -23,24 +23,26 @@ */ // our package -package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.pairhmm; // the imports for unit testing. - import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; - PairHMM hmm = new PairHMM( false ); // reference implementation - PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation + PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation + PairHMM cachingHMM = new CachingPairHMM(); + PairHMM loglessHMM = new LoglessCachingPairHMM(); // -------------------------------------------------------------------------------- // @@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } @@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest { } public double expectedLogL() { - return expectedQual / -10.0; + return (expectedQual / -10.0) + 0.03 ; } - public double tolerance() { - return 0.1; // TODO FIXME arbitrary + public double toleranceFromTheoretical() { + return 0.2; } - public double calcLogL() { + public double toleranceFromReference() { + return 1E-4; + } - double logL = hmm.computeReadLikelihoodGivenHaplotype( + public double toleranceFromExact() { + return 1E-9; + } + + public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, - qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), - qualAsBytes(gcp, false)); - - return logL; + qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), + qualAsBytes(gcp, false, anchorIndel), 0, true); } private final byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } - private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { final byte phredQuals[] = new byte[readBasesWithContext.length]; - // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM - Arrays.fill(phredQuals, (byte)100); - // update just the bases corresponding to the provided micro read with the quality scores - if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; - } else { - for ( int i = 0; i < read.length(); i++) - phredQuals[i + CONTEXT.length()] = (byte)phredQual; - } + if( anchorIndel ) { + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); - return phredQuals; - } - } - - final Random random = new Random(87865573); - private class BandedLikelihoodTestProvider extends TestDataProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; - final int baseQual, insQual, delQual, gcp; - final int expectedQual; - final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; - final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; - final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; - final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; - final byte[] baseQuals, insQuals, delQuals, gcps; - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); - } - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); - this.baseQual = baseQual; - this.delQual = delQual; - this.insQual = insQual; - this.gcp = gcp; - this.read = read; - this.ref = ref; - this.expectedQual = expectedQual; - - refBasesWithContext = asBytes(ref, left, right); - readBasesWithContext = asBytes(read, false, false); - baseQuals = qualAsBytes(baseQual); - insQuals = qualAsBytes(insQual); - delQuals = qualAsBytes(delQual); - gcps = qualAsBytes(gcp, false); - } - - public double expectedLogL() { - double logL = hmm.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - public double tolerance() { - return 0.2; // TODO FIXME arbitrary - } - - public double calcLogL() { - - double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); - } - - private byte[] qualAsBytes(final int phredQual) { - return qualAsBytes(phredQual, true); - } - - private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { - final byte phredQuals[] = new byte[readBasesWithContext.length]; - Arrays.fill(phredQuals, (byte)phredQual); - if(addRandom) { - for( int iii = 0; iii < phredQuals.length; iii++) { - phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; } + } else { + Arrays.fill(phredQuals, (byte)phredQual); } + return phredQuals; } } @@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest { // test all combinations final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { @@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest { for ( boolean insertionP : Arrays.asList(true, false)) { final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); + final String big = Utils.dupString((char) base, size); final String ref = insertionP ? small : big; final String read = insertionP ? big : small; @@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest { return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); - double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); - } - - @DataProvider(name = "BandedLikelihoodTestProvider") - public Object[][] makeBandedLikelihoodTests() { + final Random random = new Random(87860573); + @DataProvider(name = "OptimizedLikelihoodTestProvider") + public Object[][] makeOptimizedLikelihoodTests() { // context on either side is ACGTTGCA REF ACGTTGCA // test all combinations - final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); - final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { - - // test substitutions - for ( final byte refBase : BaseUtils.BASES ) { - for ( final byte readBase : BaseUtils.BASES ) { - final String ref = new String(new byte[]{refBase}); - final String read = new String(new byte[]{readBase}); - final int expected = refBase == readBase ? 0 : baseQual; - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - } - } - - // test insertions and deletions - for ( final int size : sizes ) { - for ( final byte base : BaseUtils.BASES ) { - final int expected = indelQual + (size - 2) * gcp; - - for ( boolean insertionP : Arrays.asList(true, false)) { - final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); - - final String ref = insertionP ? small : big; - final String read = insertionP ? big : small; - - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + for ( final int refSize : sizes ) { + for ( final int readSize : sizes ) { + String ref = ""; + String read = ""; + for( int iii = 0; iii < refSize; iii++) { + ref += (char) BaseUtils.BASES[random.nextInt(4)]; } + for( int iii = 0; iii < readSize; iii++) { + read += (char) BaseUtils.BASES[random.nextInt(4)]; + } + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true); } } } } } - return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) - public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, true ); + double calculatedLogL = cfg.calcLogL( originalHMM, true ); + double optimizedLogL = cfg.calcLogL( cachingHMM, true ); + double loglessLogL = cfg.calcLogL( loglessHMM, true ); double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); + } + + @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true) + public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, false ); + double calculatedLogL = cfg.calcLogL( originalHMM, false ); + double optimizedLogL = cfg.calcLogL( cachingHMM, false ); + double loglessLogL = cfg.calcLogL( loglessHMM, false ); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); } @Test @@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); @@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 085a60191..aa3a1e6ac 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -54,22 +55,60 @@ public class StandardCallerArgumentCollection { * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. */ @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 3; + public int MAX_ALTERNATE_ALLELES = 6; /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on */ - @Advanced - @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) - public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + @Deprecated + @Hidden + @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false) + public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1; + + /** + * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Hidden + @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_PERCENTAGE = 0.0; @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + + public StandardCallerArgumentCollection() { } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.exactCallsLog = SCAC.exactCallsLog; + this.AFmodel = SCAC.AFmodel; + } + + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ + @Advanced + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 223e11680..df4ed9ef8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -74,8 +74,6 @@ import java.util.*; * */ public abstract class MicroScheduler implements MicroSchedulerMBean { - // TODO -- remove me and retire non nano scheduled versions of traversals - private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -238,15 +236,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { @Ensures("result != null") private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { if (walker instanceof ReadWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseReads(); + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof LocusWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseLociLinear(); + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof DuplicateWalker) { return new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 43350ccc1..f521c959d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { ? new VariantContextWriterStub(engine, writerFile, argumentSources) : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - stub.setCompressed(isCompressed(writerFileName.asString())); + stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches)); stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches)); stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java deleted file mode 100755 index 30e78ef5c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java +++ /dev/null @@ -1,103 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - - @Override - public final String getTraversalUnits() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - protected abstract TraverseResults traverse( final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum); - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - updateCumulativeMetrics(dataProvider.getShard()); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. - */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java deleted file mode 100755 index 22381092f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.GenomeLoc; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public class TraverseLociLinear extends TraverseLociBase { - - @Override - protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { - // We keep processing while the next reference location is within the interval - boolean done = false; - int numIterations = 0; - - while( locusView.hasNext() && ! done ) { - numIterations++; - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - final M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(locus.getLocation()); - } - - return new TraverseResults(numIterations, sum); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 469625c30..84715e5fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -1,24 +1,26 @@ package org.broadinstitute.sting.gatk.traversals; +import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import java.util.Iterator; /** * A simple solution to iterating over all reference positions over a series of genomic locations. */ -public class TraverseLociNano extends TraverseLociBase { +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { /** our log, which we want to capture anything from this class */ private static final boolean DEBUG = false; @@ -30,6 +32,81 @@ public class TraverseLociNano extends TraverseLociBase { } @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + protected TraverseResults traverse(final LocusWalker walker, final LocusView locusView, final LocusReferenceView referenceView, diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index aef3cf7d0..8273e1328 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -42,7 +42,7 @@ public class TraverseReadPairs extends TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); + logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider)); if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java deleted file mode 100755 index d41d17bde..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * @author aaron - * @version 1.0 - * @date Apr 24, 2009 - *

- * Class TraverseReads - *

- * This class handles traversing by reads in the new shardable style - */ -public class TraverseReads extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraverseReads.class); - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - - // get the reference ordered data - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - - boolean done = walker.isDone(); - // while we still have more reads - for (final SAMRecord read : reads) { - if ( done ) break; - - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference() - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); - if (keepMeP) { - M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null - sum = walker.reduce(x, sum); - } - - final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - - updateCumulativeMetrics(dataProvider.getShard()); - printProgress(locus); - - done = walker.isDone(); - } - return sum; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 2c7a12a36..245c8e3e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -109,7 +109,7 @@ import java.util.ArrayList; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { +public class BaseRecalibrator extends LocusWalker { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates @@ -277,11 +277,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed return sum; } - public Long treeReduce(Long sum1, Long sum2) { - sum1 += sum2; - return sum1; - } - @Override public void onTraversalDone(Long result) { logger.info("Calculating quantized quality scores..."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 4fe9c5323..1f7fbbf42 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -55,7 +55,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param refBase The reference base at this locus */ @Override - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { + public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { final int offset = pileupElement.getOffset(); final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); @@ -65,35 +65,21 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); final int eventIndex = EventType.BASE_SUBSTITUTION.index; - final NestedIntegerArray rgRecalTable = recalibrationTables.getReadGroupTable(); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + // TODO: should this really be combine rather than increment? + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getQualityScoreTable(); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } @Override - public synchronized void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { throw new UnsupportedOperationException("Delocalized BQSR is not available in the GATK-lite version"); } @@ -121,4 +107,133 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected ReadCovariates covariateKeySetFrom(GATKSAMRecord read) { return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE); } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error status for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + // There are three cases here: + // + // 1. The original get succeeded (existingDatum != null) because there already was an item in this position. + // In this case we can just increment the existing item. + // + // 2. The original get failed (existingDatum == null), and we successfully put a new item in this position + // and are done. + // + // 3. The original get failed (existingDatum == null), AND the put fails because another thread comes along + // in the interim and puts an item in this position. In this case we need to do another get after the + // failed put to get the item the other thread put here and increment it. + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(isError); + } + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } + + /** + * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a + * new item there if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error status for this event + * @param keys location in table of our item + */ + protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final boolean isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + final RecalDatum newDatum = createDatumObject(qual, isError); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(newDatum, keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) + table.get(keys).combine(newDatum); + } + } + else { + // Easy case: already an item here, so combine it with our item + existingDatum.combine(newDatum); + } + } + + /** + * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a + * new item there if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + final RecalDatum newDatum = createDatumObject(qual, isError); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(newDatum, keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) + table.get(keys).combine(newDatum); + } + } + else { + // Easy case: already an item here, so combine it with our item + existingDatum.combine(newDatum); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ebfbc49fe..1761f9cec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; @@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 76ba72017..d053e13a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + private final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) + pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); @@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } + public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { + // special case removal of all reads + if ( contaminationPercentage >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() > numReadsToRemove ) + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } + private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( !itemsToRemove.get(i) ) + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + private static class SampleGenotypeData { public final String name; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 885463fcb..e50f25fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,23 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -65,6 +57,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. @@ -112,10 +110,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -183,63 +177,57 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + /** + * Create a new UAC with defaults for all UAC arguments + */ + public UnifiedArgumentCollection() { + super(); + } - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; - uac.exactCallsLog = exactCallsLog; + /** + * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments + * @param scac + */ + public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { + super(scac); + } + + /** + * Create a new UAC with all parameters having the values in uac + * + * @param uac + */ + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + super(uac); + + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.NO_SLOD = uac.NO_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower = uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; - return uac; - } - - public UnifiedArgumentCollection() { } - - public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.exactCallsLog = SCAC.exactCallsLog; + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 07f88c9e3..f783267bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -45,30 +45,46 @@ public abstract class AFCalc implements Cloneable { protected final int nSamples; protected final int maxAlternateAllelesToGenotype; - protected final int maxAlternateAllelesForIndels; protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private final AFCalcResultTracker resultTracker; + private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; - protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + /** + * Create a new AFCalc object capable of calculating the prob. that alleles are + * segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels + * for indels for samples with ploidy + * + * @param nSamples number of samples, must be > 0 + * @param maxAltAlleles maxAltAlleles for SNPs + * @param ploidy the ploidy, must be > 0 + */ + protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy); this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; - this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(maxAltAlleles); } + /** + * Enable exact call logging to file + * + * @param exactCallsLog the destination file + */ public void enableProcessLog(final File exactCallsLog) { exactCallLogger = new ExactCallLogger(exactCallsLog); } + /** + * Use this logger instead of the default logger + * + * @param logger + */ public void setLogger(Logger logger) { this.logger = logger; } @@ -83,10 +99,10 @@ public abstract class AFCalc implements Cloneable { public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); - if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); // reset the result, so we can store our new result there - resultTracker.reset(); + stateTracker.reset(); final VariantContext vcWorking = reduceScope(vc); @@ -100,10 +116,20 @@ public abstract class AFCalc implements Cloneable { return result; } - @Deprecated - protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { - resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); - return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() >= 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); } // --------------------------------------------------------------------------- @@ -134,11 +160,13 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors priors * @return a AFCalcResult object describing the results of this calculation */ - // TODO -- add consistent requires among args + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors); /** + * Subset VC to the just allelesToUse, updating genotype likelihoods + * * Must be overridden by concrete subclasses * * @param vc variant context with alleles and genotype likelihoods @@ -159,11 +187,11 @@ public abstract class AFCalc implements Cloneable { // --------------------------------------------------------------------------- public int getMaxAltAlleles() { - return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); + return maxAlternateAllelesToGenotype; } - public AFCalcResultTracker getResultTracker() { - return resultTracker; + protected StateTracker getStateTracker() { + return stateTracker; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 7d67815cf..efb16101e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -24,15 +24,12 @@ public class AFCalcFactory { * the needs of the request (i.e., considering ploidy). */ public enum Calculation { - /** The default implementation */ - EXACT(ReferenceDiploidExactAFCalc.class, 2, -1), - - /** reference implementation of multi-allelic EXACT model */ - EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), + /** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */ + EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), + /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), @@ -60,6 +57,8 @@ public class AFCalcFactory { return (requiredPloidy == -1 || requiredPloidy == requestedPloidy) && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles); } + + public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; } } private static final Map> afClasses; @@ -92,7 +91,7 @@ public class AFCalcFactory { public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger) { - final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS); + final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES; if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) { logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option"); final List supportingCalculations = new LinkedList(); @@ -110,7 +109,7 @@ public class AFCalcFactory { logger.info("Selecting model " + UAC.AFmodel); } - final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy); + final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy); if ( logger != null ) calc.setLogger(logger); if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); @@ -127,7 +126,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final int nSamples) { - return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2); + return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2); } /** @@ -140,7 +139,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) { - return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2); + return createAFCalc(calc, nSamples, maxAltAlleles, 2); } /** @@ -148,14 +147,12 @@ public class AFCalcFactory { * * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy); } /** @@ -182,20 +179,17 @@ public class AFCalcFactory { * @param calc the calculation to use * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) { if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null"); if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy); - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - if ( ! calc.usableForParams(ploidy, maxAlt) ) + if ( ! calc.usableForParams(ploidy, maxAltAlleles) ) throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy); final Class afClass = getClassByName(calc.className); @@ -203,19 +197,19 @@ public class AFCalcFactory { throw new IllegalArgumentException("Unexpected AFCalc " + calc); try { - Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy}; - Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class); + Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy}; + Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class); return (AFCalc)c.newInstance(args); } catch (Exception e) { throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e); } } - protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int ploidy) { final List AFCalcs = new LinkedList(); for ( final Calculation calc : calcs ) - AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy)); + AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy)); return AFCalcs; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 7cacb2060..a65772444 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -83,8 +83,8 @@ public class AFCalcResult { if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); - if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); this.alleleCountsOfMLE = alleleCountsOfMLE; this.nEvaluations = nEvaluations; @@ -147,7 +147,7 @@ public class AFCalcResult { * Due to computational / implementation constraints this may be smaller than * the actual list of alleles requested * - * @return a non-empty list of alleles used during genotyping + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele */ @Ensures({"result != null", "! result.isEmpty()"}) public List getAllelesUsedInGenotyping() { @@ -159,7 +159,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFEq0() { return log10PosteriorsOfAC[AF0]; } @@ -169,7 +169,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFGT0() { return log10PosteriorsOfAC[AF1p]; } @@ -179,7 +179,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFEq0() { return log10LikelihoodsOfAC[AF0]; } @@ -189,7 +189,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFGT0() { return log10LikelihoodsOfAC[AF1p]; } @@ -199,7 +199,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFEq0() { return log10PriorsOfAC[AF0]; } @@ -209,7 +209,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFGT0() { return log10PriorsOfAC[AF1p]; } @@ -263,7 +263,7 @@ public class AFCalcResult { * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping * @return the log10 probability that allele is segregating at this site */ - @Ensures("goodLog10Probability(result)") + @Ensures("MathUtils.goodLog10Probability(result)") public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { final Double log10pNonRef = log10pNonRefByAllele.get(allele); if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); @@ -279,7 +279,7 @@ public class AFCalcResult { * @return freshly allocated log10 normalized posteriors vector */ @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") - @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) @@ -287,29 +287,6 @@ public class AFCalcResult { return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - /** * Computes the offset into linear vectors indexed by alt allele for allele * @@ -331,14 +308,4 @@ public class AFCalcResult { else return index - 1; } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @return true if result is really well formed - */ - private static boolean goodLog10Probability(final double result) { - return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java deleted file mode 100644 index 5c926a4d8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - * - * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? - */ -class AFCalcResultTracker { - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - protected double log10MLE; - protected double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int LIKELIHOODS_CACHE_SIZE = 5000; - private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; - private int currentLikelihoodsCacheIndex = 0; - protected Double log10LikelihoodsMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - private int[] AClimits; - - int nEvaluations = 0; - - /** - * The list of alleles actually used in computing the AF - */ - private List allelesUsedInGenotyping = null; - - /** - * Create a results object capability of storing results for calls with up to maxAltAlleles - * - * @param maxAltAlleles an integer >= 1 - */ - public AFCalcResultTracker(final int maxAltAlleles) { - if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); - - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - - reset(); - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MLE - * - * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, - * starting from index 0 (i.e., the first alt allele is at 0). The vector is always - * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values - * are meaningful. - * - * @return a vector with allele counts, not all of which may be meaningful - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MAP - * - * @see #getAlleleCountsOfMLE() for the encoding of results in this vector - * - * @return a non-null vector of ints - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return - */ - public double getLog10LikelihoodOfAFNotZero() { - if ( log10LikelihoodsMatrixSum == null ) { - if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have - log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; - else - log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - } - return log10LikelihoodsMatrixSum; - } - - public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { - return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { - final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; - final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); - for ( int i = 0; i < subACOfMLE.length; i++ ) { - final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); - } - - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); - } - - // -------------------------------------------------------------------------------- - // - // Protected mutational methods only for use within the calculation models themselves - // - // -------------------------------------------------------------------------------- - - /** - * Reset the data in this results object, so that it can be used in a subsequent AF calculation - * - * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer - */ - protected void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentLikelihoodsCacheIndex = 0; - log10LikelihoodsMatrixSum = null; - allelesUsedInGenotyping = null; - nEvaluations = 0; - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - } - - /** - * Tell this result we used one more evaluation cycle - */ - protected void incNEvaluations() { - nEvaluations++; - } - - protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToLikelihoodsCache(log10LofK); - - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToLikelihoodsCache(final double log10LofK) { - // add to the cache - log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { - final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - log10LikelihoodsMatrixValues[0] = temporarySum; - currentLikelihoodsCacheIndex = 1; - } - } - - protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } - - protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { - if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) - throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); - - this.allelesUsedInGenotyping = allelesUsedInGenotyping; - } - - protected void setAClimits(int[] AClimits) { - this.AClimits = AClimits; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 49915c515..4895c84d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -31,15 +31,11 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; public abstract class DiploidExactAFCalc extends ExactAFCalc { - public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { @@ -60,43 +56,33 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet); - // keep processing while we have AC conformations that need to be calculated - final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); - while ( !ACqueue.isEmpty() ) { - getResultTracker().incNEvaluations(); // keep track of the number of evaluations + getStateTracker().incNEvaluations(); // keep track of the number of evaluations // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( stateTracker.withinMaxACs(set.getACcounts()) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - // adjust max likelihood seen if needed - stateTracker.update(log10LofKs, set.getACcounts()); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } - return resultFromTracker(vc, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } @Override protected VariantContext reduceScope(final VariantContext vc) { - final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { - logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); builder.alleles(alleles); builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); return builder.make(); @@ -153,23 +139,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? - if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -188,7 +172,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -213,9 +197,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -223,8 +207,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. - private void updateACset(final StateTracker stateTracker, - final int[] newSetCounts, + private void updateACset(final int[] newSetCounts, final int numChr, final ExactACset dependentSet, final int PLsetIndex, @@ -246,8 +229,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -258,8 +240,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; } @@ -281,14 +263,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java index df0793352..ab230d398 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -39,8 +39,8 @@ import java.util.ArrayList; abstract class ExactAFCalc extends AFCalc { protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index c0edee291..d0b801a20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -89,7 +89,7 @@ import java.util.*; /** * The min. confidence of an allele to be included in the joint posterior. */ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); @@ -100,7 +100,7 @@ import java.util.*; private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } @@ -111,9 +111,9 @@ import java.util.*; */ final AFCalc biAlleleExactModel; - protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); } /** @@ -285,10 +285,14 @@ import java.util.*; // sort the results, so the most likely allele is first Collections.sort(sorted, compareAFCalcResultsByPNonRef); + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); for ( int i = 0; i < sorted.size(); i++ ) { - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; @@ -303,7 +307,13 @@ import java.util.*; /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * - * TODO -- add more docs + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ @@ -315,9 +325,11 @@ import java.util.*; final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in log space + // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; + boolean anyPoly = false; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); final int altI = vc.getAlleles().indexOf(altAllele) - 1; @@ -325,12 +337,15 @@ import java.util.*; // MLE of altI allele is simply the MLE of this allele in altAlleles alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); - log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); - // the AF > 0 case requires us to store the normalized likelihood for later summation - if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -339,10 +354,25 @@ import java.util.*; nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 - final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { // L + prior = posterior => L = poster - prior log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], @@ -350,8 +380,10 @@ import java.util.*; }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 - MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 093bf47d5..fc26111e0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -11,28 +12,31 @@ import java.util.Map; /** * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ -public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { - protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - } - - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); +class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { + protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } @Override protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; - final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); - final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double pNonRef = lastK > 0 ? 0.0 : -1000.0; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); - return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); } /** @@ -72,11 +76,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { } } - public int linearExact(final VariantContext vc, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyLikelihoods, - double[] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -85,7 +89,7 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { double maxLog10L = Double.NEGATIVE_INFINITY; boolean done = false; - int lastK = -1; + int lastK = -1, mleK = -1; for (int k=0; k <= numChr && ! done; k++ ) { final double[] kMinus0 = logY.getkMinus0(); @@ -131,7 +135,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { // can we abort early? lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; @@ -140,6 +148,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { logY.rotate(); } - return lastK; + return new Pair(lastK, mleK); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java index b4e7b2ab1..97e5fed3b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { - protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 19e253277..b82ec1d29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -1,35 +1,85 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + /** - * Keeps track of the best state seen by the exact model and the max states to visit - * allowing us to abort the search before we visit the entire matrix of AC x samples + * Keeps track of the state information during the exact model AF calculation. + * + * Tracks things like the MLE and MAP AC values, their corresponding likelhood and posterior + * values, the likelihood of the AF == 0 state, and the number of evaluations needed + * by the calculation to compute the P(AF == 0) */ final class StateTracker { - public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - final private int[] maxACsToConsider; - - private ExactACcounts ACsAtMax = null; - private double maxLog10L = Double.NEGATIVE_INFINITY; - - public StateTracker() { - this(null); - } - - public StateTracker(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 /** - * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state - * - * @param log10LofKs the likelihood of our current configuration state + * These variables are intended to contain the MLE and MAP (and their corresponding allele counts) + * of the site over all alternate alleles */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > getMaxLog10L()) { - this.setMaxLog10L(log10LofKs); - this.ACsAtMax = ACs; - } + protected double log10MLE; + protected double log10MAP; + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + */ + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + /** + * A vector of log10 likelihood values seen, for future summation. When the size of the + * vector is exceeed -- because we've pushed more posteriors than there's space to hold + * -- we simply sum up the existing values, make that the first value, and continue. + */ + private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE]; + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private int log10LikelihoodsForAFGt0CacheIndex = 0; + + /** + * The actual sum of the likelihoods. Null if the sum hasn't been computed yet + */ + protected Double log10LikelihoodsForAFGt0Sum = null; + + /** + * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + */ + private double log10LikelihoodOfAFzero = 0.0; + + /** + * The number of evaluates we've gone through in the AFCalc + */ + private int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public StateTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); } /** @@ -39,58 +89,194 @@ final class StateTracker { * @param log10LofK the log10 likelihood of the configuration we're considering analyzing * @return true if the configuration cannot meaningfully contribute to our likelihood sum */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + private boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY; } /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + private boolean isLowerAC(final ExactACcounts otherACs) { + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < otherACcounts.length; i++ ) { + if ( alleleCountsOfMLE[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + /** + * Should we stop exploring paths from ACs, given it's log10LofK * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise + * @param log10LofK the log10LofK of these ACs + * @param ACs the ACs of this state + * @return return true if there's no reason to continue with subpaths of AC, or false otherwise */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; + protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) { + return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs)); + } - final int[] otherACcounts = otherACs.getCounts(); + @Ensures("result != null") + protected int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; + @Ensures("result >= 0") + protected int getnEvaluations() { + return nEvaluations; } /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + * @return the likelihoods summed across all AC values for AC > 0 */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null ) - return true; + private double getLog10LikelihoodOfAFNotZero() { + if ( log10LikelihoodsForAFGt0Sum == null ) { + if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + } + return log10LikelihoodsForAFGt0Sum; + } - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); + /** + * @return the log10 likelihood of AF == 0 + */ + private double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; + /** + * Convert this state to an corresponding AFCalcResult. + * + * Assumes that the values in this state have been filled in with meaningful values during the calculation. + * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful + * values, etc. + * + * @param log10PriorsByAC the priors by AC + * + * @return an AFCalcResult summarizing the final results of this calculation + */ + @Requires("allelesUsedInGenotyping != null") + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); } - return true; + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); } - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED; + log10LikelihoodsForAFGt0CacheIndex = 0; + log10LikelihoodsForAFGt0Sum = null; + allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(alleleCountsOfMLE, 0); + Arrays.fill(alleleCountsOfMAP, 0); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); } - public double getMaxLog10L() { - return maxLog10L; + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; } - public void setMaxLog10L(double maxLog10L) { - this.maxLog10L = maxLog10L; + /** + * Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) + @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length); + } + } + + /** + * Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state + * + * @param log10PofK the posterior of our current configuration state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) + @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") + protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { + if ( log10PofK > log10MAP ) { + log10MAP = log10PofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length); + } + } + + private void addToLikelihoodsCache(final double log10LofK) { + // add to the cache + log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); + log10LikelihoodsForAFGt0[0] = temporarySum; + log10LikelihoodsForAFGt0CacheIndex = 1; + } + } + + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"}) + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } + + /** + * Set the list of alleles used in genotyping + * + * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference + */ + @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"}) + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + if ( allelesUsedInGenotyping.get(0).isNonReference() ) + throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 9234a9fe8..3d287057c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+2; final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { + if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } int startIndexInHaplotype = 0; @@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel { startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); if (DEBUG) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index f8c871e7d..d71d0c9c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -48,11 +48,14 @@ public class GLBasedSampleSelector extends SampleSelector { // first subset to the samples VariantContext subContext = vc.subContextFromSamples(samples); + if ( ! subContext.isPolymorphicInSamples() ) + return false; + // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { flatPriors = new double[1+2*samples.size()]; - AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2); + AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2); } final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors); // do we want to let this qual go up or down? diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index befd24307..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3740d5d7c..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1194,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java index 6fda0245b..63927ac84 100644 --- a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -99,9 +99,7 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { } @Override - public void put( final T value, final int... keys ) { - super.put(value, keys); - + public boolean put( final T value, final int... keys ) { StringBuilder logEntry = new StringBuilder(); logEntry.append(logEntryLabel); @@ -116,5 +114,7 @@ public class LoggingNestedIntegerArray extends NestedIntegerArray { // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe log.println(logEntry.toString()); + + return super.put(value, keys); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java index 31d316555..c83de8ec7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java @@ -50,6 +50,28 @@ public class NestedIntegerArray { this.dimensions = dimensions.clone(); data = new Object[dimensions[0]]; + prepopulateArray(data, 0); + } + + /** + * Recursively allocate the entire tree of arrays in all its dimensions. + * + * Doing this upfront uses more memory initially, but saves time over the course of the run + * and (crucially) avoids having to make threads wait while traversing the tree to check + * whether branches exist or not. + * + * @param subarray current node in the tree + * @param dimension current level in the tree + */ + private void prepopulateArray( Object[] subarray, int dimension ) { + if ( dimension >= numDimensions - 1 ) { + return; + } + + for ( int i = 0; i < subarray.length; i++ ) { + subarray[i] = new Object[dimensions[dimension + 1]]; + prepopulateArray((Object[])subarray[i], dimension + 1); + } } public T get(final int... keys) { @@ -59,14 +81,29 @@ public class NestedIntegerArray { for( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) return null; - myData = (Object[])myData[keys[i]]; - if ( myData == null ) - return null; + + myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse + // down to the leaves } + return (T)myData[keys[numNestedDimensions]]; } - public synchronized void put(final T value, final int... keys) { // WARNING! value comes before the keys! + /** + * Insert a value at the position specified by the given keys. + * + * This method is THREAD-SAFE despite not being synchronized, however the caller MUST + * check the return value to see if the put succeeded. This method RETURNS FALSE if + * the value could not be inserted because there already was a value present + * at the specified location. In this case the caller should do a get() to get + * the already-existing value and (potentially) update it. + * + * @param value value to insert + * @param keys keys specifying the location of the value in the tree + * @return true if the value was inserted, false if it could not be inserted because there was already + * a value at the specified position + */ + public boolean put(final T value, final int... keys) { // WARNING! value comes before the keys! if ( keys.length != numDimensions ) throw new ReviewedStingException("Exactly " + numDimensions + " keys should be passed to this NestedIntegerArray but " + keys.length + " were provided"); @@ -75,15 +112,26 @@ public class NestedIntegerArray { for ( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) throw new ReviewedStingException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")"); - Object[] temp = (Object[])myData[keys[i]]; - if ( temp == null ) { - temp = new Object[dimensions[i+1]]; - myData[keys[i]] = temp; - } - myData = temp; + + myData = (Object[])myData[keys[i]]; // interior nodes in the tree will never be null, so we can safely traverse + // down to the leaves } - myData[keys[numNestedDimensions]] = value; + synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it + + // Insert the new value only if there still isn't any existing value in this position + if ( myData[keys[numNestedDimensions]] == null ) { + myData[keys[numNestedDimensions]] = value; + } + else { + // Already have a value for this leaf (perhaps another thread came along and inserted one + // while we traversed the tree), so return false to notify the caller that we didn't put + // the item + return false; + } + } + + return true; } public List getAllValues() { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 6b97f8f9f..c1f408bc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -352,6 +352,9 @@ public class UserException extends ReviewedStingException { } public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: " + message)); + } public CannotExecuteQScript(String message, Exception e) { super(String.format("Unable to execute QScript: " + message), e); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java index 85568dac9..29c15adf7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java @@ -66,7 +66,9 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public String formatKey(final int key) { + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated return readGroupReverseLookupTable.get(key); } @@ -76,17 +78,32 @@ public class ReadGroupCovariate implements RequiredCovariate { } private int keyForReadGroup(final String readGroupId) { - if (!readGroupLookupTable.containsKey(readGroupId)) { - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + } } return readGroupLookupTable.get(readGroupId); } @Override - public int maximumKeyValue() { + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated return readGroupLookupTable.size() - 1; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 1feb76517..9fdb48b34 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -60,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; - private int softStart = -1; - private int softEnd = -1; + private final static int UNINITIALIZED = -1; + private int softStart = UNINITIALIZED; + private int softEnd = UNINITIALIZED; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -386,7 +387,7 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - if (softStart < 0) { + if ( softStart == UNINITIALIZED ) { softStart = getAlignmentStart(); for (final CigarElement cig : getCigar().getCigarElements()) { final CigarOperator op = cig.getOperator(); @@ -408,17 +409,23 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - if ( softEnd < 0 ) { + if ( softEnd == UNINITIALIZED ) { + boolean foundAlignedBase = false; softEnd = getAlignmentEnd(); final List cigs = getCigar().getCigarElements(); - for (int i=cigs.size() - 1; i>=0; --i) { + for (int i = cigs.size() - 1; i >= 0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); - if (op == CigarOperator.SOFT_CLIP) + if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read softEnd += cig.getLength(); - else if (op != CigarOperator.HARD_CLIP) + else if (op != CigarOperator.HARD_CLIP) { + foundAlignedBase = true; break; + } + } + if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = getAlignmentEnd(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java new file mode 100644 index 000000000..cc50152ac --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.threading; + +import java.lang.reflect.Array; + +/** + * ThreadLocal implementation for arrays + * + * Example usage: + * + * private ThreadLocal threadLocalByteArray = new ThreadLocalArray(length, byte.class); + * .... + * byte[] byteArray = threadLocalByteArray.get(); + * + * @param the type of the array itself (eg., int[], double[], etc.) + * + * @author David Roazen + */ +public class ThreadLocalArray extends ThreadLocal { + private int arraySize; + private Class arrayElementType; + + /** + * Create a new ThreadLocalArray + * + * @param arraySize desired length of the array + * @param arrayElementType type of the elements within the array (eg., Byte.class, Integer.class, etc.) + */ + public ThreadLocalArray( int arraySize, Class arrayElementType ) { + super(); + + this.arraySize = arraySize; + this.arrayElementType = arrayElementType; + } + + @Override + @SuppressWarnings("unchecked") + protected T initialValue() { + return (T)Array.newInstance(arrayElementType, arraySize); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 46f77c283..bf1fc9e65 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -62,9 +61,9 @@ public class TraverseReadsUnitTest extends BaseTest { private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); private List bamList; - private Walker countReadWalker; + private ReadWalker countReadWalker; private File output; - private TraverseReads traversalEngine = null; + private TraverseReadsNano traversalEngine = null; private IndexedFastaSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; @@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest { bamList.add(bam); countReadWalker = new CountReads(); - traversalEngine = new TraverseReads(); + traversalEngine = new TraverseReadsNano(1); traversalEngine.initialize(engine); } @@ -125,7 +124,7 @@ public class TraverseReadsUnitTest extends BaseTest { fail("Shard == null"); } - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); dataProvider.close(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 72724e46a..dee54b9f8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; // ********************************************************************************** // @@ -18,6 +19,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -28,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b3abf320f7d02d0e3b2883833419130e")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -52,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc")); + Arrays.asList("afb8768f31ab57eb43f75c1115eadc99")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("48b4f4b05461be276bffc91350f08cbc")); + Arrays.asList("543f68e42034bf44cfb24da8c9204320")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("04affcc9d720ee17bc221759707e0cd2")); + Arrays.asList("5ce03dd9ca2d9324c1d4a9d64389beb5")); executeTest("test reverse trim", spec); } @@ -84,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("112e7bedfd284d4d9390aa006118c733")); + Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19")); executeTest("test mismatched PLs", spec); } @@ -94,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2"; + private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10"; @Test public void testCompressedOutput() { @@ -115,7 +117,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "360d1274c1072a1ae9868e4e106c2650"; + String md5 = "d408b4661b820ed86272415b8ea08780"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -147,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c")); + Arrays.asList("839ecd30d354a36b5dfa2b5e99859765")); executeTest("test min_base_quality_score 26", spec); } @@ -175,6 +177,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test using comp track", spec); } + @Test + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + @Test public void testOutputParameterSitesOnly() { testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311"); @@ -187,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "41c046d38ea328421df924e37e017645"); } private void testOutputParameters(final String args, final String md5) { @@ -220,12 +230,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" ); + testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" ); + testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" ); } private void testHeterozosity(final double arg, final String md5) { @@ -268,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("950fb032cc9902ae48bd21f272d2fd52")); + Arrays.asList("98058fc913b61c22d44875da1f5ea89c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -287,7 +297,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b3df138254ed141b61a758df87757e0d")); + Arrays.asList("650c53774afacfc07a595675e8cdde17")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -302,7 +312,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("63fd9488daadd4baaef0a98f02916996")); + Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -315,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("52b5a432092995c92fe71e1942689ba8")); + Arrays.asList("5f2721c3323de5390d2d47446139f32b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -343,13 +353,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("863ee56b3594f09795644127f2f9539f")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("503ca1b75cc7b2679eaa80f7b5e7ef1c")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -371,7 +381,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("945a2f994eaced8efdf8de24b58f2680")); + Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -449,7 +459,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); + Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19")); executeTest("test calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287"); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca"); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 58d3677c7..f29ac8cad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRegenotype--" + testFile, spec); @@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index c1b28314c..37614f15f 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); + tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 2ba2da734..af2e18ad9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -23,6 +23,7 @@ import java.util.List; * To change this template use File | Settings | File Templates. */ public class NanoSchedulerUnitTest extends BaseTest { + private final static boolean DEBUG = false; private final static boolean debug = false; public static final int NANO_SCHEDULE_MAX_RUNTIME = 30000; @@ -30,19 +31,22 @@ public class NanoSchedulerUnitTest extends BaseTest { @Override public Integer apply(Integer input) { return input * 2; } } + private static void maybeDelayMe(final int input) { + try { + if ( input % 7 == 0 ) { + final int milliToSleep = (input % 10); + //System.out.printf("Sleeping %d millseconds%n", milliToSleep); + Thread.sleep(milliToSleep); + } + } catch ( InterruptedException ex ) { + throw new RuntimeException(ex); + } + } + private static class Map2xWithDelays extends Map2x { @Override public Integer apply(Integer input) { - try { - if ( input % 7 == 0 ) { - final int milliToSleep = (input % 10); - //System.out.printf("Sleeping %d millseconds%n", milliToSleep); - Thread.sleep(milliToSleep); - } - - return input * 2; - } catch ( InterruptedException ex ) { - throw new RuntimeException(ex); - } + maybeDelayMe(input); + return input * 2; } } @@ -117,10 +121,12 @@ public class NanoSchedulerUnitTest extends BaseTest { } static NanoSchedulerBasicTest exampleTest = null; + static NanoSchedulerBasicTest exampleTestWithDelays = null; @BeforeSuite public void setUp() throws Exception { exampleTest = new NanoSchedulerBasicTest(10, 2, 1, 10, false); + exampleTestWithDelays = new NanoSchedulerBasicTest(10, 2, 1, 10, true); } @DataProvider(name = "NanoSchedulerBasicTest") @@ -151,14 +157,14 @@ public class NanoSchedulerUnitTest extends BaseTest { return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads == 1 ) testNanoScheduler(test); } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { logger.warn("Running " + test); if ( test.nThreads >= 1 ) @@ -195,7 +201,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { if ( test.bufferSize > 1) { logger.warn("Running " + test); @@ -213,7 +219,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); @@ -221,38 +227,78 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); } - @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) + @Test(enabled = true && ! DEBUG, expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } - @Test(expectedExceptions = NullPointerException.class, timeOut = 10000, invocationCount = 50) + @DataProvider(name = "NanoSchedulerInputExceptionTest") + public Object[][] createNanoSchedulerInputExceptionTest() { + List tests = new ArrayList(); + + + for ( final int bufSize : Arrays.asList(100) ) { + for ( final int nThreads : Arrays.asList(8) ) { + for ( final boolean addDelays : Arrays.asList(true, false) ) { + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(bufSize, nThreads, 1, 1000000, false); + final int maxN = addDelays ? 10000 : 100000; + for ( int nElementsBeforeError = 0; nElementsBeforeError < maxN; nElementsBeforeError += Math.max(nElementsBeforeError / 10, 1) ) { + tests.add(new Object[]{nElementsBeforeError, test, addDelays}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, expectedExceptions = NullPointerException.class, timeOut = 10000) public void testInputErrorIsThrown_NPE() throws InterruptedException { - executeTestErrorThrowingInput(new NullPointerException()); + executeTestErrorThrowingInput(10, new NullPointerException(), exampleTest, false); } - @Test(expectedExceptions = ReviewedStingException.class, timeOut = 10000, invocationCount = 50) + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 10000) public void testInputErrorIsThrown_RSE() throws InterruptedException { - executeTestErrorThrowingInput(new ReviewedStingException("test")); + executeTestErrorThrowingInput(10, new ReviewedStingException("test"), exampleTest, false); } - private void executeTestErrorThrowingInput(final RuntimeException ex) { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); - nanoScheduler.execute(new ErrorThrowingIterator(ex), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); + @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 10000, invocationCount = 1) + public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + executeTestErrorThrowingInput(nElementsBeforeError, new NullPointerException(), test, addDelays); + } + + private void executeTestErrorThrowingInput(final int nElementsBeforeError, final RuntimeException ex, final NanoSchedulerBasicTest test, final boolean addDelays) { + logger.warn("executeTestErrorThrowingInput " + nElementsBeforeError + " ex=" + ex + " test=" + test + " addInputDelays=" + addDelays); + final NanoScheduler nanoScheduler = test.makeScheduler(); + nanoScheduler.execute(new ErrorThrowingIterator(nElementsBeforeError, ex, addDelays), test.makeMap(), test.initReduce(), test.makeReduce()); } private static class ErrorThrowingIterator implements Iterator { + final int nElementsBeforeError; + final boolean addDelays; + int i = 0; final RuntimeException ex; - private ErrorThrowingIterator(RuntimeException ex) { + private ErrorThrowingIterator(final int nElementsBeforeError, RuntimeException ex, boolean addDelays) { + this.nElementsBeforeError = nElementsBeforeError; this.ex = ex; + this.addDelays = addDelays; } - @Override public boolean hasNext() { throw ex; } - @Override public Integer next() { throw ex; } - @Override public void remove() { throw ex; } + @Override public boolean hasNext() { return true; } + @Override public Integer next() { + if ( i++ > nElementsBeforeError ) { + throw ex; + } else if ( addDelays ) { + maybeDelayMe(i); + return i; + } else { + return i; + } + } + @Override public void remove() { throw new UnsupportedOperationException("x"); } } public static void main(String [ ] args) { diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 7381bebc4..c072c808d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) + // NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads + // aren't a core GATK feature any longer. Should be reabled if we come to care about this capability again + // in the future, or we can run these in parallel + @Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 0d8edc25d..54e89ec58 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -28,6 +28,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.queue.QException import java.lang.Class import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine} +import org.broadinstitute.sting.utils.exceptions.UserException import java.lang.reflect.Type /** @@ -75,6 +76,8 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { val componentType = ReflectionUtils.getCollectionType(source.field) + if (componentType == classOf[java.lang.Object]) + throw new UserException.CannotExecuteQScript("Please also include a @ClassType(classOf[]) annotation on field: " + source.field + ". Example: @ClassType(classOf[Double]). The scala generic type for the field was subjected to java/scala type erasure and is not available via reflection.") val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) if (classOf[Seq[_]].isAssignableFrom(classType)) {