diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 98a96fbfb..d5afc5722 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void incr(byte base, byte baseQual, byte insQual, byte delQual) { + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { super.incr(base, baseQual); BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // do not allow Ns @@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void decr(byte base, byte baseQual, byte insQual, byte delQual) { + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { super.decr(base, baseQual); BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // do not allow Ns @@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public byte averageInsertionQualsOfMostCommonBase() { - return getGenericAverageQualOfMostCommonBase(sumInsertionQuals); + public byte averageInsertionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumInsertionQuals); } - public byte averageDeletionQualsOfMostCommonBase() { - return getGenericAverageQualOfMostCommonBase(sumDeletionQuals); + public byte averageDeletionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumDeletionQuals); } - private byte getGenericAverageQualOfMostCommonBase(Map sumQuals) { - BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts()); + private byte getGenericAverageQualOfBase(final BaseIndex base, final Map sumQuals) { return 
(byte) (sumQuals.get(base) / getCount(base)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 53c36c3f9..94f3c2b6b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -41,26 +41,26 @@ import java.util.Map; @Requires("other != null") public void add(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) + for (final BaseIndex i : BaseIndex.values()) counts.put(i, counts.get(i) + other.counts.get(i)); } @Requires("other != null") public void sub(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) + for (final BaseIndex i : BaseIndex.values()) counts.put(i, counts.get(i) - other.counts.get(i)); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) // no Ns counts.put(i, counts.get(i) + 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(byte base, byte qual) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) + 1); sumQuals.put(i, sumQuals.get(i) + qual); @@ -69,14 +69,14 @@ import java.util.Map; @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) // no Ns counts.put(i, counts.get(i) - 1); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(byte base, byte qual) { - BaseIndex i 
= BaseIndex.byteToBase(base); + final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) - 1); sumQuals.put(i, sumQuals.get(i) - qual); @@ -84,52 +84,48 @@ import java.util.Map; } @Ensures("result >= 0") - public int getCount(byte base) { + public int getCount(final byte base) { return getCount(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") - public int getCount(BaseIndex base) { + public int getCount(final BaseIndex base) { return counts.get(base); } @Ensures("result >= 0") - public long getSumQuals(byte base) { + public long getSumQuals(final byte base) { return getSumQuals(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") - public long getSumQuals(BaseIndex base) { + public long getSumQuals(final BaseIndex base) { return sumQuals.get(base); } @Ensures("result >= 0") - public byte averageQuals(byte base) { + public byte averageQuals(final byte base) { return (byte) (getSumQuals(base) / getCount(base)); } @Ensures("result >= 0") - public byte averageQuals(BaseIndex base) { + public byte averageQuals(final BaseIndex base) { return (byte) (getSumQuals(base) / getCount(base)); } - public byte baseWithMostCounts() { - return baseIndexWithMostCounts().getByte(); + @Ensures("result >= 0") + public int countOfBase(final BaseIndex base) { + return counts.get(base); } @Ensures("result >= 0") - public int countOfMostCommonBase() { - return counts.get(baseIndexWithMostCounts()); + public long sumQualsOfBase(final BaseIndex base) { + return sumQuals.get(base); } @Ensures("result >= 0") - public long sumQualsOfMostCommonBase() { - return sumQuals.get(baseIndexWithMostCounts()); - } - - @Ensures("result >= 0") - public byte averageQualsOfMostCommonBase() { - return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase()); + public byte averageQualsOfBase(final BaseIndex base) { + return (byte) (sumQualsOfBase(base) / countOfBase(base)); } @@ -149,7 +145,7 @@ import java.util.Map; * @return the proportion of this 
base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(byte base) { + public double baseCountProportion(final byte base) { return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount(); } @@ -160,7 +156,7 @@ import java.util.Map; * @return the proportion of this base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(BaseIndex baseIndex) { + public double baseCountProportion(final BaseIndex baseIndex) { int total = totalCount(); if (total == 0) return 0.0; @@ -177,22 +173,28 @@ import java.util.Map; return b.toString(); } + public byte baseWithMostCounts() { + return baseIndexWithMostCounts().getByte(); + } + @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex i : counts.keySet()) - if (hasHigherCount(i, maxI)) - maxI = i; + for (Map.Entry entry : counts.entrySet()) { + if (entry.getValue() > counts.get(maxI)) + maxI = entry.getKey(); + } return maxI; } @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { - BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex index : counts.keySet()) - if (index.isNucleotide() && hasHigherCount(index, mostCounts)) - mostCounts = index; - return mostCounts; + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : counts.entrySet()) { + if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI)) + maxI = entry.getKey(); + } + return maxI; } private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) { @@ -201,6 +203,30 @@ import java.util.Map; return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) ); } + public byte baseWithMostProbability() { + return baseIndexWithMostProbability().getByte(); + } + + @Ensures("result != null") + public BaseIndex 
baseIndexWithMostProbability() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : sumQuals.entrySet()) { + if (entry.getValue() > sumQuals.get(maxI)) + maxI = entry.getKey(); + } + return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts()); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (Map.Entry entry : sumQuals.entrySet()) { + if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI)) + maxI = entry.getKey(); + } + return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + } + @Ensures("result >=0") public int totalCountWithoutIndels() { int sum = 0; @@ -218,8 +244,8 @@ import java.util.Map; */ @Requires("index.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(BaseIndex index) { - int total = totalCountWithoutIndels(); + public double baseCountProportionWithoutIndels(final BaseIndex index) { + final int total = totalCountWithoutIndels(); if (total == 0) return 0.0; return (double) counts.get(index) / totalCountWithoutIndels(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 3fc438b19..0c1854ad1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -182,7 +182,7 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromMismatches(double minVariantProportion) { - BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels(); + BaseIndex mostCommon = 
consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6c588898c..6fdf85317 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -55,7 +55,7 @@ public class SlidingWindow { private final int nContigs; - private boolean allowPolyploidReduction; + private boolean allowPolyploidReductionInGeneral; /** * The types of synthetic reads to use in the finalizeAndAdd method @@ -117,7 +117,7 @@ public class SlidingWindow { this.hasIndelQualities = hasIndelQualities; this.nContigs = nContigs; - this.allowPolyploidReduction = allowPolyploidReduction; + this.allowPolyploidReductionInGeneral = allowPolyploidReduction; } /** @@ -207,8 +207,9 @@ public class SlidingWindow { finalizedReads = closeVariantRegions(regions, false); List readsToRemove = new LinkedList(); - for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) - if (read.getSoftEnd() < getStartLocation(windowHeader)) { + final int windowHeaderStartLoc = getStartLocation(windowHeader); + for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) 
+ if (read.getSoftEnd() < windowHeaderStartLoc) { readsToRemove.add(read); } } @@ -291,7 +292,7 @@ public class SlidingWindow { reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); int endOfFilteredData = findNextNonFilteredDataElement(header, start, end); - addToFilteredData(header, start, endOfFilteredData, isNegativeStrand); + reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand)); if (endOfFilteredData <= start) throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); @@ -418,7 +419,9 @@ public class SlidingWindow { * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus */ - private void addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + private List addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) { + List result = new ArrayList(0); + if (filteredDataConsensus == null) filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); @@ -434,8 +437,15 @@ public class SlidingWindow { if (!headerElement.hasFilteredData()) throw new ReviewedStingException("No filtered data in " + index); + if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) { + result.add(finalizeFilteredDataConsensus()); + filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand); + } + genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); } + + return result; } /** @@ 
-472,15 +482,15 @@ public class SlidingWindow { * @param rms the rms mapping quality in the header element */ private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { - BaseIndex base = baseCounts.baseIndexWithMostCounts(); - byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE); - byte qual = baseCounts.averageQualsOfMostCommonBase(); - byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase(); - byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase(); + final BaseIndex base = baseCounts.baseIndexWithMostProbability(); + byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); + byte qual = baseCounts.averageQualsOfBase(base); + byte insQual = baseCounts.averageInsertionQualsOfBase(base); + byte delQual = baseCounts.averageDeletionQualsOfBase(base); syntheticRead.add(base, count, qual, insQual, delQual, rms); } - private List compressVariantRegion(int start, int stop) { + private List compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { List allReads = new LinkedList(); // Try to compress into a polyploid consensus @@ -490,7 +500,8 @@ public class SlidingWindow { boolean foundEvent = false; Object[] header = windowHeader.toArray(); - if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction + // foundEvent will remain false if we don't allow polyploid reduction + if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) { for (int i = start; i<=stop; i++) { nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT); if (nHaplotypes > nContigs) { @@ -512,9 +523,6 @@ public class SlidingWindow { } } - int refStart = windowHeader.get(start).getLocation(); - int refStop = windowHeader.get(stop).getLocation(); - // Try to compress the variant region // the "foundEvent" 
protects us from trying to compress variant regions that are created by insertions if (canCompress && foundEvent) { @@ -524,6 +532,9 @@ public class SlidingWindow { // Return all reads that overlap the variant region and remove them from the window header entirely // also remove all reads preceding the variant region (since they will be output as consensus right after compression else { + final int refStart = windowHeader.get(start).getLocation(); + final int refStop = windowHeader.get(stop).getLocation(); + LinkedList toRemove = new LinkedList(); for (GATKSAMRecord read : readsInWindow) { if (read.getSoftStart() <= refStop) { @@ -549,8 +560,8 @@ public class SlidingWindow { * @return all reads contained in the variant region plus any adjacent synthetic reads */ @Requires("start <= stop") - protected List closeVariantRegion(int start, int stop) { - List allReads = compressVariantRegion(start, stop); + protected List closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) { + List allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition); List result = (downsampleCoverage > 0) ? 
downsampleVariantRegion(allReads) : allReads; result.addAll(addToSyntheticReads(windowHeader, 0, stop, false)); @@ -570,7 +581,7 @@ public class SlidingWindow { if (stop < 0 && forceClose) stop = windowHeader.size() - 1; if (stop >= 0) { - allReads.addAll(closeVariantRegion(start, stop)); + allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1)); lastStop = stop; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index ab65020c3..ccf81dd67 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -44,7 +44,7 @@ public class SyntheticRead { private String contig; private int contigIndex; private String readName; - private Integer refStart; + private int refStart; private boolean hasIndelQualities = false; private boolean isNegativeStrand = false; @@ -60,7 +60,7 @@ public class SyntheticRead { * @param refStart the alignment start (reference based) * @param readTag the reduce reads tag for the synthetic read */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) { final int initialCapacity = 10000; bases = new ArrayList(initialCapacity); counts = new ArrayList(initialCapacity); @@ -80,7 +80,7 @@ public class SyntheticRead { this.isNegativeStrand = isNegativeRead; } - public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List 
deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) { + public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) { this.bases = bases; this.counts = counts; this.quals = quals; @@ -115,11 +115,15 @@ public class SyntheticRead { this.mappingQuality += mappingQuality; } - public BaseIndex getBase(int readCoordinate) { + public BaseIndex getBase(final int readCoordinate) { return bases.get(readCoordinate); } - /** + public int getRefStart() { + return refStart; + } + + /** * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid. 
* * Invalid reads are : diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 6b0831323..303ab94d6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.collections.Pair; @@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * * */ - protected static class SumIterator { + public static class SumIterator { private int[] currentState; private final int[] finalState; private final int restrictSumTo; @@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors // and we repeat until queue is empty // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(likelihoodDim); + final HashMap indexesToACset = new HashMap(likelihoodDim); // add AC=0 to the queue final int[] zeroCounts = new int[nAlleles]; zeroCounts[0] = numChromosomes; - AlleleFrequencyCalculationModel.ExactACset zeroSet = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts)); + ExactACset zeroSet = + new 
ExactACset(1, new ExactACcounts(zeroCounts)); ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.ACcounts, zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods - final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove(); + final ExactACset ACset = ACqueue.remove(); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); // clean up memory - indexesToACset.remove(ACset.ACcounts); + indexesToACset.remove(ACset.getACcounts()); if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); } @@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods { int plIdx = 0; SumIterator iterator = new SumIterator(nAlleles, numChromosomes); while (iterator.hasNext()) { - AlleleFrequencyCalculationModel.ExactACset ACset = - new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector())); + ExactACset ACset = + new ExactACset(1, new ExactACcounts(iterator.getCurrentVector())); // for observed base X, add Q(jX,k) to likelihood vector for all k in error model //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); - setLogPLs(plIdx++, ACset.log10Likelihoods[0]); + setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]); iterator.next(); } } @@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods { } - private double calculateACConformationAndUpdateQueue(final 
ExactAFCalculationModel.ExactACset set, + private double calculateACConformationAndUpdateQueue(final ExactACset set, final ErrorModel errorModel, final List alleleList, final List numObservations, final double maxLog10L, - final LinkedList ACqueue, - final HashMap indexesToACset, + final LinkedList ACqueue, + final HashMap indexesToACset, final ReadBackedPileup pileup) { // compute likelihood of set getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); - final double log10LofK = set.log10Likelihoods[0]; + final double log10LofK = set.getLog10Likelihoods()[0]; // log result in PL vector - int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes); + int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes); setLogPLs(idx, log10LofK); // can we abort early because the log10Likelihoods are so small? if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L); return log10LofK; } // iterate over higher frequencies if possible // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. - final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0]; + final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0]; if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; // add conformations for other cases for ( int allele = 1; allele < nAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele]++; // is this a valid conformation? 
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; @@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * @param numObservations Number of observations for each allele * @param pileup Read backed pileup in case it's necessary */ - public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public abstract void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, @@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // Static methods public static void updateACset(final int[] newSetCounts, - final LinkedList ACqueue, - final HashMap indexesToACset) { + final LinkedList ACqueue, + final HashMap indexesToACset) { - final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts); + final ExactACcounts index = new ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { - AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index); + ExactACset newSet = new ExactACset(1, index); indexesToACset.put(index, newSet); ACqueue.add(newSet); if (VERBOSE) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index ac212cfb5..afbd49a08 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import 
org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -188,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, final ReadBackedPileup pileup) { - final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size()); + final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size()); double p1 = 0.0; if (!hasReferenceSampleData) { @@ -218,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype } p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); } - ACset.log10Likelihoods[0] = p1; + ACset.getLog10Likelihoods()[0] = p1; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index 944372907..0f0f85441 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; @@ 
-12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, final ReadBackedPileup pileup) { - final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length); + final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length); final int[] ac = new int[BaseUtils.BASES.length]; for (int k=0; k < BaseUtils.BASES.length; k++ ) @@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi final byte qual = qualToUse(elt, true, true, mbq); if ( qual == 0 ) continue; - final double acc[] = new double[ACset.ACcounts.counts.length]; + final double acc[] = new double[ACset.getACcounts().getCounts().length]; for (int k=0; k < acc.length; k++ ) - acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]] + acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]] - LOG10_PLOIDY; p1 += MathUtils.log10sumLog10(acc); } @@ 
-264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec); } - ACset.log10Likelihoods[0] = p1; + ACset.getLog10Likelihoods()[0] = p1; /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1)); System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ))); */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java new file mode 100644 index 000000000..68b068509 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java @@ -0,0 +1,277 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Logger; +import org.apache.log4j.TTCCLayout; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 10/2/12 + * Time: 10:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class AFCalcPerformanceTest { + final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class); + + private static abstract class Analysis { + final GATKReport report; + + public Analysis(final String name, final List columns) { + report = GATKReport.newSimpleReport(name, columns); + } + + public abstract void run(final AFCalcTestBuilder testBuilder, + final List coreColumns); + + public String getName() { + return getTable().getTableName(); + } + + public GATKReportTable getTable() { + return report.getTables().iterator().next(); + } + } + + private static class AnalyzeByACAndPL extends Analysis { + public AnalyzeByACAndPL(final List columns) { + super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac")); + } + + public void run(final AFCalcTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) { + final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL); + + timer.start(); + final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors); + final long runtime = timer.getElapsedTimeNano(); + + int otherAC = 0; + int nAltSeg = 0; + for ( int i = 0; i < ACs.length; i++ ) { + nAltSeg += ACs[i] > 0 ? 
1 : 0; + if ( i > 0 ) otherAC += ACs[i]; + } + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC)); + report.addRowList(columns); + } + } + } + + private List makeACs(final int nAltAlleles, final int nChrom) { + if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3"); + + final List ACs = new LinkedList(); + + final List ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1); //Arrays.asList(0, 1, 2, 3, 6, 10, 20, 40, 60, 100, 200, 400, 600, 1000, 2000, 4000, 6000, 10000, 100000); + + for ( int i : ACsToTry ) { + if ( i < nChrom ) { + if ( nAltAlleles == 1 ) { + ACs.add(new int[]{i}); + } else if ( nAltAlleles == 2 ) { + for ( int j : ACsToTry ) { + if ( j < nChrom - i ) + ACs.add(new int[]{i, j}); + } + } else { + throw new IllegalStateException("cannot get here"); + } + } + } + + return ACs; + } + } + + private static class AnalyzeBySingletonPosition extends Analysis { + public AnalyzeBySingletonPosition(final List columns) { + super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton")); + } + + public void run(final AFCalcTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL); + + for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) { + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final List genotypes = new ArrayList(vc.getGenotypes()); + Collections.rotate(genotypes, position); + vcb.genotypes(genotypes); + + timer.start(); + final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors); + 
final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position)); + report.addRowList(columns); + } + } + } + } + + private static class AnalyzeByNonInformative extends Analysis { + public AnalyzeByNonInformative(final List columns) { + super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative")); + } + + public void run(final AFCalcTestBuilder testBuilder, final List coreValues) { + final SimpleTimer timer = new SimpleTimer(); + + for ( final int nonTypePL : Arrays.asList(100) ) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; + + for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) { + final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL); + + timer.start(); + final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors); + final long runtime = timer.getElapsedTimeNano(); + + final List columns = new LinkedList(coreValues); + columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative)); + report.addRowList(columns); + } + } + } + } + + private static class ModelParams { + final AFCalcFactory.Calculation modelType; + final int maxBiNSamples, maxTriNSamples; + + private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) { + this.modelType = modelType; + this.maxBiNSamples = maxBiNSamples; + this.maxTriNSamples = maxTriNSamples; + } + + public boolean meetsConstraints(final int nAltAlleles, final int nSamples) { + if ( nAltAlleles == 1 ) + return nSamples <= maxBiNSamples; + else if ( nAltAlleles == 2 ) + return nSamples <= maxTriNSamples; + else + throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles); + } + } + 
+ public enum Operation { + ANALYZE, + SINGLE + } + public static void main(final String[] args) throws Exception { + final TTCCLayout layout = new TTCCLayout(); + layout.setThreadPrinting(false); + layout.setCategoryPrefixing(false); + layout.setContextPrinting(false); + logger.addAppender(new ConsoleAppender(layout)); + + final Operation op = Operation.valueOf(args[0]); + + switch ( op ) { + case ANALYZE: analyze(args); break; + case SINGLE: profileBig(args); break; + default: throw new IllegalAccessException("unknown operation " + op); + } + } + + private static void profileBig(final String[] args) throws Exception { + final int nSamples = Integer.valueOf(args[1]); + final int ac = Integer.valueOf(args[2]); + + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(nSamples, 1, + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcTestBuilder.PriorType.human); + + final VariantContext vc = testBuilder.makeACTest(new int[]{ac}, 0, 100); + + final SimpleTimer timer = new SimpleTimer().start(); + final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vc, testBuilder.makePriors()); + final long runtime = timer.getElapsedTimeNano(); + logger.info("result " + resultTracker.getLog10PosteriorOfAFGT0()); + logger.info("runtime " + runtime); + } + + private static void analyze(final String[] args) throws Exception { + final List coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples", + "exact.model", "prior.type", "runtime", "n.evaluations"); + + final PrintStream out = new PrintStream(new FileOutputStream(args[1])); + + final List modelParams = Arrays.asList( + new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10), +// new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10), + new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100), + new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000)); + + final boolean ONLY_HUMAN_PRIORS = false; + final List priorTypes = 
ONLY_HUMAN_PRIORS + ? Arrays.asList(AFCalcTestBuilder.PriorType.values()) + : Arrays.asList(AFCalcTestBuilder.PriorType.human); + + final List analyzes = new ArrayList(); + analyzes.add(new AnalyzeByACAndPL(coreColumns)); + analyzes.add(new AnalyzeBySingletonPosition(coreColumns)); + //analyzes.add(new AnalyzeByNonInformative(coreColumns)); + + for ( int iteration = 0; iteration < 1; iteration++ ) { + for ( final int nAltAlleles : Arrays.asList(1, 2) ) { + for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) { + for ( final ModelParams modelToRun : modelParams) { + if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) { + for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) { + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType); + + for ( final Analysis analysis : analyzes ) { + logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName()))); + final List values = Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType); + analysis.run(testBuilder, (List)values); + } + } + } + } + } + } + } + + final GATKReport report = new GATKReport(); + for ( final Analysis analysis : analyzes ) + report.addTable(analysis.getTable()); + report.print(out); + out.close(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java new file mode 100644 index 000000000..b4d105507 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -0,0 +1,170 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import 
org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class AFCalcTestBuilder { + final static Allele A = Allele.create("A", true); + final static Allele C = Allele.create("C"); + final static Allele G = Allele.create("G"); + final static Allele T = Allele.create("T"); + final static Allele AA = Allele.create("AA"); + final static Allele AT = Allele.create("AT"); + final static Allele AG = Allele.create("AG"); + + static int sampleNameCounter = 0; + + final int nSamples; + final int numAltAlleles; + final AFCalcFactory.Calculation modelType; + final PriorType priorType; + + public AFCalcTestBuilder(final int nSamples, final int numAltAlleles, + final AFCalcFactory.Calculation modelType, final PriorType priorType) { + this.nSamples = nSamples; + this.numAltAlleles = numAltAlleles; + this.modelType = modelType; + this.priorType = priorType; + } + + @Override + public String toString() { + return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType); + } + + public enum PriorType { + flat, + human + } + + public int getnSamples() { + return nSamples; + } + + public AFCalc makeModel() { + return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2); + } + + public double[] makePriors() { + final int nPriorValues = 2*nSamples+1; + + switch ( priorType ) { + case flat: + return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + case human: + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); + return humanPriors; + default: + throw new RuntimeException("Unexpected type " + priorType); + } + } + + public VariantContext makeACTest(final List ACs, final int nNonInformative, 
final int nonTypePL) { + return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL); + } + + public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) { + final int nChrom = nSamples * 2; + + final int[] nhet = new int[numAltAlleles]; + final int[] nhomvar = new int[numAltAlleles]; + + for ( int i = 0; i < ACs.length; i++ ) { + final double p = ACs[i] / (1.0 * nChrom); + nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p); + nhet[i] = ACs[i] - 2 * nhomvar[i]; + + if ( nhet[i] < 0 ) + throw new IllegalStateException("Bug! nhet[i] < 0"); + } + + final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar); + if ( calcAC != MathUtils.sum(ACs) ) + throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs)); + + return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL); + } + + public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) { + List samples = new ArrayList(nSamples); + + for ( int altI = 0; altI < nhet.length; altI++ ) { + for ( int i = 0; i < nhet[altI]; i++ ) + samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1)); + for ( int i = 0; i < nhomvar[altI]; i++ ) + samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1)); + } + + final Genotype nonInformative = makeNonInformative(); + samples.addAll(Collections.nCopies(nNonInformative, nonInformative)); + + final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0); + samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0))); + + samples = samples.subList(0, nSamples); + + if ( samples.size() > nSamples ) + throw new IllegalStateException("too many samples"); + + VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles()); + vcb.genotypes(samples); + return vcb.make(); + } + + public 
List getAlleles() { + return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1); + } + + public List getAlleles(final GenotypeType type, final int altI) { + switch (type) { + case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0)); + case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI)); + case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI)); + default: throw new IllegalArgumentException("Unexpected type " + type); + } + } + + public Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private int numPLs() { + return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2); + } + + public Genotype makeNonInformative() { + final int[] nonInformativePLs = new int[GenotypeLikelihoods.numLikelihoods(numAltAlleles, 2)]; + return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs); + } + + public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(getAlleles(type, altI)); + + final int[] pls = new int[numPLs()]; + Arrays.fill(pls, nonTypePL); + + int index = 0; + switch ( type ) { + case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break; + case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break; + case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break; + } + pls[index] = 0; + gb.PL(pls); + + return gb.make(); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java similarity index 82% rename from 
protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 5662d82d6..1a864d3d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -23,56 +23,55 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; -import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; -import java.io.PrintStream; import java.util.*; -public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel { +public class GeneralPloidyExactAFCalc extends ExactAFCalc { static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them - final protected UnifiedArgumentCollection UAC; private final int ploidy; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - ploidy = UAC.samplePloidy; - this.UAC = UAC; - + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int 
maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + this.ploidy = ploidy; } - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); + @Override + protected VariantContext reduceScope(VariantContext vc) { + final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { - logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > maxAltAlleles) { + logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + final List alleles = new ArrayList(maxAltAlleles + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy)); - - GLs = subsetAlleles(vc, alleles, false, ploidy); + VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); + return 
builder.make(); + } else { + return vc; } - - combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result); - - return alleles; } + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); + return resultFromTracker(vc, log10AlleleFrequencyPriors); + } /** * Simple wrapper class to hold values of combined pool likelihoods. @@ -94,8 +93,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula public void add(ExactACset set) { alleleCountSetList.add(set); - conformationMap.put(set.ACcounts, set); - final double likelihood = set.log10Likelihoods[0]; + conformationMap.put(set.getACcounts(), set); + final double likelihood = set.getLog10Likelihoods()[0]; if (likelihood > maxLikelihood ) maxLikelihood = likelihood; @@ -108,11 +107,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula } public double getLikelihoodOfConformation(int[] ac) { - return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0]; + return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; } public double getGLOfACZero() { - return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list + return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list } public int getLength() { @@ -136,7 +135,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes()); + final ArrayList GLs = getGLs(vc.getGenotypes(), true); for ( final double[] likelihoods : GLs ) { final int PLindexOfBestGL = 
MathUtils.maxElementIndex(likelihoods); @@ -171,15 +170,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param result object to fill with output values + * @param resultTracker object to fill with output values */ protected static void combineSinglePools(final GenotypesContext GLs, final int numAlleles, final int ploidyPerPool, final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + final AFCalcResultTracker resultTracker) { - final ArrayList genotypeLikelihoods = getGLs(GLs); + final ArrayList genotypeLikelihoods = getGLs(GLs, true); int combinedPloidy = 0; @@ -190,21 +189,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula // first element: zero ploidy, e.g. trivial degenerate distribution final int[] zeroCounts = new int[numAlleles]; final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); - set.log10Likelihoods[0] = 0.0; + set.getLog10Likelihoods()[0] = 0.0; combinedPoolLikelihoods.add(set); - for (int p=1; p maxLikelihoodSeen.maxLog10L ) - maxLikelihoodSeen.update(log10LofKs, ACset.ACcounts); + if ( log10LofKs > stateTracker.getMaxLog10L()) + stateTracker.update(log10LofKs, ACset.getACcounts()); // clean up memory - indexesToACset.remove(ACset.ACcounts); + indexesToACset.remove(ACset.getACcounts()); if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); } return newPool; @@ -253,8 +259,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param result AFResult object - * @param maxLikelihoodSeen 
max likelihood observed so far + * @param resultTracker AFResult object + * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood @@ -266,15 +272,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula final double[] log10AlleleFrequencyPriors, final int originalPloidy, final int newGLPloidy, - final AlleleFrequencyCalculationResult result, - final MaxLikelihoodSeen maxLikelihoodSeen, + final AFCalcResultTracker resultTracker, + final StateTracker stateTracker, final LinkedList ACqueue, final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods - final int numAlleles = set.ACcounts.counts.length; + final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); // add to new pool @@ -282,24 +288,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula newPool.add(set); // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { - if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { + if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, 
log10LofK, maxLikelihoodSeen.maxLog10L); + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); return log10LofK; } // iterate over higher frequencies if possible // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space - final int ACwiggle = set.ACcounts.counts[0]; + final int ACwiggle = set.getACcounts().getCounts()[0]; if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; // add conformations for other cases for ( int allele = 1; allele < numAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele]++; // is this a valid conformation? int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; @@ -329,11 +335,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula * @param ploidy2 Ploidy of second pool * @param numAlleles Number of alleles * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param result Af calculation result object + * @param resultTracker Af calculation result object */ public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + final AFCalcResultTracker resultTracker) { /* final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); @@ -387,7 +393,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula * @param numAlleles Number of alleles (including ref) * 
@param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param result AFResult object + * @param resultTracker AFResult object * @return log-likehood of requested conformation */ private static double computeLofK(final ExactACset set, @@ -395,7 +401,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula final double[] secondGL, final double[] log10AlleleFrequencyPriors, final int numAlleles, final int ploidy1, final int ploidy2, - final AlleleFrequencyCalculationResult result) { + final AFCalcResultTracker resultTracker) { final int newPloidy = ploidy1 + ploidy2; @@ -404,17 +410,17 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula if (newPloidy != totalAltK) throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); - totalAltK -= set.ACcounts.counts[0]; + totalAltK -= set.getACcounts().getCounts()[0]; // totalAltK has sum of alt alleles of conformation now // special case for k = 0 over all k if ( totalAltK == 0 ) { // all-ref case final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; - set.log10Likelihoods[0] = log10Lof0; + set.getLog10Likelihoods()[0] = log10Lof0; - result.setLog10LikelihoodOfAFzero(log10Lof0); - result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); + resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -423,12 +429,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! 
where k1=ploidy-sum_k_i - int[] currentCount = set.ACcounts.getCounts(); + int[] currentCount = set.getACcounts().getCounts(); double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); // for current conformation, get all possible ways to break vector K into two components G1 and G2 final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY; + set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; while (innerIterator.hasNext()) { // check if breaking current conformation into g1 and g2 is feasible. final int[] acCount2 = innerIterator.getCurrentVector(); @@ -444,27 +450,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); final double sum = firstGL + gl2 + num1 + num2; - set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum); + set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); } } innerIterator.next(); } - set.log10Likelihoods[0] += denom; + set.getLog10Likelihoods()[0] += denom; } - double log10LofK = set.log10Likelihoods[0]; + double log10LofK = set.getLog10Likelihoods()[0]; // update the MLE if necessary - final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length); - result.updateMLEifNeeded(log10LofK, altCounts); + final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); + resultTracker.updateMLEifNeeded(log10LofK, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - result.updateMAPifNeeded(log10LofK, altCounts); + resultTracker.updateMAPifNeeded(log10LofK, altCounts); return log10LofK; } @@ -496,12 
+502,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula * @param ploidy1 Ploidy of first pool (# of chromosomes in it) * @param ploidy2 Ploidy of second pool * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param result Af calculation result object + * @param resultTracker Af calculation result object * @return Combined likelihood vector */ public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { + final AFCalcResultTracker resultTracker) { final int newPloidy = ploidy1 + ploidy2; @@ -526,8 +532,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula final double log10Lof0 = x[0]+y[0]; - result.setLog10LikelihoodOfAFzero(log10Lof0); - result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); + resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); double maxElement = log10Lof0; int maxElementIdx = 0; @@ -569,8 +575,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula } alleleCounts[0] = k; - result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); + resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); + resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); } @@ -622,7 +628,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula // create the new likelihoods array from the alleles we are allowed to use final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); double[] newLikelihoods; - if ( numOriginalAltAlleles == numNewAltAlleles) { + + // 
Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization + // and subsetting + if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { newLikelihoods = originalLikelihoods; } else { newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 192befe67..8738def50 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -283,7 +283,7 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - final HashMap> alleleHashMap = new HashMap>(); + HashMap> alleleHashMap = new HashMap>(); int aCount = 0; for( final Allele a : mergedVC.getAlleles() ) { alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper @@ -308,9 +308,20 @@ public class GenotypingEngine { } genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - + VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); if( call != null ) { + if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some 
alleles were removed so reverseTrimming might be necessary! + final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); + // also, need to update the allele -> haplotype mapping + final HashMap> alleleHashMapTrim = new HashMap>(); + for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + } + + call = vcCallTrim; + alleleHashMap = alleleHashMapTrim; + } + returnCalls.add( new Pair>>(call, alleleHashMap) ); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index f4d8a88e0..71e4f5f8a 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); - UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, 
VariantContextUtils.DEFAULT_PLOIDY); + UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + + // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested + UnifiedArgumentCollection simpleUAC = UAC.clone(); + simpleUAC.exactCallsLog = null; + UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index db289ecab..072f81db9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -40,7 +40,6 @@ import java.util.*; public class LikelihoodCalculationEngine { private static final double LOG_ONE_HALF = -Math.log10(2.0); - private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1; private final byte constantGCP; private final boolean DEBUG; private final PairHMM pairHMM; @@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine { haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); } } - haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? 
+ haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } } } @@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } - public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, + final HashMap> perSampleReadList, + final HashMap> perSampleFilteredReadList, + final Pair>> call) { final Map returnMap = new HashMap(); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - //final Map> alleleReadMap = new HashMap>(); final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); final ArrayList readsForThisSample = sample.getValue(); @@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { for( final Allele a : call.getFirst().getAlleles() ) - likelihoodMap.add(read,a,0.0); + likelihoodMap.add(read, a, 0.0); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java index a8707641a..3e5cbf0e8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest { String name = String.format("Test-%s", params.bases); Assert.assertEquals(counts.totalCount(), params.bases.length(), name); - 
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); + Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name); Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 7b4dd9b52..4de3cd887 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -1,9 +1,9 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; import java.util.Arrays; -import org.testng.annotations.Test; /** * Created by IntelliJ IDEA. 
@@ -60,31 +60,31 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","a636ae291a27843107294f3e7940b98a"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","6d60d9f3dfe8e1580214be0d170b0fff"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","738fa68a3fc838b4bbad5c257f3e96fe"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","30abf3c1868a61145edbc08fe35c8150"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9bcf1f2c204a251ee2b0b6f17ed59a61"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","ef99bc0513d3267f43b84cb88a324376"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c73678eeaad574af9ed45045074828fa"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","8ca07270717641385fe5d2e07e530782"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d"); } @Test(enabled = true) 
public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "d1b48f6f3a175fcba9aec6d427005a45"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "f8ea18ec6a717a77fdf8c5f2482d8d8d"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java new file mode 100644 index 000000000..4ac4692d7 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -0,0 +1,603 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +public class AFCalcUnitTest extends BaseTest { + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + static int sampleNameCounter = 0; + static Genotype AA1, AB1, BB1, NON_INFORMATIVE1; + static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2; + final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true); // flat priors + + final private static boolean INCLUDE_BIALLELIC = true; + final private static boolean INCLUDE_TRIALLELIC = true; + final private 
static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug + final private static boolean DEBUG_ONLY = false; + + @BeforeSuite + public void before() { + AA1 = makePL(Arrays.asList(A, A), 0, 20, 20); + AB1 = makePL(Arrays.asList(A, C), 20, 0, 20); + BB1 = makePL(Arrays.asList(C, C), 20, 20, 0); + NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); + + AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20); + AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20); + BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20); + AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20); + BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20); + CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0); + NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0); + } + + protected static Genotype makePL(final List expectedGT, int ... pls) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(expectedGT); + gb.PL(pls); + return gb.make(); + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + final AFCalc calc; + final int[] expectedACs; + final double[] priors; + final String priorName; + + private GetGLsTest(final AFCalc calc, int numAltAlleles, List arg, final double[] priors, final String priorName) { + super(GetGLsTest.class); + GLs = GenotypesContext.create(new ArrayList(arg)); + this.numAltAlleles = numAltAlleles; + this.calc = calc; + this.priors = priors; + this.priorName = priorName; + + expectedACs = new int[numAltAlleles+1]; + for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) { + expectedACs[alleleI] = 0; + final Allele allele = getAlleles().get(alleleI); + for ( Genotype g : arg ) { + expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele); + } + } + } + + public AFCalcResult execute() { + return getCalc().getLog10PNonRef(getVC(), getPriors()); 
+ } + + public AFCalcResult executeRef() { + final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles()); + return ref.getLog10PNonRef(getVC(), getPriors()); + } + + public double[] getPriors() { + return priors; + } + + public AFCalc getCalc() { + return calc; + } + + public VariantContext getVC() { + VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles()); + builder.genotypes(GLs); + return builder.make(); + } + + public List getAlleles() { + return Arrays.asList(Allele.create("A", true), + Allele.create("C"), + Allele.create("G"), + Allele.create("T")).subList(0, numAltAlleles+1); + } + + public int getExpectedAltAC(final int alleleI) { + return expectedACs[alleleI+1]; + } + + public String toString() { + return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(), + priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs); + } + } + + @DataProvider(name = "wellFormedGLs") + public Object[][] createSimpleGLsData() { + final List biAllelicSamples = Arrays.asList(AA1, AB1, BB1); + final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); + + for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { + List calcs = AFCalcFactory.createAFCalcs( + Arrays.asList( + AFCalcFactory.Calculation.EXACT_REFERENCE, + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY + ), 4, 2, 2, 2); + + final int nPriorValues = 2*nSamples+1; + final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + final double[] humanPriors = new double[nPriorValues]; + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); + + for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { + for ( AFCalc model : calcs ) { + final String priorName = priors == 
humanPriors ? "human" : "flat"; + + // bi-allelic + if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() ) + for ( List genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 1, genotypes, priors, priorName); + + // tri-allelic + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) + new GetGLsTest(model, 2, genotypes, priors, priorName); + } + } + } + + return GetGLsTest.getTests(GetGLsTest.class); + } + + @DataProvider(name = "badGLs") + public Object[][] createBadGLs() { + final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); + final int nSamples = genotypes.size(); + + final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); + + final int nPriorValues = 2*nSamples+1; + final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors + for ( AFCalc model : Arrays.asList(indCalc) ) { + final String priorName = "flat"; + new GetGLsTest(model, 2, genotypes, priors, priorName); + } + + return GetGLsTest.getTests(GetGLsTest.class); + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") + public void testBiallelicGLs(GetGLsTest cfg) { + if ( cfg.getAlleles().size() == 2 ) + testResultSimple(cfg); + } + + @Test(enabled = true && ! 
DEBUG_ONLY, dataProvider = "wellFormedGLs") + public void testTriallelicGLs(GetGLsTest cfg) { + if ( cfg.getAlleles().size() > 2 ) + testResultSimple(cfg); + } + + @Test(enabled = true, dataProvider = "badGLs") + public void testBadGLs(GetGLsTest cfg) { + testResultSimple(cfg); + } + + private static class NonInformativeData { + final Genotype nonInformative; + final List called; + final int nAltAlleles; + + private NonInformativeData(List called, Genotype nonInformative, int nAltAlleles) { + this.called = called; + this.nonInformative = nonInformative; + this.nAltAlleles = nAltAlleles; + } + } + + @DataProvider(name = "GLsWithNonInformative") + public Object[][] makeGLsWithNonInformative() { + List tests = new ArrayList(); + + final List nonInformativeTests = new LinkedList(); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2)); + nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2)); + + for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) { + for ( final NonInformativeData testData : nonInformativeTests ) { + final List samples = new ArrayList(); + samples.addAll(testData.called); + samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); + + final int nSamples = samples.size(); + List calcs = AFCalcFactory.createAFCalcs( + Arrays.asList( + AFCalcFactory.Calculation.EXACT_REFERENCE, + AFCalcFactory.Calculation.EXACT_INDEPENDENT, + AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY + ), 4, 2, 2, 2); + + final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors + + for ( AFCalc model : calcs ) { + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); + + for ( int rotation = 0; rotation < nSamples; rotation++ ) { + Collections.rotate(samples, 1); + final GetGLsTest 
withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat"); + tests.add(new Object[]{onlyInformative, withNonInformative}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"}) + public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) { + final AFCalcResult expected = onlyInformative.execute(); + final AFCalcResult actual = withNonInformative.execute(); + + testResultSimple(withNonInformative); + compareAFCalcResults(actual, expected, onlyInformative.getCalc(), true); + } + + private void testResultSimple(final GetGLsTest cfg) { + final AFCalcResult refResultTracker = cfg.executeRef(); + final AFCalcResult resultTracker = cfg.execute(); + + compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true); + + Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping()); + Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list"); + + for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) { + int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI); + int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI]; + + final Allele allele = cfg.getAlleles().get(altAlleleI+1); + Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele); + } + } + + private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) { + // note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here + final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results + + if ( ! 
onlyPosteriorsShouldBeEqual ) { + Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0"); + Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0"); + Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0"); + Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0"); + } + Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0"); + Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0"); + Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs"); + Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping"); + + for ( final Allele a : expected.getAllelesUsedInGenotyping() ) { + if ( ! a.isReference() ) { + Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a); + // TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly +// if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) ) +// // TODO -- delete when general ploidy works properly with multi-allelics +// Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a); + } + } + } + + @Test(enabled = true && ! 
DEBUG_ONLY, dataProvider = "Models") + public void testLargeGLs(final ExactAFCalc calc) { + final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0); + GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat"); + + final AFCalcResult resultTracker = cfg.execute(); + + int calculatedAlleleCount = resultTracker.getAlleleCountsOfMLE()[0]; + Assert.assertEquals(calculatedAlleleCount, 6); + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models") + public void testMismatchedGLs(final ExactAFCalc calc) { + final Genotype AB = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000); + final Genotype AC = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100); + GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat"); + + final AFCalcResult resultTracker = cfg.execute(); + + Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[0], 1); + Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[1], 1); + } + + // -------------------------------------------------------------------------------- + // + // Code to test that the pNonRef value is meaningful + // + // -------------------------------------------------------------------------------- + + private static class PNonRefData { + final Genotype g; + final double pNonRef, tolerance; + final boolean canScale; + final List badModels; + final VariantContext vc; + + private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) { + this(vc, g, pNonRef, tolerance, canScale, Collections.emptyList()); + } + + private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List badModels) { + this.g = g; + this.pNonRef = pNonRef; + this.tolerance = tolerance; + this.canScale = canScale; + this.badModels = badModels; + this.vc = vc; + } + + public PNonRefData scale(final int scaleFactor) { + if ( canScale ) { + final 
int[] PLs = new int[g.getPL().length]; + for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1); + final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make(); + final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor); + return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true); + } else { + return this; + } + } + } + + @DataProvider(name = "PNonRef") + public Object[][] makePNonRefTest() { + List tests = new ArrayList(); + + final List AA = Arrays.asList(A, A); + final List AC = Arrays.asList(A, C); + final List CC = Arrays.asList(C, C); + final List AG = Arrays.asList(A, G); + final List GG = Arrays.asList(G, G); + final List CG = Arrays.asList(C, G); + + final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make(); + final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); + final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat; + + final List constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED); + + final double TOLERANCE = 0.5; + + final List initialPNonRefData = Arrays.asList( + // bi-allelic sites + new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true), + new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel), + new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel), + new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel), + new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true), + new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true), + + // tri-allelic sites -- cannot scale because of the naivety of our scaling estimator + new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit 
inaccurate + new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false), + new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false), + new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false), + new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false), + new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false) + ); + + for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) { + for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) { + for ( final PNonRefData rootData : initialPNonRefData ) { + for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) { + if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) { + final PNonRefData data = rootData.scale(plScale); + tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG_ONLY, dataProvider = "PNonRef") + private void testPNonRef(final VariantContext vcRoot, + AFCalcFactory.Calculation modelType, + AFCalcTestBuilder.PriorType priorType, + final List genotypes, + final double expectedPNonRef, + final double tolerance, + final int nNonInformative) { + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType); + + final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot); + vcb.genotypes(genotypes); + + final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + } + + // -------------------------------------------------------------------------------- + // + // Test priors + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "Models") + public Object[][] makeModels() { + List tests = new ArrayList(); + + for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) { + if ( calc.usableForParams(2, 4) ) + tests.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true & ! 
DEBUG_ONLY, dataProvider = "Models") + public void testBiallelicPriors(final AFCalc model) { + + for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) { + final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000); + + for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) { + final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior); + final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true); + GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior); + final AFCalcResult resultTracker = cfg.execute(); + final int actualAC = resultTracker.getAlleleCountsOfMLE()[0]; + + final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0]; + final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); + final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); + final double log10NonRefPost = Math.log10(nonRefPost); + + if ( ! Double.isInfinite(log10NonRefPost) ) + Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2); + + if ( nonRefPost >= 0.9 ) + Assert.assertTrue(resultTracker.isPolymorphic(C, -1)); + + final int expectedMLEAC = 1; // the MLE is independent of the prior + Assert.assertEquals(actualAC, expectedMLEAC, + "actual AC with priors " + log10NonRefPrior + " not expected " + + expectedMLEAC + " priors " + Utils.join(",", priors)); + } + } + } + + @Test(enabled = true && ! 
DEBUG_ONLY, dataProvider = "Models") + + // -------------------------------------------------------------------------------- + // + // Test that polymorphic sites (bi and tri) are properly called + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "polyTestProvider") + public Object[][] makePolyTestProvider() { + List tests = new ArrayList(); + + // list of all high-quality models in the system + final List models = Arrays.asList( + AFCalcFactory.Calculation.EXACT, + AFCalcFactory.Calculation.EXACT_REFERENCE, + AFCalcFactory.Calculation.EXACT_INDEPENDENT); + + // note that we cannot use small PLs here or the thresholds are hard to set + for ( final int nonTypePLs : Arrays.asList(100, 1000) ) { + for ( final AFCalcFactory.Calculation model : models ) { + for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) { + for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) { +// for ( final int nonTypePLs : Arrays.asList(10) ) { +// for ( final AFCalcFactory.Calculation model : models ) { +// for ( final int allele1AC : Arrays.asList(100) ) { +// for ( final int nSamples : Arrays.asList(1000) ) { + if ( nSamples < allele1AC ) continue; + + final double pPerSample = Math.pow(10, nonTypePLs / -10.0); + final double errorFreq = pPerSample * nSamples; + final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30; + + // bi-allelic tests + { + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human); + final List ACs = Arrays.asList(allele1AC); + tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)}); + } + + // multi-allelic tests + for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) { + if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1) + continue; + + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, 2, 
model, AFCalcTestBuilder.PriorType.human); + final List ACs = Arrays.asList(allele1AC, allele2AC); + final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90; + tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)}); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider") + public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly ) { + testCalling(testBuilder, ACs, nonTypePL, expectedPoly); + } + + @DataProvider(name = "polyTestProviderLotsOfAlleles") + public Object[][] makepolyTestProviderLotsOfAlleles() { + List tests = new ArrayList(); + + // list of all high-quality models in the system + final List models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT); + + final List alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20); + + final int nonTypePLs = 1000; + final int nAlleles = 4; + for ( final AFCalcFactory.Calculation model : models ) { + for ( final List ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) { + final List isPoly = new ArrayList(ACs.size()); + for ( final int ac : ACs ) isPoly.add(ac > 0); + + final double acSum = MathUtils.sum(ACs); + for ( final int nSamples : Arrays.asList(1, 10, 100) ) { + if ( nSamples < acSum ) continue; + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human); + tests.add(new Object[]{testBuilder, ACs, nonTypePLs, isPoly}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles") + public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly ) { + testCalling(testBuilder, ACs, nonTypePL, expectedPoly); + } + + private void testCalling(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly) { + final AFCalc calc = testBuilder.makeModel(); + final double[] priors = testBuilder.makePriors(); + final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL); + final AFCalcResult result = calc.getLog10PNonRef(vc, priors); + + boolean anyPoly = false; + for ( final boolean onePoly : expectedPoly ) anyPoly = anyPoly || onePoly; + + if ( anyPoly ) + Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1); + + for ( int altI = 1; altI < result.getAllelesUsedInGenotyping().size(); altI++ ) { + final int i = altI - 1; + final Allele alt = result.getAllelesUsedInGenotyping().get(altI); + + // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs + Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt)); + Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt)); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java new file mode 100644 index 000000000..31ec28af4 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedAFCalculationModelUnitTest.java @@ -0,0 +1,124 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import 
org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class ConstrainedAFCalculationModelUnitTest extends BaseTest { + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + protected static Genotype makePL(final List expectedGT, int ... pls) { + return AFCalcUnitTest.makePL(expectedGT, pls); + } + + @DataProvider(name = "MaxACsToVisit") + public Object[][] makeMaxACsToVisit() { + List tests = new ArrayList(); + + final int nSamples = 10; + + for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) { + final int nChrom = (nSamples - nNonInformative) * 2; + for ( int i = 0; i < nChrom; i++ ) { + // bi-allelic + tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); + + // tri-allelic + for ( int j = 0; j < (nChrom - i); j++) + tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "MaxACsToVisit") + public void testMaxACsToVisit(final int nSamples, final List requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) { + final int nAlts = requestedACs.size(); + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(nSamples, nAlts, modelType, + AFCalcTestBuilder.PriorType.human); + + final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100); + final int[] maxACsToVisit = 
((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); + + testExpectedACs(vc, maxACsToVisit); + } + + private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) { + // this is necessary because cannot ensure that the tester gives us back the + // requested ACs due to rounding errors + final List ACs = new ArrayList(); + for ( final Allele a : vc.getAlternateAlleles() ) + ACs.add(vc.getCalledChrCount(a)); + + for ( int i = 0; i < maxACsToVisit.length; i++ ) { + Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i); + } + } + + @DataProvider(name = "MaxACsGenotypes") + public Object[][] makeMaxACsForGenotype() { + List tests = new ArrayList(); + + final List AA = Arrays.asList(A, A); + final List AC = Arrays.asList(A, C); + final List CC = Arrays.asList(C, C); + final List AG = Arrays.asList(A, G); + final List GG = Arrays.asList(G, G); + final List CG = Arrays.asList(C, G); + + final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make(); + final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); + + tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)}); + tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)}); + tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)}); + + // make sure non-informative => 0 + tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)}); + tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)}); + + // multi-allelics + tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)}); + tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)}); + tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)}); + + // deal with non-informatives third alleles + tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)}); + tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)}); + tests.add(new Object[]{vc3, makePL(AC, 10, 0, 
10, 10, 0, 0)}); + tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)}); + tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)}); + tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)}); + tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)}); + tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "MaxACsGenotypes") + private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) { + final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make(); + + final AFCalcTestBuilder testBuilder + = new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED, + AFCalcTestBuilder.PriorType.human); + + final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc); + + testExpectedACs(vc, maxACsToVisit); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java similarity index 95% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 983f562d2..48f282901 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -1,6 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; +import 
org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -137,15 +138,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles); + final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); + GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; + int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; // System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java new file mode 100644 index 000000000..ed164f245 --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -0,0 +1,210 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + + +// SEE private/R/pls.R if you want the truth output for these tests +public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { + @DataProvider(name = "TestCombineGLs") + public Object[][] makeTestCombineGLs() { + List tests = new ArrayList(); + + tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)}); + tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)}); + tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)}); + + // AA AB BB AC BC CC => AA AB+BC CC + tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)}); + + tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)}); + + tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)}); + tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)}); + + tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 
50, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)}); + + tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)}); + tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)}); + + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name = "TestCombineGLsWithDrops") + public Object[][] makeTestCombineGLsWithDrops() { + List tests = new ArrayList(); + + final Set noDrops = Collections.emptySet(); + final Set drop1 = Collections.singleton(1); + final Set drop2 = Collections.singleton(2); + + // AA AB BB AC BC CC + // drop1 (B): AA AC CC + // drop2 (C): AA AB BB + tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops}); + tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops}); + tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2}); + tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1}); + + tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops}); + tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops}); + tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2}); + tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1}); + + tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops}); + tests.add(new Object[]{2, 2, 
makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops}); + tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2}); + tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1}); + + return tests.toArray(new Object[][]{}); + } + + private Genotype makePL(final int ... PLs) { + return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs); + } + + @Test(enabled = true, dataProvider = "TestCombineGLs") + private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.emptySet()); + } + + @Test(enabled = true, dataProvider = "TestCombineGLsWithDrops") + private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set allelesToDrop) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); + } + + + static Allele A = Allele.create("A", true); + static Allele C = Allele.create("C"); + static Allele G = Allele.create("G"); + + @DataProvider(name = "TestMakeAlleleConditionalContexts") + public Object[][] makeTestMakeAlleleConditionalContexts() { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A)); + final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C)); + final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G)); + final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, 
G)); + final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C)); + + final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5); + final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2); + final Genotype gACcombined = makePL(0, 2, 5); + final Genotype gAGcombined = makePL(0, 4, 9); + final Genotype gACdropped = makePL(0, 1, 2); + final Genotype gAGdropped = makePL(0, 3, 5); + + // biallelic + tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())}); + + // tri-allelic + tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())}); + tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())}); + + return tests.toArray(new Object[][]{}); + } + + + @Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts") + private void testMakeAlleleConditionalContexts(final VariantContext vc, final List expectedVCs) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final List biAllelicVCs = calc.makeAlleleConditionalContexts(vc); + + Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size()); + + for ( int i = 0; i < biAllelicVCs.size(); i++ ) { + final VariantContext actual = biAllelicVCs.get(i); + final VariantContext expected = expectedVCs.get(i); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles()); + + for ( int j = 0; j < actual.getNSamples(); j++ ) + Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL()); + } + } + + + @DataProvider(name = "ThetaNTests") + public Object[][] makeThetaNTests() { + List tests = new ArrayList(); + + final List log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0); + + for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) { 
+ for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) { + for ( List permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) { + tests.add(new Object[]{permutations, Math.pow(10, log10pRef)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ThetaNTests") + public void testThetaNTests(final List log10LAlleles, final double pRef) { + // biallelic + final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef}); + + final double log10pNonRef = Math.log10(1-pRef); + + final List originalPriors = new LinkedList(); + final List pNonRefN = new LinkedList(); + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final double log10LAllele1 = log10LAlleles.get(i); + final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true); + final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0)); + originalPriors.add(result1); + pNonRefN.add(log10pNonRef*(i+1)); + } + + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2); + final List thetaNPriors = calc.applyMultiAllelicPriors(originalPriors); + + double prevPosterior = 0.0; + for ( int i = 0; i < log10LAlleles.size(); i++ ) { + final AFCalcResult thetaN = thetaNPriors.get(i); + AFCalcResult orig = null; + for ( final AFCalcResult x : originalPriors ) + if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping())) + orig = x; + + Assert.assertNotNull(orig, "couldn't find original AFCalc"); + + Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6); + Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6); + + Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0()); + prevPosterior 
= orig.getLog10PosteriorOfAFGT0(); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index b4ac2b86d..3450725c8 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "5b751474ad0aef4cdb53f094e605f97c"); + HCTest(CEUTRIO_BAM, "", "a305107a5ec889152aa2efbe90b249d7"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "60efcd2d2722087e900f6365985d18bf"); + HCTest(NA12878_BAM, "", "0c2217ec81f19790a6d1f98ebf8cf70d"); } @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "0396c7352ab8ab98b03dca36299a0ddf"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1"); + HCTestComplexVariants(CEUTRIO_BAM, "", "2cfb7d830d5a7eb7bc754b5f688a27a5"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void 
testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8043b0451a4384e678a93600b34afce7"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d511848a46083c0d0b2495f65f162c2e"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,21 +64,36 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ea6539e05faf10ffaf76f2d16907c47a"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7e112ea4623617f1f7f8f562f54aa2aa")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c29e61810c056b52a47baae0696931ea")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c642dcd93771f6f084d55de31f180d1b")); executeTest("HCTestStructuralIndels: ", spec); } + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestReducedBam() { + 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f")); + executeTest("HC calling on a ReducedRead BAM", spec); + } + + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 1b41b85f4..0daad2c2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -114,6 +114,9 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; + public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; + private static void checkForMaskedUserErrors(final Throwable t) { final String message = t.getMessage(); if ( message == null ) @@ -133,9 +136,9 @@ public class CommandLineGATK extends CommandLineExecutable { exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); // disk is full - if ( message.contains("No space left on device") ) + if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") ) + if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); // masked out of memory error diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index c8887b8b2..7875ced5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -140,6 +140,9 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; + @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. 
To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.") + public boolean disableRandomization = false; + // -------------------------------------------------------------------------------------------------------------- // // Downsampling Arguments diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index f30fc0316..085a60191 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -1,13 +1,12 @@ package org.broadinstitute.sting.gatk.arguments; -import org.broadinstitute.sting.commandline.Advanced; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.File; + /** * Created with IntelliJ IDEA. * User: rpoplin @@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection { @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) public int MAX_ALTERNATE_ALLELES = 3; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then only this many alleles will be used. 
Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. + */ + @Advanced + @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) + public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + + @Hidden + @Argument(shortName = "logExactCalls", doc="x", required=false) + public File exactCallsLog = null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index d0e310d3f..8ee7e0439 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator { */ private FilePointer generatePointerOverEntireFileset() { FilePointer filePointer = new FilePointer(); + + // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is + // the only FilePointer we will create. 
This allows us to have this FilePointer represent regions from + // multiple contigs + filePointer.setIsMonolithic(true); + Map currentPosition; // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java index 6c064cf86..0440c7eae 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.*; @@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { */ private PeekableIterator currentContigReadsIterator = null; + /** + * How many FilePointers have we pulled from the filePointers iterator? + */ + private int totalFilePointersConsumed = 0; + + /** + * Have we encountered a monolithic FilePointer? 
+ */ + private boolean encounteredMonolithicFilePointer = false; + + { createNextContigFilePointer(); advance(); @@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { logger.info("Loading BAM index data for next contig"); while ( filePointers.hasNext() ) { + + // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that + // it is the ONLY FilePointer we ever encounter + if ( encounteredMonolithicFilePointer ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer"); + } + if ( filePointers.peek().isMonolithic() ) { + if ( totalFilePointersConsumed > 0 ) { + throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer"); + } + encounteredMonolithicFilePointer = true; + logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek())); + } + // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge if ( nextContigFilePointers.isEmpty() || @@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer { (nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) { nextContigFilePointers.add(filePointers.next()); + totalFilePointersConsumed++; } else { break; // next FilePointer is on a different contig or has different mapped/unmapped status, diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java index 50f4e0273..197015641 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java @@ -50,10 +50,28 @@ public 
class FilePointer { */ protected final boolean isRegionUnmapped; + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + */ + private boolean isMonolithic = false; + + /** + * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers + */ + private Integer contigIndex = null; + + public FilePointer( List locations ) { this.locations.addAll(locations); this.isRegionUnmapped = checkUnmappedStatus(); - validateLocations(); + + validateAllLocations(); + if ( locations.size() > 0 ) { + contigIndex = locations.get(0).getContigIndex(); + } } public FilePointer( final GenomeLoc... locations ) { @@ -80,8 +98,9 @@ public class FilePointer { return foundUnmapped; } - private void validateLocations() { - if ( isRegionUnmapped ) { + private void validateAllLocations() { + // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction + if ( isRegionUnmapped || isMonolithic ) { return; } @@ -89,13 +108,22 @@ public class FilePointer { for ( GenomeLoc location : locations ) { if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { - throw new ReviewedStingException("File pointers must contain intervals from at most one contig"); + throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig"); } previousContigIndex = location.getContigIndex(); } } + private void validateLocation( GenomeLoc location ) { + if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { + throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); + } + if ( ! isRegionUnmapped && ! 
isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { + throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + } + /** * Returns an immutable view of this FilePointer's file spans * @@ -123,6 +151,29 @@ public class FilePointer { return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; } + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + * + * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false + */ + public boolean isMonolithic() { + return isMonolithic; + } + + /** + * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all + * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic + * FP may contain intervals from more than one contig. 
+ * + * @param isMonolithic set this FP's monolithic status to this value + */ + public void setIsMonolithic( boolean isMonolithic ) { + this.isMonolithic = isMonolithic; + } + @Override public boolean equals(final Object other) { if(!(other instanceof FilePointer)) @@ -151,15 +202,12 @@ public class FilePointer { } public void addLocation(final GenomeLoc location) { - this.locations.add(location); - checkUnmappedStatus(); - validateLocations(); - } + validateLocation(location); - public void addLocations( final List locations ) { - this.locations.addAll(locations); - checkUnmappedStatus(); - validateLocations(); + this.locations.add(location); + if ( contigIndex == null ) { + contigIndex = location.getContigIndex(); + } } public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java index 662c7526b..27e666f6f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java @@ -215,19 +215,29 @@ public class ReadShard extends Shard { int start = Integer.MAX_VALUE; int stop = Integer.MIN_VALUE; String contig = null; + boolean foundMapped = false; for ( final SAMRecord read : reads ) { if ( contig != null && ! read.getReferenceName().equals(contig) ) throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. 
" + "First contig is " + contig + " next read was " + read.getReferenceName() ); contig = read.getReferenceName(); - if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); - if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + + // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates + // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, + // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment + // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* + // with unmapped mates. + if ( ! read.getReadUnmappedFlag() ) { + foundMapped = true; + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } } assert contig != null; - if ( contig.equals("*") ) // all reads are unmapped + if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped return GenomeLoc.UNMAPPED; else return parser.createGenomeLoc(contig, start, stop); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 8562ace98..bb788c89f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -1008,6 +1008,12 @@ public class SAMDataSource { } catch ( SAMFormatException e ) { throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); } + // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). + // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, + // just in case we want to change this behavior later. 
+ catch ( RuntimeException e ) { + throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); + } reader.setSAMRecordFactory(factory); reader.enableFileSource(true); reader.setValidationStringency(validationStringency); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java index 5b4be2fc6..664d96321 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java @@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.lang.reflect.Type; import java.util.List; @@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool codec = fd.getCodec(); final AbstractFeatureReader source = diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index ec3f1e5c7..1cc88fc24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import 
org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno if ( perReadAlleleLikelihoodMap.size() == 0 ) return null; - for ( Map.Entry sample : perReadAlleleLikelihoodMap.entrySet() ) - depth += sample.getValue().getNumberOfStoredElements(); + for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + } + } } else return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index ee9b51b56..d7790969e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for ( PileupElement p : pileup ) { if ( 
alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order @@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa alleleCounts.put(allele, 0); } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed - alleleCounts.put(a,alleleCounts.get(a)+1); + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index e95af71c2..ec0393cdc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements 
StandardAnnotat } else if (stratifiedPerReadAlleleLikelihoodMap != null) { // either SNP with no alignment context, or indels: per-read likelihood map needed - final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); return pValueForBestTable(table, null); } else @@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, - final Allele ref, final Allele alt) { + private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + final Allele ref = vc.getReference(); + final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); int[][] table = new int[2][2]; for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - if ( el.getKey().isReducedRead() ) // ignore reduced reads - continue; final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); @@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; - table[row][column]++; + final GATKSAMRecord read = el.getKey(); + table[row][column] += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } @@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads + if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) @@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; - table[row][column]++; + table[row][column] += p.getRepresentativeCount(); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ec873c5dd..7c7391812 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.collections.Pair; import 
org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -30,6 +29,7 @@ import java.util.Map; */ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { static final boolean DEBUG = false; + private boolean useDithering = true; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR if (refQuals.isEmpty() && altQuals.isEmpty()) return null; - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } @@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here } + + /** + * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if + * engine randomization is turned off, and if so does not dither. + * @param walker + * @param toolkit + * @param headerLines + */ + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + useDithering = ! 
toolkit.getArguments().disableRandomization; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e78b9b6fc..2c7a12a36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups); + recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); recalibrationEngine.initialize(requestedCovariates, recalibrationTables); @@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } } - private boolean readHasBeenSkipped(GATKSAMRecord read) { + private boolean readHasBeenSkipped( final GATKSAMRecord read ) { return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE); } - private boolean isLowQualityBase(GATKSAMRecord read, int offset) { - return read.getBaseQualities()[offset] < minimumQToUse; + private boolean isLowQualityBase( final PileupElement p ) { + return p.getQual() < minimumQToUse; } - private boolean readNotSeen(GATKSAMRecord read) { + private boolean readNotSeen( final GATKSAMRecord read ) { return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE); } @@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed final int offset = p.getOffset(); // This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases) - if (readHasBeenSkipped(read) || isLowQualityBase(read, 
offset)) + if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) ) continue; if (readNotSeen(read)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index f1f0ce38e..fc7d8a8a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only") + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java deleted file mode 100755 index 569cd7072..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -/** - * The model representing how we calculate a genotype given the priors and a pile - * of bases and quality scores - */ -public abstract class AlleleFrequencyCalculationModel implements Cloneable { - - public enum Model { - /** The default model with the best performance in all cases */ - EXACT - } - - protected int N; - protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - - protected Logger logger; - protected PrintStream verboseWriter; - - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { - this.N = N; - this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; - this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.logger = logger; - this.verboseWriter = verboseWriter; - } - - /** - * Wrapper class that compares two likelihoods associated with two alleles - */ - protected static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; - } - } - - /** - * Unpack GenotypesContext into arraylist of doubel values - * @param GLs Input genotype context - * @return ArrayList of doubles corresponding to GL vectors - */ - protected static ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); - - genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param log10AlleleFrequencyPriors priors - * @param result (pre-allocated) object to store likelihoods results - * @return the alleles used for genotyping - */ - protected abstract List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result); - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes - * @param ploidy - * @return GenotypesContext object - */ - protected abstract GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy); - - - // ------------------------------------------------------------------------------------- - // - // protected classes used to store exact model matrix columns - // - // ------------------------------------------------------------------------------------- - - protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - - // a wrapper around the int array so that we can make it hashable - protected static final class ExactACcounts { - - 
protected final int[] counts; - private int hashcode = -1; - - public ExactACcounts(final int[] counts) { - this.counts = counts; - } - - public int[] getCounts() { - return counts; - } - - @Override - public boolean equals(Object obj) { - return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts); - } - - @Override - public int hashCode() { - if ( hashcode == -1 ) - hashcode = Arrays.hashCode(counts); - return hashcode; - } - - @Override - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append(counts[0]); - for ( int i = 1; i < counts.length; i++ ) { - sb.append("/"); - sb.append(counts[i]); - } - return sb.toString(); - } - } - - // This class represents a column in the Exact AC calculation matrix - protected static final class ExactACset { - - // the counts of the various alternate alleles which this column represents - final ExactACcounts ACcounts; - - // the column of the matrix - final double[] log10Likelihoods; - - int sum = -1; - - public ExactACset(final int size, final ExactACcounts ACcounts) { - this.ACcounts = ACcounts; - log10Likelihoods = new double[size]; - Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); - } - - // sum of all the non-reference alleles - public int getACsum() { - if ( sum == -1 ) { - sum = 0; - for ( int count : ACcounts.getCounts() ) - sum += count; - } - return sum; - } - - public boolean equals(Object obj) { - return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts); - } - } - - protected static final class MaxLikelihoodSeen { - double maxLog10L = Double.NEGATIVE_INFINITY; - ExactACcounts ACs = null; - - public MaxLikelihoodSeen() {} - - public void update(final double maxLog10L, final ExactACcounts ACs) { - this.maxLog10L = maxLog10L; - this.ACs = ACs; - } - - // returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set - public boolean isLowerAC(final ExactACcounts otherACs) { - final 
int[] myACcounts = this.ACs.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); - - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; - } - return true; - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java deleted file mode 100644 index c93e780bf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - */ -public class AlleleFrequencyCalculationResult { - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - private double log10MLE; - private double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int POSTERIORS_CACHE_SIZE = 5000; - private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE]; - private int currentPosteriorsCacheIndex = 0; - private Double log10PosteriorMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - - - public AlleleFrequencyCalculationResult(final int maxAltAlleles) { - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - reset(); - } - - public double getLog10MLE() { - return log10MLE; - } - - public double getLog10MAP() { - return log10MAP; - } - - public double getLog10PosteriorsMatrixSumWithoutAFzero() { - if ( log10PosteriorMatrixSum == null ) { - log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); - } - return log10PosteriorMatrixSum; - } - - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - public void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = 
AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentPosteriorsCacheIndex = 0; - log10PosteriorMatrixSum = null; - } - - public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToPosteriorsCache(log10LofK); - - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToPosteriorsCache(final double log10LofK) { - // add to the cache - log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) { - final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); - log10PosteriorMatrixValues[0] = temporarySum; - currentPosteriorsCacheIndex = 1; - } - } - - public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } -} \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java deleted file mode 100755 index ba7f0f622..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ /dev/null @@ -1,481 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.io.PrintStream; -import java.util.*; - -public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - - // private final static boolean DEBUG = false; - - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - } - - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); - - final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 
2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { - logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); - GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false); - } - - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); - - return alleles; - } - - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes()); - for ( final double[] likelihoods : GLs ) { - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); - if ( alleles.alleleIndex1 != 0 ) - likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - // don't double-count it - if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != 
alleles.alleleIndex1 ) - likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - - // ------------------------------------------------------------------------------------- - // - // Multi-allelic implementation. - // - // ------------------------------------------------------------------------------------- - - public static void linearExactMultiAllelic(final GenotypesContext GLs, - final int numAlternateAlleles, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - final ArrayList genotypeLikelihoods = getGLs(GLs); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); - - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); - - // add AC=0 to the queue - int[] zeroCounts = new int[numAlternateAlleles]; - ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.ACcounts, zeroSet); - - // keep processing while we have AC conformations that need to be calculated - MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen(); - while ( !ACqueue.isEmpty() ) { - // compute log10Likelihoods - final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, 
genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); - - // adjust max likelihood seen if needed - if ( log10LofKs > maxLikelihoodSeen.maxLog10L ) - maxLikelihoodSeen.update(log10LofKs, set.ACcounts); - - // clean up memory - indexesToACset.remove(set.ACcounts); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } - } - - private static final class DependentSet { - public final int[] ACcounts; - public final int PLindex; - - public DependentSet(final int[] ACcounts, final int PLindex) { - this.ACcounts = ACcounts; - this.PLindex = PLindex; - } - } - - private static double calculateAlleleCountConformation(final ExactACset set, - final ArrayList genotypeLikelihoods, - final MaxLikelihoodSeen maxLikelihoodSeen, - final int numChr, - final LinkedList ACqueue, - final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - //if ( DEBUG ) - // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); - - // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); - - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - - // can we abort early because the log10Likelihoods are so small? 
- if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { - //if ( DEBUG ) - // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - return log10LofK; - } - - // iterate over higher frequencies if possible - final int ACwiggle = numChr - set.getACsum(); - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - final int numAltAlleles = set.ACcounts.getCounts().length; - - // add conformations for the k+1 case - for ( int allele = 0; allele < numAltAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); - ACcountsClone[allele]++; - // to get to this conformation, a sample would need to be AB (remember that ref=0) - final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different - if ( ACwiggle > 1 ) { - final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); - final ArrayList sameAlleles = new ArrayList(numAltAlleles); - - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); - ACcountsClone[allele_i]++; - ACcountsClone[allele_j]++; - - // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) - final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); - if ( allele_i == allele_j ) - sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); - else - differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); - } - } - - // IMPORTANT: we must first add 
the cases where the 2 new alleles are different so that the queue maintains its ordering - for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - return log10LofK; - } - - // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also pushes its value to the given callingSetIndex. - private static void updateACset(final int[] newSetCounts, - final int numChr, - final ExactACset dependentSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset, - final ArrayList genotypeLikelihoods) { - final ExactACcounts index = new ExactACcounts(newSetCounts); - if ( !indexesToACset.containsKey(index) ) { - ExactACset set = new ExactACset(numChr/2 +1, index); - indexesToACset.put(index, set); - ACqueue.add(set); - } - - // push data from the dependency to the new set - //if ( DEBUG ) - // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); - pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); - } - - private static void computeLofK(final ExactACset set, - final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - set.log10Likelihoods[0] = 0.0; // the zero case - final int totalK = set.getACsum(); - - // special case for k = 0 over all k - if ( totalK == 0 ) { - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) - set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; - - final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; - result.setLog10LikelihoodOfAFzero(log10Lof0); - 
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return; - } - - // if we got here, then k > 0 for at least one k. - // the non-AA possible conformations were already dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); - } - - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; - } - - double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - - // update the MLE if necessary - result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); - - // apply the priors over each alternate allele - for ( final int ACcount : set.ACcounts.getCounts() ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); - } - - private static void pushData(final ExactACset targetSet, - final ExactACset dependentSet, - final int PLsetIndex, - final ArrayList genotypeLikelihoods) { - final int totalK = targetSet.getACsum(); - - for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = - determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; - targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], 
conformationValue); - } - } - } - - private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - - // the closed form representation generalized for multiple alleles is as follows: - // AA: (2j - totalK) * (2j - totalK - 1) - // AB: 2k_b * (2j - totalK) - // AC: 2k_c * (2j - totalK) - // BB: k_b * (k_b - 1) - // BC: 2 * k_b * k_c - // CC: k_c * (k_c - 1) - - // find the 2 alleles that are represented by this PL index - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** - // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** - - // the AX het case - if ( alleles.alleleIndex1 == 0 ) - return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; - - final int k_i = ACcounts[alleles.alleleIndex1-1]; - - // the hom var case (e.g. BB, CC, DD) - final double coeff; - if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { - coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; - } - // the het non-ref case (e.g. BC, BD, CD) - else { - final int k_j = ACcounts[alleles.alleleIndex2-1]; - coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; - } - - return coeff; - } - - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); - } - - // ------------------------------------------------------------------------------------- - // - // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. 
- // - // ------------------------------------------------------------------------------------- - - /** - * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors - * for the exact model calculation - */ -/* - private final static class ExactACCache { - double[] kMinus2, kMinus1, kMinus0; - - private final static double[] create(int n) { - return new double[n]; - } - - public ExactACCache(int n) { - kMinus2 = create(n); - kMinus1 = create(n); - kMinus0 = create(n); - } - - final public void rotate() { - double[] tmp = kMinus2; - kMinus2 = kMinus1; - kMinus1 = kMinus0; - kMinus0 = tmp; - } - - final public double[] getkMinus2() { - return kMinus2; - } - - final public double[] getkMinus1() { - return kMinus1; - } - - final public double[] getkMinus0() { - return kMinus0; - } - } - - public int linearExact(GenotypesContext GLs, - double[] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyLikelihoods, - double[][] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(GLs); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - final ExactACCache logY = new ExactACCache(numSamples+1); - logY.getkMinus0()[0] = 0.0; // the zero case - - double maxLog10L = Double.NEGATIVE_INFINITY; - boolean done = false; - int lastK = -1; - - for (int k=0; k <= numChr && ! 
done; k++ ) { - final double[] kMinus0 = logY.getkMinus0(); - - if ( k == 0 ) { // special case for k = 0 - for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; - } - } else { // k > 0 - final double[] kMinus1 = logY.getkMinus1(); - final double[] kMinus2 = logY.getkMinus2(); - - for ( int j=1; j <= numSamples; j++ ) { - final double[] gl = genotypeLikelihoods.get(j); - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - - double aa = Double.NEGATIVE_INFINITY; - double ab = Double.NEGATIVE_INFINITY; - if (k < 2*j-1) - aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; - - if (k < 2*j) - ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; - - double log10Max; - if (k > 1) { - final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; - log10Max = approximateLog10SumLog10(aa, ab, bb); - } else { - // we know we aren't considering the BB case, so we can use an optimized log10 function - log10Max = approximateLog10SumLog10(aa, ab); - } - - // finally, update the L(j,k) value - kMinus0[j] = log10Max - logDenominator; - } - } - - // update the posteriors vector - final double log10LofK = kMinus0[numSamples]; - log10AlleleFrequencyLikelihoods[0][k] = log10LofK; - log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; - - // can we abort early? 
- lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); - if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); - done = true; - } - - logY.rotate(); - } - - return lastK; - } - - final static double approximateLog10SumLog10(double a, double b, double c) { - return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); - } -*/ - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 30c0f3e18..885463fcb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -41,7 +42,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -75,10 +76,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection 
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - @Hidden - @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false) - public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false; - // indel-related arguments /** * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. @@ -160,7 +157,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy */ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; + public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; @Hidden @Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false) @@ -186,7 +183,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. 
Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); @@ -212,7 +208,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; uac.alleles = alleles; uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; + uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; uac.GLmodel = GLmodel; uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; uac.referenceSampleRod = referenceSampleRod; @@ -224,6 +220,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.minReferenceDepth = minReferenceDepth; uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; + uac.exactCallsLog = exactCallsLog; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; @@ -239,8 +236,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.GenotypingMode = SCAC.GenotypingMode; this.heterozygosity = SCAC.heterozygosity; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.exactCallsLog = SCAC.exactCallsLog; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 
0d1997252..3116d3a7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2"); } - if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL) + if (UAC.AFmodel != AFCalc.Model.POOL) throw new UserException("Incorrect AF Calculation model. 
Only POOL model supported if sample ploidy != 2"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 469d63b8a..a52b5dfe6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -34,6 +34,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -78,11 +81,7 @@ public class UnifiedGenotyperEngine { private ThreadLocal> glcm = new ThreadLocal>(); // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // the allele frequency likelihoods and posteriors (allocated once as an optimization) - private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); - private ThreadLocal posteriorsArray = new ThreadLocal(); + private ThreadLocal afcm = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[] log10AlleleFrequencyPriorsSNPs; @@ -105,8 +104,6 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; - protected static final double 
SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL; - // --------------------------------------------------------------------------------------------------------- // // Public interface functions @@ -355,11 +352,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); - posteriorsArray.set(new double[2]); + afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } - AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { @@ -370,8 +364,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - AFresult.reset(); - List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); + AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? 
boolean bestGuessIsRef = true; @@ -380,50 +373,43 @@ public class UnifiedGenotyperEngine { final List myAlleles = new ArrayList(vc.getAlleles().size()); final List alleleCountsofMLE = new ArrayList(vc.getAlleles().size()); myAlleles.add(vc.getReference()); - for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - final Allele alternateAllele = vc.getAlternateAllele(i); - final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); - // the genotyping model may have stripped it out - if ( indexOfAllele == -1 ) + for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { + final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); + if ( alternateAllele.isReference() ) continue; - final int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + // we are non-ref if the probability of being non-ref > the emit confidence. + // the emit confidence is phred-scaled, say 30 => 10^-3. + // the posterior AF > 0 is log10: -5 => 10^-5 + // we are non-ref if 10^-5 < 10^-3 => -5 < -3 + final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); // if the most likely AC is not 0, then this is a good alternate allele to use - if ( indexOfBestAC != 0 ) { + if ( isNonRef ) { myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } } - // calculate p(f>0): - final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, 
posteriorsArray.get()); - final double PofF = 1.0 - normalizedPosteriors[0]; + final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); - double phredScaledConfidence; - if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); - if ( Double.isInfinite(phredScaledConfidence) ) { - final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); - } - } + // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice + final double phredScaledConfidence = + Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? -10 * AFresult.getLog10PosteriorOfAFEq0() + : -10 * AFresult.getLog10PosteriorOfAFGT0()); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF); + return limitedContext ? 
null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC @@ -439,7 +425,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -470,27 +456,25 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List allAllelesToUse = builder.make().getAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); - afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); + AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); + double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod 
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult.reset(); - afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); + AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); + double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -513,10 +497,10 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
- if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) + if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); - if ( annotationEngine != null && !limitedContext ) { + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); @@ -524,13 +508,7 @@ public class UnifiedGenotyperEngine { vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - } - - public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - return MathUtils.normalizeFromLog10(normalizedPosteriors); + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -633,8 +611,6 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE())); - 
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -700,7 +676,7 @@ public class UnifiedGenotyperEngine { return models; } - protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { double sum = 0.0; @@ -754,34 +730,6 @@ public class UnifiedGenotyperEngine { return glcm; } - private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { - - List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); - - // user-specified name - String afModelName = UAC.AFmodel.name(); - - if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) - afModelName = GPSTRING + afModelName; - - for (int i = 0; i < afClasses.size(); i++) { - Class afClass = afClasses.get(i); - String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); - if (afModelName.equalsIgnoreCase(key)) { - try { - Object args[] = new Object[]{UAC,N,logger,verboseWriter}; - Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); - - return (AlleleFrequencyCalculationModel)c.newInstance(args); - } - catch (Exception e) { - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); - } - } - } - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); - } - public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { if ( tracker == null || ref == null || logger == null ) return null; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java new file mode 100755 index 000000000..f87084a9c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.List; + + +/** + * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods + * + */ +public abstract class AFCalc implements Cloneable { + private final static Logger defaultLogger = Logger.getLogger(AFCalc.class); + + protected final int nSamples; + protected final int maxAlternateAllelesToGenotype; + protected final int maxAlternateAllelesForIndels; + + protected Logger logger = defaultLogger; + + private SimpleTimer callTimer = new SimpleTimer(); + private PrintStream callReport = null; + private final AFCalcResultTracker resultTracker; + + protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); + if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy); + + 
this.nSamples = nSamples; + this.maxAlternateAllelesToGenotype = maxAltAlleles; + this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; + this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + } + + public void enableProcessLog(final File exactCallsLog) { + initializeOutputFile(exactCallsLog); + } + + public void setLogger(Logger logger) { + this.logger = logger; + } + + /** + * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc + * + * @param vc the VariantContext holding the alleles and sample information + * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i) + * @return result (for programming convenience) + */ + public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); + if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + + // reset the result, so we can store our new result there + resultTracker.reset(); + + final VariantContext vcWorking = reduceScope(vc); + + callTimer.start(); + final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors); + final long nanoTime = callTimer.getElapsedTimeNano(); + + if ( callReport != null ) + printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero()); + + return result; + } + + @Deprecated + protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + } + + // 
--------------------------------------------------------------------------- + // + // Abstract methods that should be implemented by concrete implementations + // to actually calculate the AF + // + // --------------------------------------------------------------------------- + + /** + * Look at VC and perhaps return a new one of reduced complexity, if that's necessary + * + * Used before the call to computeLog10PNonRef to simply the calculation job at hand, + * if vc exceeds bounds. For example, if VC has 100 alt alleles this function + * may decide to only genotype the best 2 of them. + * + * @param vc the initial VC provided by the caller to this AFcalculation + * @return a potentially simpler VC that's more tractable to genotype + */ + @Requires("vc != null") + @Ensures("result != null") + protected abstract VariantContext reduceScope(final VariantContext vc); + + /** + * Actually carry out the log10PNonRef calculation on vc, storing results in results + * + * @param vc variant context with alleles and genotype likelihoods + * @param log10AlleleFrequencyPriors priors + * @return a AFCalcResult object describing the results of this calculation + */ + // TODO -- add consistent requires among args + protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors); + + /** + * Must be overridden by concrete subclasses + * + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + public abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); + + // --------------------------------------------------------------------------- + // + // accessors + // + // --------------------------------------------------------------------------- + + public int getMaxAltAlleles() { + return 
Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); + } + + + // --------------------------------------------------------------------------- + // + // Print information about the call to the calls log + // + // --------------------------------------------------------------------------- + + private void initializeOutputFile(final File outputFile) { + try { + if (outputFile != null) { + callReport = new PrintStream( new FileOutputStream(outputFile) ); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + private void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final double log10PosteriorOfAFzero) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for ( final Allele a : vc.getAlleles() ) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for ( final Genotype g : vc.getGenotypes() ) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ ) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero); + + callReport.flush(); + } + + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + + public AFCalcResultTracker getResultTracker() { + return resultTracker; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java new file mode 100644 index 000000000..981100eaa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -0,0 +1,226 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.lang.reflect.Constructor; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * Factory to make AFCalculations + */ +public class AFCalcFactory { + /** + * Enumeration of usable AF calculation, their constraints (i.e. ploidy). + * + * Note that the order these occur in the enum is the order of preference, so + * the first value is taken over the second when multiple calculations satisfy + * the needs of the request (i.e., considering ploidy). + */ + public enum Calculation { + /** The default implementation */ + EXACT(ReferenceDiploidExactAFCalc.class, 2, -1), + + /** reference implementation of multi-allelic EXACT model */ + EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), + + /** expt. implementation */ + @Deprecated + EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1), + + /** expt. 
implementation -- for testing only */ + EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), + + /** original biallelic exact model, for testing only */ + EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), + + /** implementation that supports any sample ploidy */ + EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1); + + /** + * Must be a name because we look this up dynamically + */ + public final String className; + public final int maxAltAlleles; + public final int requiredPloidy; + + private Calculation(final String className, final int requiredPloidy, final int maxAltAlleles) { + this.className = className; + this.requiredPloidy = requiredPloidy; + this.maxAltAlleles = maxAltAlleles; + } + + private Calculation(final Class clazz, final int requiredPloidy, final int maxAltAlleles) { + this(clazz.getSimpleName(), requiredPloidy, maxAltAlleles); + } + + public boolean usableForParams(final int requestedPloidy, final int requestedMaxAltAlleles) { + return (requiredPloidy == -1 || requiredPloidy == requestedPloidy) + && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles); + } + } + + private static final Map> afClasses; + static { + afClasses = new PluginManager(AFCalc.class).getPluginsByName(); + } + + private AFCalcFactory() { + + } + + private static Class getClassByName(final String name) { + for ( final Class clazz : afClasses.values() ) { + if ( clazz.getSimpleName().contains(name) ) { + return clazz; + } + } + + return null; + } + + /** + * Create a new AFCalc based on the parameters in the UAC + * + * @param UAC the UnifiedArgumentCollection containing the command-line parameters for the caller + * @param nSamples the number of samples we will be using + * @param logger an optional (can be null) logger to override the default in the model + * @return an initialized AFCalc + */ + public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC, + final int nSamples, + final Logger logger) { + final int 
maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS); + if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) { + logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option"); + final List supportingCalculations = new LinkedList(); + for ( final Calculation calc : Calculation.values() ) { + if ( calc.usableForParams(UAC.samplePloidy, maxAltAlleles) ) + supportingCalculations.add(calc); + } + + if ( supportingCalculations.isEmpty() ) + throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.samplePloidy + " and max alt alleles " + maxAltAlleles); + else if ( supportingCalculations.size() > 1 ) + logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily"); + else + UAC.AFmodel = supportingCalculations.get(0); + logger.info("Selecting model " + UAC.AFmodel); + } + + final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy); + + if ( logger != null ) calc.setLogger(logger); + if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); + + return calc; + } + + /** + * Create a new AFCalc, choosing the best implementation based on the given parameters, assuming + * that we will only be requesting bi-allelic variants to diploid genotypes + * + * @param nSamples the number of samples we'll be using + * + * @return an initialized AFCalc + */ + public static AFCalc createAFCalc(final int nSamples) { + return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2); + } + + /** + * Create a new AFCalc that supports maxAltAlleles for all variants and diploid genotypes + * + * @param calc the calculation we'd like to use + * @param nSamples the number of samples we'll be using + * @param maxAltAlleles the max. 
alt alleles for both SNPs and indels + * + * @return an initialized AFCalc + */ + public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) { + return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2); + } + + /** + * Create a new AFCalc, choosing the best implementation based on the given parameters + * + * @param nSamples the number of samples we'll be using + * @param maxAltAlleles the max. alt alleles to consider for SNPs + * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs + * @param ploidy the sample ploidy. Must be consistent with the calc + * + * @return an initialized AFCalc + */ + public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); + return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + } + + /** + * Choose the best calculation for nSamples and ploidy + * + * @param nSamples + * @param ploidy + * @param maxAltAlleles + * @return + */ + private static Calculation chooseBestCalculation(final int nSamples, final int ploidy, final int maxAltAlleles) { + for ( final Calculation calc : Calculation.values() ) { + if ( calc.usableForParams(ploidy, maxAltAlleles) ) { + return calc; + } + } + + throw new IllegalStateException("no calculation found that supports nSamples " + nSamples + " ploidy " + ploidy + " and maxAltAlleles " + maxAltAlleles); + } + + /** + * Create a new AFCalc + * + * @param calc the calculation to use + * @param nSamples the number of samples we'll be using + * @param maxAltAlleles the max. alt alleles to consider for SNPs + * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs + * @param ploidy the sample ploidy. 
Must be consistent with the calc + * + * @return an initialized AFCalc + */ + public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null"); + if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); + if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); + if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy); + + final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); + if ( ! calc.usableForParams(ploidy, maxAlt) ) + throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy); + + final Class afClass = getClassByName(calc.className); + if ( afClass == null ) + throw new IllegalArgumentException("Unexpected AFCalc " + calc); + + try { + Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy}; + Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class); + return (AFCalc)c.newInstance(args); + } catch (Exception e) { + throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e); + } + } + + protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + final List AFCalcs = new LinkedList(); + + for ( final Calculation calc : calcs ) + AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy)); + + return AFCalcs; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java new file mode 100644 index 000000000..da7fd08ce --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Describes the results of the AFCalc + * + * Only the bare essentials are represented here, as all AFCalc models must return meaningful results for + * all of these fields. + * + * Note that all of the values -- i.e. priors -- are checked now that they are meaningful, which means + * that users of this code can rely on the values coming out of these functions. + */ +public class AFCalcResult { + private final static int AF0 = 0; + private final static int AF1p = 1; + private final static int LOG_10_ARRAY_SIZES = 2; + + private final double[] log10LikelihoodsOfAC; + private final double[] log10PriorsOfAC; + private final double[] log10PosteriorsOfAC; + + private final Map log10pNonRefByAllele; + + /** + * The AC values for all ALT alleles at the MLE + */ + private final int[] alleleCountsOfMLE; + + int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + */ + public AFCalcResult(final int[] alleleCountsOfMLE, + final int nEvaluations, + final List allelesUsedInGenotyping, + final double[] log10LikelihoodsOfAC, + final double[] log10PriorsOfAC, + final Map log10pNonRefByAllele) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); + if ( alleleCountsOfMLE == null ) throw new 
IllegalArgumentException("alleleCountsOfMLE cannot be null"); + if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); + if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); + if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); + if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); + if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); + if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! 
goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + + this.alleleCountsOfMLE = alleleCountsOfMLE; + this.nEvaluations = nEvaluations; + this.allelesUsedInGenotyping = allelesUsedInGenotyping; + + this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); + this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); + this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); + this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + } + + /** + * Return a new AFCalcResult with a new prior probability + * + * @param log10PriorsOfAC + * @return + */ + public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + * + * @return a vector with allele counts, not all of which may be meaningful + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + /** + * Returns the AC of allele a la #getAlleleCountsOfMLE + * + * @param allele the allele whose AC we want to know. 
Error if its not in allelesUsedInGenotyping + * @throws IllegalStateException if allele isn't in allelesUsedInGenotyping + * @return the AC of allele + */ + public int getAlleleCountAtMLE(final Allele allele) { + return getAlleleCountsOfMLE()[altAlleleIndex(allele)]; + } + + /** + * Returns the number of cycles used to evaluate the pNonRef for this AF calculation + * + * @return the number of evaluations required to produce the answer for this AF calculation + */ + public int getnEvaluations() { + return nEvaluations; + } + + /** + * Get the list of alleles actually used in genotyping. + * + * Due to computational / implementation constraints this may be smaller than + * the actual list of alleles requested + * + * @return a non-empty list of alleles used during genotyping + */ + @Ensures({"result != null", "! result.isEmpty()"}) + public List getAllelesUsedInGenotyping() { + return allelesUsedInGenotyping; + } + + /** + * Get the log10 normalized -- across all ACs -- posterior probability of AC == 0 for all alleles + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10PosteriorOfAFEq0() { + return log10PosteriorsOfAC[AF0]; + } + + /** + * Get the log10 normalized -- across all ACs -- posterior probability of AC > 0 for any alleles + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10PosteriorOfAFGT0() { + return log10PosteriorsOfAC[AF1p]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- likelihood of AC == 0 for all alleles + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10LikelihoodOfAFEq0() { + return log10LikelihoodsOfAC[AF0]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- likelihood of AC > 0 for any alleles + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10LikelihoodOfAFGT0() { + return log10LikelihoodsOfAC[AF1p]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- 
prior probability of AC == 0 for all alleles + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10PriorOfAFEq0() { + return log10PriorsOfAC[AF0]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- prior probability of AC > 0 + * + * @return + */ + @Ensures({"goodLog10Probability(result)"}) + public double getLog10PriorOfAFGT0() { + return log10PriorsOfAC[AF1p]; + } + + /** + * Are we sufficiently confidence in being non-ref that the site is considered polymorphic? + * + * We are non-ref if the probability of being non-ref > the emit confidence (often an argument). + * Suppose posterior AF > 0 is log10: -5 => 10^-5 + * And that log10minPNonRef is -3. + * We are considered polymorphic since 10^-5 < 10^-3 => -5 < -3 + * + * @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic + * + * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 + */ + public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { + return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; + } + + /** + * Returns the log10 probability that allele is segregating + * + * Unlike the sites-level annotation, this calculation is specific to allele, and can be + * used to separately determine how much evidence there is that allele is independently + * segregating as opposed to the site being polymorphic with any allele. 
In the bi-allelic + * case these are obviously the same but for multiple alt alleles there can be lots of + * evidence for one allele but not so much for any other allele + * + * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping + * @return the log10 probability that allele is segregating at this site + */ + @Ensures("goodLog10Probability(result)") + public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pNonRefByAllele.get(allele); + if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); + return log10pNonRef; + } + + /** + * Returns the log10 normalized posteriors given the log10 likelihoods and priors + * + * @param log10LikelihoodsOfAC + * @param log10PriorsOfAC + * + * @return freshly allocated log10 normalized posteriors vector + */ + @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") + @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { + final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; + for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) + log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i]; + + // necessary because the posteriors may be so skewed that the log-space normalized value isn't + // good, so we have to try both log-space normalization as well as the real-space normalization if the + // result isn't good + final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true); + if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) ) + return logNormalized; + else + return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); + } + + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param 
shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Computes the offset into linear vectors indexed by alt allele for allele + * + * Things like our MLE allele count vector are indexed by alt allele index, with + * the first alt allele being 0, the second 1, etc. This function computes the index + * associated with allele. + * + * @param allele the allele whose alt index we'd like to know + * @throws IllegalArgumentException if allele isn't in allelesUsedInGenotyping + * @return an index value greater than 0 suitable for indexing into the MLE and other alt allele indexed arrays + */ + @Requires("allele != null") + @Ensures({"result >= 0", "result < allelesUsedInGenotyping.size() - 1"}) + private int altAlleleIndex(final Allele allele) { + if ( allele.isReference() ) throw new IllegalArgumentException("Cannot get the alt allele index for reference allele " + allele); + final int index = allelesUsedInGenotyping.indexOf(allele); + if ( index == -1 ) + throw new IllegalArgumentException("could not find allele " + allele + " in " + allelesUsedInGenotyping); + else + return index - 1; + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + private static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! 
Double.isNaN(result); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java new file mode 100644 index 000000000..5c926a4d8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. 
+ * User: ebanks + * Date: Dec 14, 2011 + * + * Useful helper class to communicate the results of the allele frequency calculation + * + * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? + */ +class AFCalcResultTracker { + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + + // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles + protected double log10MLE; + protected double log10MAP; + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + // The posteriors seen, not including that of AF=0 + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; + private int currentLikelihoodsCacheIndex = 0; + protected Double log10LikelihoodsMatrixSum = null; + + // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + private double log10LikelihoodOfAFzero; + private double log10PosteriorOfAFzero; + private int[] AClimits; + + int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public AFCalcResultTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). 
The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + * + * @return a vector with allele counts, not all of which may be meaningful + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MAP + * + * @see #getAlleleCountsOfMLE() for the encoding of results in this vector + * + * @return a non-null vector of ints + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } + + /** + * Returns the likelihoods summed across all AC values for AC > 0 + * + * @return + */ + public double getLog10LikelihoodOfAFNotZero() { + if ( log10LikelihoodsMatrixSum == null ) { + if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); + } + return log10LikelihoodsMatrixSum; + } + + public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + } + + /** + * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should + * + * @return + */ + public double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } + + /** + * TODO -- eric what is this supposed to return? 
my unit tests don't do what I think they should + * + * @return + */ + public double getLog10PosteriorOfAFzero() { + return log10PosteriorOfAFzero; + } + + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + // TODO -- replace with more meaningful computation + // TODO -- refactor this calculation into the ref calculation + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); + } + + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); + } + + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; + for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { + alleleCountsOfMLE[i] = 0; + alleleCountsOfMAP[i] = 0; + } + currentLikelihoodsCacheIndex = 0; 
+ log10LikelihoodsMatrixSum = null; + allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); + } + + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; + } + + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMLE[i] = alleleCountsForK[i]; + } + } + + protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + if ( log10LofK > log10MAP ) { + log10MAP = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMAP[i] = alleleCountsForK[i]; + } + } + + private void addToLikelihoodsCache(final double log10LofK) { + // add to the cache + log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); + Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); + log10LikelihoodsMatrixValues[0] = temporarySum; + currentLikelihoodsCacheIndex = 1; + } + } + + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + 
Arrays.fill(alleleCountsOfMAP, 0); + } + } + + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; + } + + protected void setAClimits(int[] AClimits) { + this.AClimits = AClimits; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java new file mode 100644 index 000000000..36d53ceaa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalc.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +@Deprecated +public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc { + protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + } + + protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { + final int[] maxACsToConsider = computeMaxACs(vc); + resultTracker.setAClimits(maxACsToConsider); + return new StateTracker(maxACsToConsider); + } + + /** + * Computes the maximum ACs we need to consider for each alt allele + * + * Walks over the genotypes in VC, and computes for each alt allele the maximum + 
* AC we need to consider in that alt allele dimension. Does the calculation + * based on the PLs in each genotype g, choosing to update the max AC for the + * alt alleles corresponding to that PL. Only takes the first lowest PL, + * if there are multiple genotype configurations with the same PL value. It + * takes values in the order of the alt alleles. + * + * @param vc the variant context we will compute max alt alleles for + * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the + * first alt allele. + */ + @Ensures("result != null") + protected final int[] computeMaxACs(final VariantContext vc) { + final int[] maxACs = new int[vc.getNAlleles()-1]; + + for ( final Genotype g : vc.getGenotypes() ) + updateMaxACs(g, maxACs); + + return maxACs; + } + + /** + * Update the maximum achievable allele counts in maxAC according to the PLs in g + * + * Selects the maximum genotype configuration from the PLs in g, and updates + * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates + * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for + * many number of alt alleles (determined by length of maxACs). + * + * If the max PL occurs at 0/0, updates nothing + * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have + * the same PL value, then updates the first one. 
+ * + * Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL, + * then only first one (1) will be updated + * + * @param g the genotype to update + * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele) + */ + @Requires({ + "g != null", + "maxACs != null", + "goodMaxACs(maxACs)"}) + private void updateMaxACs(final Genotype g, final int[] maxACs) { + final int[] PLs = g.getLikelihoods().getAsPLs(); + + int minPLi = 0; + int minPL = PLs[0]; + + for ( int i = 0; i < PLs.length; i++ ) { + if ( PLs[i] < minPL ) { + minPL = PLs[i]; + minPLi = i; + } + } + + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi); + updateMaxACs(maxACs, pair.alleleIndex1); + updateMaxACs(maxACs, pair.alleleIndex2); + } + + /** + * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref) + * + * If alleleI == 0 => doesn't update anything + * else maxACs[alleleI - 1]++ + * + * @param maxACs array of max alt allele ACs + * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles. + */ + @Requires({ + "alleleI >= 0", + "(alleleI - 1) < maxACs.length", + "goodMaxACs(maxACs)"}) + private void updateMaxACs(final int[] maxACs, final int alleleI) { + if ( alleleI > 0 ) + maxACs[alleleI-1]++; + } + + private static boolean goodMaxACs(final int[] maxACs) { + return MathUtils.sum(maxACs) >= 0; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java new file mode 100755 index 000000000..8b12dff61 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public abstract class DiploidExactAFCalc extends ExactAFCalc { + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); + } + + protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker); + + @Override + protected AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final int numAlternateAlleles = vc.getNAlleles() - 1; + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + // keep processing while we have AC conformations that need to be calculated + final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); + + while ( !ACqueue.isEmpty() ) { + getResultTracker().incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + + if ( stateTracker.withinMaxACs(set.getACcounts()) ) { + final double log10LofKs = calculateAlleleCountConformation(set, 
genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + + // adjust max likelihood seen if needed + stateTracker.update(log10LofKs, set.getACcounts()); + + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + } + + return resultFromTracker(vc, log10AlleleFrequencyPriors); + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { + logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // based on 
the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes(), true); + for ( final double[] likelihoods : GLs ) { + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + if ( alleles.alleleIndex1 != 0 ) + likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final StateTracker stateTracker, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors, + final AFCalcResultTracker resultTracker) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the 
log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + + final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // can we abort early because the log10Likelihoods are so small? + if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.getACcounts().getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = 
GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private void updateACset(final StateTracker stateTracker, + final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors, + final AFCalcResultTracker resultTracker) { + + set.getLog10Likelihoods()[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( 
int j = 1; j < set.getLog10Likelihoods().length; j++ ) + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); + resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; + set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; + } + + double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // update the MLE if necessary + resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + + // apply the priors over each alternate allele + for ( final int ACcount : set.getACcounts().getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + } + + private void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.getLog10Likelihoods().length; 
j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; + targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); + } + } + } + + private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. 
BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java new file mode 100644 index 000000000..af6d46eb8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import java.util.Arrays; + +/** +* Created with IntelliJ IDEA. +* User: depristo +* Date: 10/5/12 +* Time: 2:54 PM +* To change this template use File | Settings | File Templates. 
+*/ // a wrapper around the int array so that we can make it hashable +public final class ExactACcounts { + private final int[] counts; + private int hashcode = -1; + + public ExactACcounts(final int[] counts) { + this.counts = counts; + } + + public int[] getCounts() { + return counts; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof ExactACcounts) && Arrays.equals(getCounts(), ((ExactACcounts) obj).getCounts()); + } + + @Override + public int hashCode() { + if ( hashcode == -1 ) + hashcode = Arrays.hashCode(getCounts()); + return hashcode; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append(getCounts()[0]); + for ( int i = 1; i < getCounts().length; i++ ) { + sb.append("/"); + sb.append(getCounts()[i]); + } + return sb.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java new file mode 100644 index 000000000..de5bad57f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.Arrays; + +/** +* Created with IntelliJ IDEA. +* User: depristo +* Date: 10/5/12 +* Time: 2:53 PM +* To change this template use File | Settings | File Templates. 
+*/ // This class represents a column in the Exact AC calculation matrix +public final class ExactACset { + // the counts of the various alternate alleles which this column represents + private final ExactACcounts ACcounts; + + // the column of the matrix + private final double[] log10Likelihoods; + + int sum = -1; + + public ExactACset(final int size, final ExactACcounts ACcounts) { + this.ACcounts = ACcounts; + log10Likelihoods = new double[size]; + Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); + } + + /** + * sum of all the non-reference alleles + */ + public int getACsum() { + if ( sum == -1 ) + sum = (int)MathUtils.sum(getACcounts().getCounts()); + return sum; + } + + public boolean equals(Object obj) { + return (obj instanceof ExactACset) && getACcounts().equals(((ExactACset)obj).getACcounts()); + } + + public ExactACcounts getACcounts() { + return ACcounts; + } + + public double[] getLog10Likelihoods() { + return log10Likelihoods; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java new file mode 100755 index 000000000..df0793352 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; + +/** + * Uses the Exact calculation of Heng Li + */ +abstract class ExactAFCalc extends AFCalc { + protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + + protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + } + + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; + } + } + + /** + * Unpack GenotypesContext into an ArrayList of double values + * @param GLs Input genotype context + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { + ArrayList genotypeLikelihoods = new ArrayList(GLs.size() + 1); + + if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { + if ( sample.hasLikelihoods() ) { + double[] gls = sample.getLikelihoods().getAsVector(); + + if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL ) + genotypeLikelihoods.add(gls); + } + } + + return genotypeLikelihoods; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java new file mode 100755 index 000000000..0ac964c9c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + private final static class CompareAFCalcResultsByPNonRef implements Comparator { + @Override + public int compare(AFCalcResult o1, AFCalcResult o2) { + return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0()); + } + } + + private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); + + final ReferenceDiploidExactAFCalc refModel; + + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); + } + + @Override + protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { + return refModel.makeMaxLikelihood(vc, resultTracker); + } + + private static class MyAFCalcResult extends AFCalcResult { + final List supporting; + + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List 
allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + this.supporting = supporting; + } + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc); + final List independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors); + final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); + return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors); + } + + protected final double computelog10LikelihoodOfRef(final VariantContext vc) { + // this value is just the likelihood of AF == 0 in the special constrained multi-allelic calculation + final List allGLs = getGLs(vc.getGenotypes(), false); + double log10LikelihoodOfHomRef = 0.0; + + // TODO -- can be easily optimized (currently looks at all GLs via getGLs) + for ( int i = 0; i < allGLs.size(); i++ ) { + final double[] GLs = allGLs.get(i); + log10LikelihoodOfHomRef += GLs[0]; + //log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0]; + } + + return log10LikelihoodOfHomRef; + } + + /** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt alleles: A* with C and T. 
This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the + * band at the top where AF_t > 0 and AF_c == 0 + * + * So (4) is actually (1) + (2). + * + * (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating + * + * (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything] + * (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use] + * + * This function implements the conditional likelihoods summation for any number of alt + * alleles (not just the tri-allelic case), where each subsequent variant context is + * further constrained such that each already considered allele x has AF_x == 0 in the + * compute. 
+ * + * @param vc + * @param log10AlleleFrequencyPriors + * @return + */ + protected List computeAlleleConditionalExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List results = new LinkedList(); + + for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { + final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + results.add(resultTracker); + } + + return results; + } + + protected List makeAlleleConditionalContexts(final VariantContext vc) { + final int nAltAlleles = vc.getNAlleles() - 1; + final List vcs = new LinkedList(); + + final List afZeroAlleles = new LinkedList(); + for ( int altI = 0; altI < nAltAlleles; altI++ ) { + final Allele altAllele = vc.getAlternateAllele(altI); + final List biallelic = Arrays.asList(vc.getReference(), altAllele); + vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1)); + //afZeroAlleles.add(altAllele); + } + + return vcs; + } + + protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List biallelic, final List afZeroAlleles, final int allele2) { + if ( rootVC.isBiallelic() ) { + if ( ! 
afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles); + return rootVC; + } else { + final Set allelesToDiscard = new HashSet(rootVC.getAlleleIndices(afZeroAlleles)); + final int nAlts = rootVC.getNAlleles() - 1; + final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); + for ( final Genotype g : rootVC.getGenotypes() ) + biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts)); + + final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); + vcb.alleles(biallelic); + vcb.genotypes(biallelicGenotypes); + return vcb.make(); + } + } + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case + * + * This is handled in the following way: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is + * useful in the case where you want to drop alleles (not combine them), such as above: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2) + * + * XX = AA (since X = A and C is dropped) + * XB = AB + * BB = BB + * + * This allows us to recover partial GLs the correspond to any allele in allelesToDiscard having strictly + * AF == 0. + * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()", "! 
allelesToDiscard.contains(altIndex)"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + protected Genotype combineGLs(final Genotype original, final int altIndex, final Set allelesToDiscard, final int nAlts ) { + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); + final double[] biAllelicPr = new double[3]; + + for ( int index = 0; index < normalizedPr.length; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + + // just continue if we shouldn't include the pair because it's in the discard set + if ( discardAllelePair(pair, allelesToDiscard) ) + continue; + + if ( pair.alleleIndex1 == altIndex ) { + if ( pair.alleleIndex2 == altIndex ) + // hom-alt case + biAllelicPr[2] = normalizedPr[index]; + else + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + } else { + if ( pair.alleleIndex2 == altIndex ) + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + else + // hom-non-alt + biAllelicPr[0] += normalizedPr[index]; + } + } + + final double[] GLs = new double[3]; + for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); + + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set allelesToDiscard) { + return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2); + } + + protected List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + final ArrayList sorted = new ArrayList(conditionalPNonRefResults); + + // sort the results, so the most 
likely allele is first + Collections.sort(sorted, compareAFCalcResultsByPNonRef); + + final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); + + for ( int i = 0; i < sorted.size(); i++ ) { + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); + final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); + } + + return sorted; + } + + + /** + * Take the independent estimates of pNonRef for each alt allele and combine them into a single result + * + * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently + */ + protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, + final double log10LikelihoodsOfACEq0, + final List sortedResultsWithThetaNPriors) { + int nEvaluations = 0; + final int nAltAlleles = sortedResultsWithThetaNPriors.size(); + final int[] alleleCountsOfMLE = new int[nAltAlleles]; + final double[] log10PriorsOfAC = new double[2]; + final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + + // this value is a sum in real space so we need to store values to sum up later + final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles]; + + for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { + final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); + final int altI = vc.getAlleles().indexOf(altAllele) - 1; + + // MLE of altI allele is simply the MLE of this allele in altAlleles + alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); + + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += 
sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + + // the AF > 0 case requires us to store the normalized likelihood for later summation + log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0(); + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + + // trivial -- update the number of evaluations + nEvaluations += sortedResultWithThetaNPriors.nEvaluations; + } + + // the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles + final double[] log10LikelihoodsOfAC = new double[]{ + log10LikelihoodsOfACEq0, + MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)}; + + return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + log10pNonRefByAllele, sortedResultsWithThetaNPriors); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java new file mode 100644 index 000000000..093bf47d5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -0,0 +1,145 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Map; + +/** + * Original bi-allelic ~O(N) implementation. 
Kept here for posterity and reference + */ +public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { + protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + } + + protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { + return new StateTracker(); + } + + @Override + protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { + final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; + final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; + final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + + final double pNonRef = lastK > 0 ? 
0.0 : -1000.0; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + + return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + } + + /** + * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors + * for the exact model calculation + */ + private final static class ExactACCache { + double[] kMinus2, kMinus1, kMinus0; + + private static double[] create(int n) { + return new double[n]; + } + + public ExactACCache(int n) { + kMinus2 = create(n); + kMinus1 = create(n); + kMinus0 = create(n); + } + + final public void rotate() { + double[] tmp = kMinus2; + kMinus2 = kMinus1; + kMinus1 = kMinus0; + kMinus0 = tmp; + } + + final public double[] getkMinus2() { + return kMinus2; + } + + final public double[] getkMinus1() { + return kMinus1; + } + + final public double[] getkMinus0() { + return kMinus0; + } + } + + public int linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + final ExactACCache logY = new ExactACCache(numSamples+1); + logY.getkMinus0()[0] = 0.0; // the zero case + + double maxLog10L = Double.NEGATIVE_INFINITY; + boolean done = false; + int lastK = -1; + + for (int k=0; k <= numChr && ! 
done; k++ ) { + final double[] kMinus0 = logY.getkMinus0(); + + if ( k == 0 ) { // special case for k = 0 + for ( int j=1; j <= numSamples; j++ ) { + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; + } + } else { // k > 0 + final double[] kMinus1 = logY.getkMinus1(); + final double[] kMinus2 = logY.getkMinus2(); + + for ( int j=1; j <= numSamples; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + double aa = Double.NEGATIVE_INFINITY; + double ab = Double.NEGATIVE_INFINITY; + if (k < 2*j-1) + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; + + if (k < 2*j) + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; + + double log10Max; + if (k > 1) { + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; + log10Max = MathUtils.approximateLog10SumLog10(aa, ab, bb); + } else { + // we know we aren't considering the BB case, so we can use an optimized log10 function + log10Max = MathUtils.approximateLog10SumLog10(aa, ab); + } + + // finally, update the L(j,k) value + kMinus0[j] = log10Max - logDenominator; + } + } + + // update the posteriors vector + final double log10LofK = kMinus0[numSamples]; + log10AlleleFrequencyLikelihoods[k] = log10LofK; + log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; + + // can we abort early? 
+ lastK = k; + maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + done = true; + } + + logY.rotate(); + } + + return lastK; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java new file mode 100644 index 000000000..4de983508 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -0,0 +1,13 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { + protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + } + + protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { + return new StateTracker(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java new file mode 100644 index 000000000..19e253277 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -0,0 +1,96 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +/** + * Keeps track of the best state seen by the exact model and the max states to visit + * allowing us to abort the search before we visit the entire matrix of AC x samples + */ +final class StateTracker { + public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we 
want the calculation to be accurate to 1 / 10^6 + + final private int[] maxACsToConsider; + + private ExactACcounts ACsAtMax = null; + private double maxLog10L = Double.NEGATIVE_INFINITY; + + public StateTracker() { + this(null); + } + + public StateTracker(final int[] maxACsToConsider) { + this.maxACsToConsider = maxACsToConsider; + } + + /** + * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofKs the likelihood of our current configuration state + */ + public void update(final double log10LofKs, final ExactACcounts ACs) { + if ( log10LofKs > getMaxLog10L()) { + this.setMaxLog10L(log10LofKs); + this.ACsAtMax = ACs; + } + } + + /** + * Is the likelihood of configuration K too low to consider, related to the + * maximum likelihood seen already? + * + * @param log10LofK the log10 likelihood of the configuration we're considering analyzing + * @return true if the configuration cannot meaningfully contribute to our likelihood sum + */ + public boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + } + + /** + * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? 
+ * + * @param otherACs the set of otherACs that we want to know if we should consider analyzing + * @return true if otherACs is a state worth considering, or false otherwise + */ + public boolean withinMaxACs(final ExactACcounts otherACs) { + if ( maxACsToConsider == null ) + return true; + + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < maxACsToConsider.length; i++ ) { + // consider one more than the max AC to collect a bit more likelihood mass + if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) + return false; + } + + return true; + } + + /** + * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + public boolean isLowerAC(final ExactACcounts otherACs) { + if ( ACsAtMax == null ) + return true; + + final int[] myACcounts = this.ACsAtMax.getCounts(); + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < myACcounts.length; i++ ) { + if ( myACcounts[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + public boolean abort( final double log10LofK, final ExactACcounts ACs ) { + return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + } + + public double getMaxLog10L() { + return maxLog10L; + } + + public void setMaxLog10L(double maxLog10L) { + this.maxLog10L = maxLog10L; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 3e48520a7..f8c871e7d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -23,8 +23,9 @@ */ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; -import 
org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; -import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.TreeSet; @@ -32,7 +33,9 @@ import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { double[] flatPriors = null; - double referenceLikelihood; + final double referenceLikelihood; + AFCalc AFCalculator; + public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); referenceLikelihood = refLik; @@ -49,11 +52,11 @@ public class GLBasedSampleSelector extends SampleSelector { // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { flatPriors = new double[1+2*samples.size()]; + AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2); } - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); + final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors); // do we want to let this qual go up or down? 
- if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { + if ( result.getLog10LikelihoodOfAFEq0() < referenceLikelihood ) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index f89bcb2a7..92d6e686b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker { boolean failed = false; byte[] recordRef = vc.getReference().getBases(); for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { - if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) { + if ( recordRef[i] != ref[i] ) { failed = true; break; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 9664a5bde..15c17988c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.*; /** @@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; - /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory - * 
given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. - */ - @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - protected int numRandom = 0; - /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker implements TreeR private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; - /* Private class used to store the intermediate variants in the integer random selection process */ - private static class RandomVariantStructure { - private VariantContext vc; - - RandomVariantStructure(VariantContext vcP) { - vc = vcP; - } - - public void set (VariantContext vcP) { - vc = vcP; - } - - } - public enum NumberAlleleRestriction { ALL, BIALLELIC, @@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker implements TreeR /* variables used by the SELECT RANDOM modules */ - private boolean SELECT_RANDOM_NUMBER = false; private boolean SELECT_RANDOM_FRACTION = false; - private int variantNumber = 0; - private int nVariantsAdded = 0; - private int positionToAdd = 0; - private RandomVariantStructure [] variantArray; //Random number generator for the genotypes to remove private Random randomGenotypes = new Random(); @@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker implements TreeR mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } - SELECT_RANDOM_NUMBER = numRandom > 0; - if (SELECT_RANDOM_NUMBER) { - logger.info("Selecting " + numRandom + " variants at random from the variant track"); - variantArray = new RandomVariantStructure[numRandom]; - } - SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% 
of the variants at random from the variant track"); @@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker implements TreeR break; } } - if ( !failedJexlMatch ) { - if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub); - } - else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - if ( ! justRead ) - vcfWriter.add(sub); - } + if ( !failedJexlMatch && + !justRead && + ( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) { + vcfWriter.add(sub); } } } @@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker implements TreeR public void onTraversalDone(Integer result) { logger.info(result + " records processed."); - - if (SELECT_RANDOM_NUMBER) { - int positionToPrint = positionToAdd; - for (int i=0; i implements TreeR GenotypesContext newGC = sub.getGenotypes(); - // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes()); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags if ( vc.getNSamples() != sub.getNSamples() ) { @@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker implements TreeR if ( sawDP ) builder.attribute("DP", depth); } - - private void randomlyAddVariant(int rank, VariantContext vc) { - if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new RandomVariantStructure(vc); - - else { - double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - double t = (1.0/(rank-numRandom+1)); - if ( v < 
t) { - variantArray[positionToAdd].set(vc); - nVariantsAdded++; - positionToAdd = nextCircularPosition(positionToAdd); - } - } - } - - private int nextCircularPosition(int cur) { - if ((cur + 1) == variantArray.length) - return 0; - return cur + 1; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 37fc96681..4777b807f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import org.broad.tribble.TribbleException; +import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -7,29 +9,28 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import 
org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.*; import java.util.*; /** - * Yet another VCF to Ped converter. The world actually does need one that will - * work efficiently on large VCFs (or at least give a progress bar). This - * produces a binary ped file in individual major mode. + * Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam) */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=0,stop=100)) public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -40,29 +41,33 @@ public class VariantsToBinaryPed extends RodWalker { /** * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This * is what Plink describes as a fam file. An example fam file is (note that there is no header): - * - * CEUTrio NA12878 NA12891 NA12892 2 -9 - * CEUTrio NA12891 UNKN1 UNKN2 2 -9 - * CEUTrio NA12892 UNKN3 UNKN4 1 -9 - * + *

+ * CEUTrio NA12878 NA12891 NA12892 2 -9

+ * CEUTrio NA12891 UNKN1 UNKN2 2 -9

+ * CEUTrio NA12892 UNKN3 UNKN4 1 -9

+ *

* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) - * + *

* An alternate format is a two-column key-value file - * - * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 - * NA12891 fid=CEUTrio;sex=2;phenotype=-9 - * NA12892 fid=CEUTrio;sex=1;phenotype=-9 - * + *

+ * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9

+ * NA12891 fid=CEUTrio;sex=2;phenotype=-9

+ * NA12892 fid=CEUTrio;sex=1;phenotype=-9

+ *

* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. - * + *

* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. + *

*/ @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; + @Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)") + OutputMode mode = OutputMode.INDIVIDUAL_MAJOR; + @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") PrintStream outBed; @@ -78,7 +83,7 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; - private ValidateVariants vv = new ValidateVariants(); + enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR } private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -89,6 +94,8 @@ public class VariantsToBinaryPed extends RodWalker { private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples + private static final String PLINK_DELETION_MARKER = "-"; + // note that HET and NO_CALL are flipped from the documentation: that's because // plink actually reads these in backwards; and we want to use a shift operator // to put these in the appropriate location @@ -99,9 +106,10 @@ public class VariantsToBinaryPed extends RodWalker { private int genotypeCount = 0; private int byteCount = 0; private List famOrder = new ArrayList(); + private long totalByteCount = 0l; + private long totalGenotypeCount = 0l; public void initialize() { - initializeValidator(); writeBedHeader(); Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers @@ -136,36 +144,43 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("No metadata provided for sample "+sample); } } - try { - File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); - printMap.put(sample,new 
PrintStream(temp)); - tempFiles.put(sample,temp); - } catch (IOException e) { - throw new ReviewedStingException("Error creating temporary file",e); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + // only need to instantiate the files and buffers if in individual major. + // Cut down on memory. + try { + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); + printMap.put(sample,new PrintStream(temp)); + tempFiles.put(sample,temp); + } catch (IOException e) { + throw new ReviewedStingException("Error creating temporary file",e); + } + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } - genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); famOrder.add(sample); } } } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null || ! tracker.hasValues(variantCollection.variants) || - tracker.getFirstValue(variantCollection.variants).isFiltered() || - ! tracker.getFirstValue(variantCollection.variants).isSNP() || - ! tracker.getFirstValue(variantCollection.variants).isBiallelic()) { + if ( tracker == null ) { + return 0; + } + + VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation()); + if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) { return 0; } try { - vv.map(tracker,ref,context); - } catch (UserException e) { + validateVariantSite(vc,ref,context); + } catch (TribbleException e) { throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+ - "Please run ValidateVariants for more detailed information."); + "Please run ValidateVariants for more detailed information. 
This error is: "+e.getMessage()); } - VariantContext vc = tracker.getFirstValue(variantCollection.variants); String refOut; String altOut; + String vcRef = getReferenceAllele(vc); + String vcAlt = getAlternateAllele(vc); boolean altMajor; if ( majorAlleleFirst ) { // want to use the major allele as ref @@ -174,29 +189,42 @@ public class VariantsToBinaryPed extends RodWalker { VariantContextUtils.calculateChromosomeCounts(vc,ats,true); } if ( getAF(ats.get("AF")) > 0.5 ) { - refOut = vc.getAlternateAllele(0).getBaseString(); - altOut = vc.getReference().getBaseString(); + refOut = vcAlt; + altOut = vcRef; altMajor = true; } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } } else { - refOut = vc.getReference().getBaseString(); - altOut = vc.getAlternateAllele(0).getBaseString(); + refOut = vcRef; + altOut = vcAlt; altMajor = false; } // write an entry into the map file outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), refOut,altOut); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + writeIndividualMajor(vc,altMajor); + } else { + writeSNPMajor(vc,altMajor); + } + + + return 1; + } + + public void writeIndividualMajor(VariantContext vc, boolean altMajor) { // store genotypes per sample into the buffer for ( Genotype g : vc.getGenotypes() ) { + ++totalGenotypeCount; String sample = g.getSampleName(); byte[] samBuf = genotypeBuffer.get(sample); byte enc = getEncoding(g,genotypeCount,altMajor); samBuf[byteCount] |= enc; } + genotypeCount++; if ( genotypeCount % 4 == 0 ) { byteCount++; @@ -217,8 +245,30 @@ public class VariantsToBinaryPed extends RodWalker { } genotypeCount = 0; } + } - return 1; + public void writeSNPMajor(VariantContext vc, boolean altMajor) { + // for each sample, write the genotype into the bed file, in the + // order of the fam file + genotypeCount = 0; + byteCount = 0; + byte[] 
bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2 + for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) { + byte enc = getEncoding(g,genotypeCount,altMajor); + bytes[byteCount] |= enc; + genotypeCount++; + if ( genotypeCount % 4 == 0 ) { + byteCount++; + genotypeCount = 0; + } + } + totalGenotypeCount += famOrder.size(); + totalByteCount += bytes.length; + try { + outBed.write(bytes); + } catch (IOException e) { + throw new ReviewedStingException("Error writing to output bed file",e); + } } public Integer reduce(Integer m, Integer r) { @@ -230,7 +280,15 @@ public class VariantsToBinaryPed extends RodWalker { } public void onTraversalDone(Integer numSites) { - logger.info(String.format("%d sites processed!",numSites)); + logger.info(String.format("%d sites processed for a total of %d genotypes encoded in %d bytes",numSites,totalGenotypeCount,totalByteCount)); + + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + mergeGenotypeTempFiles(numSites); + } + + } + + private void mergeGenotypeTempFiles(int numSites) { // push out the remaining genotypes and close stream for ( String sample : printMap.keySet() ) { try { @@ -262,18 +320,19 @@ public class VariantsToBinaryPed extends RodWalker { byte[] readGenotypes = new byte[BUFFER_SIZE]; inStream.read(readGenotypes); outBed.write(readGenotypes); + totalByteCount += BUFFER_SIZE; } if ( ttr > 0 ) { byte[] readGenotypes = new byte[ttr]; inStream.read(readGenotypes); outBed.write(readGenotypes); + totalByteCount += ttr; } inStream.close(); } catch (IOException e) { throw new ReviewedStingException("Error reading form temp file for input.",e); } } - } private byte getEncoding(Genotype g, int offset, boolean altMajor) { @@ -286,8 +345,8 @@ public class VariantsToBinaryPed extends RodWalker { private byte getStandardEncoding(Genotype g, int offset) { byte b; - if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { - b = NO_CALL; + if ( ! 
checkGQIsGood(g) ) { + b = NO_CALL; } else if ( g.isHomRef() ) { b = HOM_REF; } else if ( g.isHomVar() ) { @@ -322,10 +381,11 @@ public class VariantsToBinaryPed extends RodWalker { if ( genotype.hasGQ() ) { return genotype.getGQ() >= minGenotypeQuality; } else if ( genotype.hasLikelihoods() ) { - return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality; + double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()); + return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality; } - return false; + return minGenotypeQuality <= 0; } private static String getID(VariantContext v) { @@ -346,17 +406,10 @@ public class VariantsToBinaryPed extends RodWalker { } } - private void initializeValidator() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; - } - private void writeBedHeader() { // write magic bits into the ped file try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 
0x0 : 0x1)}); // ultimately, the bed will be in individual-major mode } catch (IOException e) { throw new ReviewedStingException("error writing to output file."); @@ -410,4 +463,53 @@ public class VariantsToBinaryPed extends RodWalker { return metaValues; } + + private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) { + final Allele reportedRefAllele = vc.getReference(); + final int refLength = reportedRefAllele.length(); + if ( refLength > 100 ) { + logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart())); + return; + } + + final byte[] observedRefBases = new byte[refLength]; + System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength); + final Allele observedRefAllele = Allele.create(observedRefBases); + vc.validateReferenceBases(reportedRefAllele, observedRefAllele); + vc.validateAlternateAlleles(); + } + + private String getReferenceAllele(VariantContext vc) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Reference will just be 1. + return "1"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getReference().getBaseString().substring(1); + } + // snp or mnp + return vc.getReference().getBaseString(); + } + + private String getAlternateAllele(VariantContext vc ) { + if ( vc.isSimpleInsertion() ) { + // bi-allelic. Want to take the standard representation and strip off the leading base. + return vc.getAlternateAllele(0).getBaseString().substring(1); + } + if ( vc.isSymbolic() ) { + // either symbolic or really long alleles. Plink alleles are allowed to be 1 or 2. Alt will just be 2. 
+ return "2"; + } + if ( vc.isSimpleDeletion() ) { + // bi-allelic, so we just have "-" for ped output + return PLINK_DELETION_MARKER; + } + // snp or mnp + return vc.getAlternateAllele(0).getBaseString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index b9577ca9b..dd5264a1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; +import java.lang.reflect.Array; import java.util.*; /** @@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker { return records; } - private static void addFieldValue(Object val, List> result) { + private static void addFieldValue(final Object val, final List> result) { final int numResultRecords = result.size(); // if we're trying to create a single output record, add it if ( numResultRecords == 1 ) { - result.get(0).add(val.toString()); + result.get(0).add(prettyPrintObject(val)); } // if this field is a list of the proper size, add the appropriate entry to each record else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { @@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker { } } + private static String prettyPrintObject(final Object val) { + if ( val instanceof List ) + return prettyPrintObject(((List)val).toArray()); + + if ( !val.getClass().isArray() ) + return val.toString(); + + final int length = Array.getLength(val); + if ( length == 0 ) + return ""; + + final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0))); + for ( int i = 1; i < length; i++ ) { + sb.append(","); + 
sb.append(prettyPrintObject(Array.get(val, i))); + } + return sb.toString(); + } + + public static List> extractFields(VariantContext vc, List fields, boolean allowMissingData) { return extractFields(vc, fields, null, null, allowMissingData, false); } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 8c95091a6..69920ece4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; @@ -198,7 +199,9 @@ public class BaseUtils { * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ - static public int simpleBaseToBaseIndex(byte base) { + static public int simpleBaseToBaseIndex(final byte base) { + if ( base < 0 || base >= 256 ) + throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)"); return baseIndexMap[base]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 759ec1cc6..8aa727be8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -58,6 +58,12 @@ public class MathUtils { private static final int MAXN = 50000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + /** + * The smallest log10 value we'll emit from normalizeFromLog10 and other functions + * where the real-space value is 0.0. 
+ */ + public final static double LOG10_P_OF_ZERO = -1000000.0; + static { log10Cache = new double[LOG10_CACHE_SIZE]; log10FactorialCache = new double[LOG10_CACHE_SIZE]; @@ -572,16 +578,26 @@ public class MathUtils { return normalizeFromLog10(array, takeLog10OfOutput, false); } + /** + * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space + * + * @param array + * @param takeLog10OfOutput + * @param keepInLogSpace + * + * @return + */ public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. double maxValue = arrayMax(array); // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) + for (int i = 0; i < array.length; i++) { array[i] -= maxValue; + array[i] = Math.max(array[i], LOG10_P_OF_ZERO); + } return array; } @@ -598,7 +614,8 @@ public class MathUtils { for (int i = 0; i < array.length; i++) { double x = normalized[i] / sum; if (takeLog10OfOutput) - x = Math.log10(x); + x = Math.max(Math.log10(x), LOG10_P_OF_ZERO); + normalized[i] = x; } @@ -1666,4 +1683,36 @@ public class MathUtils { return result; } + + /** + * Returns a series of integer values between start and stop, inclusive, + * expontentially distributed between the two. That is, if there are + * ten values between 0-10 there will be 10 between 10-100. 
+ * + * WARNING -- BADLY TESTED + * @param start + * @param stop + * @param eps + * @return + */ + public static List log10LinearRange(final int start, final int stop, final double eps) { + final LinkedList values = new LinkedList(); + final double log10range = Math.log10(stop - start); + + if ( start == 0 ) + values.add(0); + + double i = 0.0; + while ( i <= log10range ) { + final int index = (int)Math.round(Math.pow(10, i)) + start; + if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) + values.add(index); + i += eps; + } + + if ( values.peekLast() == null || values.peekLast() != stop ) + values.add(stop); + + return values; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 74b038032..f4a200af0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -236,6 +236,33 @@ public class Utils { } } + public static List append(final List left, T ... elts) { + final List l = new LinkedList(left); + l.addAll(Arrays.asList(elts)); + return l; + } + + /** + * Returns a string of the values in joined by separator, such as A,B,C + * + * @param separator + * @param doubles + * @return + */ + public static String join(String separator, double[] doubles) { + if ( doubles == null || doubles.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(doubles[0]); + for (int i = 1; i < doubles.length; ++i) { + ret.append(separator); + ret.append(doubles[i]); + } + return ret.toString(); + } + } + /** * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of * elti objects (note there's no actual space between sep and the elti elements). 
Returns diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java new file mode 100644 index 000000000..617391714 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.collections; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.PrintStream; + +/** + * Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put()) + * to the provided output stream. For testing/debugging purposes. + * + * Log entries are of the following form (fields are tab-separated): + * LABEL VALUE KEY1 KEY2 ... 
KEY_N + * + * @author David Roazen + */ +public class LoggingNestedIntegerArray extends NestedIntegerArray { + + private PrintStream log; + private String logEntryLabel; + + /** + * + * @param log output stream to which to log update operations + * @param logEntryLabel String that should be prefixed to each log entry + * @param dimensions + */ + public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) { + super(dimensions); + + if ( log == null ) { + throw new ReviewedStingException("Log output stream must not be null"); + } + this.log = log; + this.logEntryLabel = logEntryLabel != null ? logEntryLabel : ""; + } + + @Override + public void put( final T value, final int... keys ) { + super.put(value, keys); + + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(value); + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe + log.println(logEntry.toString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index faafc611a..6b97f8f9f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -129,6 +129,12 @@ public class UserException extends ReviewedStingException { } } + public static class LocalParallelizationProblem extends UserException { + public LocalParallelizationProblem(final File file) { + super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. 
Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + } + } + public static class NotEnoughMemory extends UserException { public NotEnoughMemory() { super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); @@ -160,6 +166,10 @@ public class UserException extends ReviewedStingException { super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); } + public CouldNotReadInputFile(String file, String message) { + super(String.format("Couldn't read file %s because %s", file, message)); + } + public CouldNotReadInputFile(File file, String message, Exception e) { super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index afc8f5065..0dd510245 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import java.io.PrintStream; + /** * Utility class to facilitate on-the-fly base quality score recalibration. 
* @@ -52,19 +55,31 @@ public class RecalibrationTables { private final NestedIntegerArray[] tables; public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1); + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null); + } + + public RecalibrationTables(final Covariate[] covariates, final PrintStream log) { + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log); } public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { + this(covariates, numReadGroups, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { tables = new NestedIntegerArray[covariates.length]; final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; final int eventDimension = EventType.values().length; - tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray(numReadGroups, eventDimension); - tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray(numReadGroups, qualDimension, eventDimension); + tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension); + tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) : + new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++) - tables[i] = new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); + tables[i] = log == null ? 
new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); } public NestedIntegerArray getReadGroupTable() { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fae0a7c4c..aa801c2b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -288,6 +288,24 @@ public abstract class Genotype implements Comparable { return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; } + /** + * Are all likelihoods for this sample non-informative? + * + * Returns true if all PLs are 0 => 0,0,0 => true + * 0,0,0,0,0,0 => true + * 0,10,100 => false + * + * @return true if all samples PLs are equal and == 0 + */ + public boolean isNonInformative() { + for ( final int PL : getPL() ) { + if ( PL != 0 ) + return false; + } + + return true; + } + /** * Unsafe low-level accessor the PL field itself, may be null. 
* diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index abac84202..e453e2f8a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1517,15 +1517,32 @@ public class VariantContext implements Feature { // to enable tribble integratio return best; } + /** + * Lookup the index of allele in this variant context + * + * @param allele the allele whose index we want to get + * @return the index of the allele into getAlleles(), or -1 if it cannot be found + */ + public int getAlleleIndex(final Allele allele) { + return getAlleles().indexOf(allele); + } + + /** + * Return the allele index #getAlleleIndex for each allele in alleles + * + * @param alleles the alleles we want to look up + * @return a list of indices for each allele, in order + */ + public List getAlleleIndices(final Collection alleles) { + final List indices = new LinkedList(); + for ( final Allele allele : alleles ) + indices.add(getAlleleIndex(allele)); + return indices; + } + public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) { - - int index = 1; - for ( Allele allele : getAlternateAlleles() ) { - if ( allele.equals(targetAllele) ) - break; - index++; - } - + final int index = getAlleleIndex(targetAllele); + if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this); return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 8abcf115a..81959c998 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -157,11 +157,8 @@ public class VariantContextUtils { builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); } - public static Genotype removePLs(Genotype g) { - if ( g.hasLikelihoods() ) - return new GenotypeBuilder(g).noPL().make(); - else - return g; + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { @@ -573,7 +570,7 @@ public class VariantContextUtils { } // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; @@ -581,7 +578,7 @@ public class VariantContextUtils { if ( ! genotypes.isEmpty() ) logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); - genotypes = stripPLs(genotypes); + genotypes = stripPLsAndAD(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; @@ -672,11 +669,11 @@ public class VariantContextUtils { return true; } - public static GenotypesContext stripPLs(GenotypesContext genotypes) { + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { - newGs.add(g.hasLikelihoods() ? 
removePLs(g) : g); + newGs.add(removePLsAndAD(g)); } return newGs; @@ -1343,10 +1340,7 @@ public class VariantContextUtils { public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed - // see whether we need to trim common reference base from all alleles - final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false); if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index f2d34fe85..9a987f161 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter { else if ( val instanceof List ) { result = formatVCFField(((List)val).toArray()); } else if ( val.getClass().isArray() ) { - int length = Array.getLength(val); + final int length = Array.getLength(val); if ( length == 0 ) return formatVCFField(null); - StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0))); + final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); for ( int i = 1; i < length; i++) { sb.append(","); sb.append(formatVCFField(Array.get(val, i))); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java deleted file mode 100644 index 0731d3fd8..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ /dev/null @@ -1,127 +0,0 @@ 
-package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - - -public class ExactAFCalculationModelUnitTest extends BaseTest { - - static double[] AA1, AB1, BB1; - static double[] AA2, AB2, AC2, BB2, BC2, CC2; - static final int numSamples = 3; - static double[] priors = new double[2*numSamples+1]; // flat priors - - @BeforeSuite - public void before() { - AA1 = new double[]{0.0, -20.0, -20.0}; - AB1 = new double[]{-20.0, 0.0, -20.0}; - BB1 = new double[]{-20.0, -20.0, 0.0}; - AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; - AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; - AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; - BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; - BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; - CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; - } - - private class GetGLsTest extends TestDataProvider { - GenotypesContext GLs; - int numAltAlleles; - String name; - - private GetGLsTest(String name, int numAltAlleles, Genotype... 
arg) { - super(GetGLsTest.class, name); - GLs = GenotypesContext.create(arg); - this.name = name; - this.numAltAlleles = numAltAlleles; - } - - public String toString() { - return String.format("%s input=%s", super.toString(), GLs); - } - } - - private static Genotype createGenotype(String name, double[] gls) { - return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make(); - } - - @DataProvider(name = "getGLs") - public Object[][] createGLsData() { - - // bi-allelic case - new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); - new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); - new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); - new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); - new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); - new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); - new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); - new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); - - // tri-allelic case - new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); - new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); - new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); - new 
GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); - new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); - - return GetGLsTest.getTests(GetGLsTest.class); - } - - - @Test(dataProvider = "getGLs") - public void testGLs(GetGLsTest cfg) { - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int nameIndex = 1; - for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { - int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } - } - - @Test - public void testLargeGLs() { - - final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; - GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; - Assert.assertEquals(calculatedAlleleCount, 6); - } - - @Test - public void testMismatchedGLs() { - - final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0}; - final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0}; - GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1); - 
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 1f418f736..905ceef0f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("cafd404f1b4f53586f7aa7a7084b91da")); + Arrays.asList("fe9c0e9e4b4ee4677145748cdd2285ff")); executeTest("test MultiSample Pilot1", spec); } @@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("9a760dffbb299bda4934bcb4f7aad42a")); + Arrays.asList("bc15123620e1134f799005d71d1180fe")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 
20:10,000,000-10,025,000", 1, - Arrays.asList("8391146877aa7801ffdb3aa954bf2965")); + Arrays.asList("1ba7afccc8552f20d72d0b62237558e3")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("85b79ff7910f218dd59595d03ffe6ccc")); + Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("cceb34ffbd2dbc45b8821f86ea255284")); + Arrays.asList("772e14d8c908044c04053d204bad69ef")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("00f54a0097e710c0f7b001444c237e32")); + Arrays.asList("1fb69aa3857e921191997daa73f1b687")); executeTest("test reverse trim", spec); } @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T 
UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("b3fae6bf4c620458f4259dbc93125e37")); + Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a")); executeTest("test mismatched PLs", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "712e87db5e278e92bd36e96d377303c6"; + private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2"; @Test public void testCompressedOutput() { @@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "306943dd63111e2e64388cd2e2de6c01"; + String md5 = "360d1274c1072a1ae9868e4e106c2650"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("f73dec2e77f14c170f7b6a8eee5793ff")); + Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c")); executeTest("test min_base_quality_score 26", spec); } @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp 
" + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("da7a5a3aa1c9f401896c34199c535954")); + Arrays.asList("c7429e670ba477bf9a6bbee2fb41c5a9")); executeTest("test SLOD", spec); } @@ -163,7 +163,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("07f5962f790673a1299f3a0f56579b65")); + Arrays.asList("abd8e33e649cc11b55e200d3940cc7e2")); executeTest("test NDA", spec); } @@ -171,23 +171,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("22037eac40a3b1df3086c2d7b27f0d5f")); + Arrays.asList("8a9b424e00cdbe6b5e73d517335b2186")); executeTest("test using comp track", spec); } @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "92db524b334f1416e595c711abc2d798"); + testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf"); } private void 
testOutputParameters(final String args, final String md5) { @@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("7326eb84d8418546a408b68839a0a47e")); + Arrays.asList("9addd225a985178339a0c49dc5fdc220")); executeTest("test confidence 1", spec1); } @@ -209,7 +209,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("7326eb84d8418546a408b68839a0a47e")); + Arrays.asList("9addd225a985178339a0c49dc5fdc220")); executeTest("test confidence 2", spec2); } @@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "7aed8361e692eff559e6bca88752db0d" ); + testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "989e65bb7337117d31cd615163a8ac84" ); + testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" ); } private void testHeterozosity(final double arg, final String md5) { @@ -249,7 +249,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c155587aa0410f43d7ccc57e1ae09a68")); + Arrays.asList("5d19e3077e0cabbb364f68676a09ebe0")); executeTest(String.format("test multiple technologies"), spec); } @@ -268,7 +268,7 @@ public class 
UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("0748a711c6154f8d85847afb79aead94")); + Arrays.asList("8a1931095f70523ad11cb99b30df7b84")); executeTest(String.format("test calling with BAQ"), spec); } @@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("6aa034f669ec09ac4f5a28624cbe1830")); + Arrays.asList("64a491b5276fd5d1cd04260ea3e63cf7")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ba7a011d0c665acc4455d58a6ab28716")); + Arrays.asList("f63a8b8061e6c5999408d34798061895")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("4f7d80f4f53ef0f0959414cb30097482")); + Arrays.asList("c9d684ff2f2a9083480db6e962d612a9")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -325,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("95986d0c92436d3b9c1f1be9c768a368")); + Arrays.asList("833fd97c6f32d7af6c9c088a78e51f68")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -335,7 +335,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + 
privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("cecd3e35a817e299e97e8f7bbf083d2c")); + Arrays.asList("95b73c24c68dc475516571d9f49dfb1e")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("af04b81f0548ca22b8d1f6bf223b336e")); + Arrays.asList("3bdbf48de30bac58f3bcbc5bf3aa63aa")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("c7792e27477ecf99893a76ecbac5c2f9")); + Arrays.asList("beee9457d7cea42006ac45400db5e873")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("59ff26d7e5ca2503ebe9f74902251551")); + Arrays.asList("945a2f994eaced8efdf8de24b58f2680")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -405,7 +405,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("f99f9a917529bfef717fad97f725d5df")); + 
Arrays.asList("ba4fafec383fb988f20c8cf53dd3e9a0")); executeTest("test minIndelFraction 0.0", spec); } @@ -413,7 +413,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("eac2cd649bd5836068350eb4260aaea7")); + Arrays.asList("4c57a88de275105156aaafc6f9041365")); executeTest("test minIndelFraction 0.25", spec); } @@ -435,7 +435,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, - Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29")); + Arrays.asList("e8ebfaac0804b782f22ab8ea35152735")); executeTest("test calling on reads with Ns in CIGAR", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); + executeTest("test calling on a ReducedRead BAM", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index e14580ead..a8309c14e 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d")); executeTest("test hg18 to hg19, unsorted", spec); } + + @Test + public void testLiftoverFilteringOfIndels() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header", + 1, + Arrays.asList("0909a953291a5e701194668c9b8833ab")); + executeTest("test liftover filtering of indels", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index ffd9c9b4a..58d3677c7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4") + Arrays.asList("549321a2543608f214ab4893ab478be6") ); executeTest("testRegenotype--" + testFile, spec); @@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4") + 
Arrays.asList("549321a2543608f214ab4893ab478be6") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); @@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823") + Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index a75da6cf9..8f11c09f6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { } + public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) { + return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + @Test public void testNA12878Alone() { String testName = "testNA12878Alone"; @@ -52,6 +59,72 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + @Test + public void testNA12878AloneSNPMajor() { + String testName = "testNA12878AloneSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"), + 3, + 
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878HighGQ() { + String testName = "testNA12878HighGQ"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959") + ); + + executeTest(testName, spec); + } + + @Test + public void testVCFMismatchReference() { + String testName = "testVCFMismatchReference"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80), + 3, + UserException.class + ); + + executeTest(testName, spec); + } + + @Test + public void test1000GWithIndels() { + String testName = "test1000GWithIndels"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af") + ); + } + + @Test + public void test1000GWithIndelsSNPMajor() { + String testName = "test1000GWithIndelsSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af") + ); + } + + @Test + public void test1000G_Symbolic() { + String testName = "test1000G_Symbolic"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0), + 3, + Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8") + ); + } + @Test public void testCEUTrio() { String testName = 
"testCEUTrio"; @@ -112,6 +185,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 2ffcd02e2..8186ffc7d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMultiAllelicOneRecord() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), - Arrays.asList("13dd36c08be6c800f23988e6000d963e")); + Arrays.asList("0ff49c08690f61a38614606a090f23ea")); executeTest("testMultiAllelicOneRecord", spec); } @@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest { executeTest("testGenotypeFieldsWithInline", spec); } + @Test(enabled = true) + public void testListFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + privateTestDir + "vcfexample.withMLE.vcf" + + " -T VariantsToTable" + + " -GF PL" + + " -o %s", + 1, + Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e")); + executeTest("testGenotypeFields", spec); + } + @Test(enabled = true) public void testMoltenOutput() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index d19a58b3a..24ffde9c3 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "081d077786ac0af24e9f97259a55209c", nt, nct }); + tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala index 09a24e782..1cd5a7512 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala @@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript { var bamFile: File = _ def script() { - val ug = new UnifiedGenotyper with RetryMemoryLimit - // First run with 1m - ug.memoryLimit = .001 - // On retry run with 1g - ug.retryMemoryFunction = (d => d * 1000) - ug.reference_sequence = referenceFile - ug.input_file = Seq(bamFile) - add(ug) + for (scatterCount <- 1 to 2) { + val ug = new UnifiedGenotyper with RetryMemoryLimit + // First run with 1m + ug.memoryLimit = .001 + // On retry run with 1g + ug.retryMemoryFunction = (d => d * 1000) + ug.reference_sequence = referenceFile + ug.input_file = Seq(bamFile) + ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount)) + ug.scatterCount = scatterCount + add(ug) + } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 0d0fab9d1..d0379d022 
100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging { private def createQueueHeader() : Seq[String] = { Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), "Copyright (c) 2012 The Broad Institute", - "Fro support and documentation go to http://www.broadinstitute.org/gatk") + "For support and documentation go to http://www.broadinstitute.org/gatk") } private def getQueueVersion : String = { diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 6cd4b06bc..9522ec86c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor /** * Merges BAM files using net.sf.picard.sam.MergeSamFiles. 
*/ -class BamGatherFunction extends GatherFunction with PicardBamFunction { +class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit { this.javaMainClass = "net.sf.picard.sam.MergeSamFiles" this.assumeSorted = Some(true) protected def inputBams = gatherParts protected def outputBam = originalOutput - override def freezeFieldValues { + override def freezeFieldValues() { val originalGATK = originalFunction.asInstanceOf[CommandLineGATK] // Whatever the original function can handle, merging *should* do less. diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 739e6cc91..75be4d773 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -25,13 +25,13 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor /** * Merges a vcf text file. 
*/ -class VcfGatherFunction extends CombineVariants with GatherFunction { +class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit { this.assumeIdenticalSamples = true this.suppressCommandLineHeader = true diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index 9257cc7c2..b22bb2b59 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun override def freezeFieldValues() { super.freezeFieldValues() if (outputIndex == null && output != null) - outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai") } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 84b625760..eb426d301 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -25,6 +25,7 @@ package org.broadinstitute.sting.queue.function import org.broadinstitute.sting.queue.util._ +import org.broadinstitute.sting.commandline.Argument /** * A command line that will be run in a pipeline. 
@@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging { def commandLine: String /** Upper memory limit */ + @Argument(doc="Memory limit", required=false) var memoryLimit: Option[Double] = None /** Resident memory limit */ + @Argument(doc="Resident memory limit", required=false) var residentLimit: Option[Double] = None /** Resident memory request */ + @Argument(doc="Resident memory request", required=false) var residentRequest: Option[Double] = None /** the number of SMP cores this job wants */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index b9cb8540f..6500360c0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { /** * Memory limit for the java executable, or if None will use the default memoryLimit. */ + @Argument(doc="Java memory limit", required=false) var javaMemoryLimit: Option[Double] = None /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 9f7932d39..aae846534 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport { var jobErrorFile: File = _ /** Errors (if any) from the last failed run of jobErrorFiles. */ + @Argument(doc="Job error lines", required=false) var jobErrorLines: Seq[String] = Nil /** * The number of times this function has previously been run. */ + @Argument(doc="Job retries", required=false) var retries = 0 /** Change settings for the next run. 
Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */ @@ -541,4 +543,11 @@ object QFunction { classFields } } + + /** + * Returns the Seq of fields for a QFunction class. + * @param clazz Class to retrieve fields for. + * @return the fields of the class. + */ + def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala index 8bba5551f..acc9a7203 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala @@ -24,17 +24,26 @@ package org.broadinstitute.sting.queue.function +import org.broadinstitute.sting.commandline.Argument + +object RetryMemoryLimit { + private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ ) + private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") +} + /** A mixin that on retry increases the memory limit when certain text is found. */ trait RetryMemoryLimit extends CommandLineFunction { /** How to increase the memory. By default doubles the memory. */ - var retryMemoryFunction: (Double => Double) = (2 * _) + var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction /** Once the threshold is passed, no more memory will be added to memory limit. */ + @Argument(doc="threshold to stop doubling the memory", required=false) var memoryLimitThreshold: Option[Double] = None /** Various strings to look for to determine we ran out of memory. 
*/ - var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") + @Argument(doc="text to look for in the errors", required = false) + var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText override def freezeFieldValues() { super.freezeFieldValues() @@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction { this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold } + + override def copySettingsTo(function: QFunction) { + super.copySettingsTo(function) + function match { + case retryMemoryLimit: RetryMemoryLimit => + if (retryMemoryLimit.memoryLimitThreshold.isEmpty) + retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold + if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction) + retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction + if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText) + retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText + case _ => /* ignore */ + } + } + override def setupRetry() { super.setupRetry() if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala index 5b4f2b7e6..686188e72 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala @@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction} /** * Shadow clones another command line function. 
*/ +object CloneFunction { + private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction]) +} + class CloneFunction extends CommandLineFunction { var originalFunction: ScatterGatherableFunction = _ var cloneIndex: Int = _ @@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction { var originalValues = Map.empty[ArgumentSource, Any] withScatterPartCount += 1 if (withScatterPartCount == 1) { - overriddenFields.foreach{ - case (field, overrideValue) => { + originalFunction.functionFields.foreach { + case (field) => { originalValues += field -> originalFunction.getFieldValue(field) - originalFunction.setFieldValue(field, overrideValue) + originalFunction.setFieldValue(field, getFieldValue(field)) } } } @@ -52,9 +56,11 @@ class CloneFunction extends CommandLineFunction { f() } finally { if (withScatterPartCount == 1) { - originalValues.foreach{ - case (name, value) => - originalFunction.setFieldValue(name, value) + originalFunction.functionFields.foreach { + case (field) => { + setFieldValue(field, originalFunction.getFieldValue(field)) + originalFunction.setFieldValue(field, originalValues(field)) + } } } withScatterPartCount -= 1 @@ -63,6 +69,8 @@ class CloneFunction extends CommandLineFunction { override def description = withScatterPart(() => originalFunction.description) override def shortDescription = withScatterPart(() => originalFunction.shortDescription) + override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) } + override protected def functionFieldClass = originalFunction.getClass def commandLine = withScatterPart(() => originalFunction.commandLine) @@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction { } override def getFieldValue(source: ArgumentSource): AnyRef = { - overriddenFields.get(source) match { - case Some(value) => value.asInstanceOf[AnyRef] - case None => { - val value = originalFunction.getFieldValue(source) - overriddenFields += source -> value - value - } + 
CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match { + case Some(cloneSource) => + super.getFieldValue(cloneSource) + case None => + overriddenFields.get(source) match { + case Some(value) => + value.asInstanceOf[AnyRef] + case None => { + val value = originalFunction.getFieldValue(source) + overriddenFields += source -> value + value + } + } } }