From ef3882f43995afb9c2ac63a04e38b99f1fde462f Mon Sep 17 00:00:00 2001 From: Ami Levy Moonshine Date: Thu, 11 Oct 2012 14:51:41 -0400 Subject: [PATCH 02/54] PhaseByTransmission: small typo /n. variantCallQC_summaryTablesOnly.R: small changes (more to come) /n GeneralCallingPipeline.scala: the new pipeline script. It is not as clean as I want it to be, but it works. I am still going to work on it a little bit more. Also, it does not include yet: (1) the RR step (2) need better eval step (3) need to include other targets (currently it works on the CEU Trio) --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 00acf854a..7ebfec49e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -36,7 +36,7 @@ import java.util.*; * * From 19e2b5f0d57a4f081664905733a5969aed18c3ed Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Oct 2012 00:44:23 -0400 Subject: [PATCH 05/54] RR optimization: since total count in BaseCounts is requested so often, don't keep computing it from scratch each time. 
--- .../compression/reducereads/BaseCounts.java | 72 +++++++++++-------- .../reducereads/HeaderElement.java | 8 +-- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 94f3c2b6b..96e75adb9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -20,6 +20,7 @@ import java.util.Map; private final Map counts; // keeps track of the base counts private final Map sumQuals; // keeps track of the quals of each base + private int totalCount = 0; // keeps track of total count since this is requested so often public BaseCounts() { counts = new EnumMap(BaseIndex.class); @@ -36,49 +37,62 @@ import java.util.Map; baseCounts.counts.put(BaseIndex.C, countsACGT[1]); baseCounts.counts.put(BaseIndex.G, countsACGT[2]); baseCounts.counts.put(BaseIndex.T, countsACGT[3]); + baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; return baseCounts; } @Requires("other != null") - public void add(BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) - counts.put(i, counts.get(i) + other.counts.get(i)); + public void add(final BaseCounts other) { + for (final BaseIndex i : BaseIndex.values()) { + final int otherCount = other.counts.get(i); + counts.put(i, counts.get(i) + otherCount); + totalCount += otherCount; + } } @Requires("other != null") - public void sub(BaseCounts other) { - for (final BaseIndex i : BaseIndex.values()) - counts.put(i, counts.get(i) - other.counts.get(i)); + public void sub(final BaseCounts other) { + for (final BaseIndex i : BaseIndex.values()) { + final int otherCount = other.counts.get(i); + counts.put(i, counts.get(i) - otherCount); + totalCount -= 
otherCount; + } } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) // no Ns - counts.put(i, counts.get(i) + 1); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(byte base, byte qual) { + public void incr(final byte base) { final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) + 1); + totalCount++; + } + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(final byte base, final byte qual) { + final BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // no Ns + counts.put(i, counts.get(i) + 1); + totalCount++; sumQuals.put(i, sumQuals.get(i) + qual); } } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(byte base) { - final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) // no Ns - counts.put(i, counts.get(i) - 1); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(byte base, byte qual) { + public void decr(final byte base) { final BaseIndex i = BaseIndex.byteToBase(base); if (i != null) { // no Ns counts.put(i, counts.get(i) - 1); + totalCount--; + } + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") + public void decr(final byte base, final byte qual) { + final BaseIndex i = BaseIndex.byteToBase(base); + if (i != null) { // no Ns + counts.put(i, counts.get(i) - 1); + totalCount--; sumQuals.put(i, sumQuals.get(i) - qual); } } @@ -131,11 +145,7 @@ import java.util.Map; @Ensures("result >= 0") public int totalCount() { - int sum = 0; - for (int c : counts.values()) - sum += c; - - return sum; + return totalCount; } /** @@ -146,7 +156,7 @@ import java.util.Map; */ @Ensures({"result 
>=0.0", "result<= 1.0"}) public double baseCountProportion(final byte base) { - return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount(); + return baseCountProportion(BaseIndex.byteToBase(base)); } /** @@ -157,10 +167,10 @@ import java.util.Map; */ @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportion(final BaseIndex baseIndex) { - int total = totalCount(); + final int total = totalCount(); if (total == 0) return 0.0; - return (double) counts.get(baseIndex) / totalCount(); + return (double) counts.get(baseIndex) / total; } @@ -248,7 +258,7 @@ import java.util.Map; final int total = totalCountWithoutIndels(); if (total == 0) return 0.0; - return (double) counts.get(index) / totalCountWithoutIndels(); + return (double) counts.get(index) / total; } public Object[] countsArray() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 0c1854ad1..272512bdb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -157,11 +157,9 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromInsertions(double minIndelProportion) { - int numberOfBases = consensusBaseCounts.totalCount(); - if (numberOfBases == 0 && insertionsToTheRight > 0) - return true; // we only have insertions - else if (numberOfBases == 0) - return false; // we don't have anything + final int numberOfBases = consensusBaseCounts.totalCount(); + if (numberOfBases == 0) + return (insertionsToTheRight > 0); // do we only have insertions? 
// if we have bases and insertions, check the ratio return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; From 33df1afe0e0ed5e0c5d18c0d82a27797fddf0c68 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Oct 2012 00:55:44 -0400 Subject: [PATCH 06/54] More BaseCounts optimizations for RR. --- .../compression/reducereads/BaseCounts.java | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 96e75adb9..fb76ef291 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -167,13 +167,9 @@ import java.util.Map; */ @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportion(final BaseIndex baseIndex) { - final int total = totalCount(); - if (total == 0) - return 0.0; - return (double) counts.get(baseIndex) / total; + return (totalCount == 0) ? 0.0 : (double)counts.get(baseIndex) / (double)totalCount; } - @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); @@ -239,11 +235,7 @@ import java.util.Map; @Ensures("result >=0") public int totalCountWithoutIndels() { - int sum = 0; - for (Map.Entry entry : counts.entrySet()) - if (entry.getKey().isNucleotide()) - sum += entry.getValue(); - return sum; + return totalCount - counts.get(BaseIndex.D) - counts.get(BaseIndex.I); } /** @@ -256,9 +248,7 @@ import java.util.Map; @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportionWithoutIndels(final BaseIndex index) { final int total = totalCountWithoutIndels(); - if (total == 0) - return 0.0; - return (double) counts.get(index) / total; + return (total == 0) ? 
0.0 : (double)counts.get(index) / (double)total; } public Object[] countsArray() { From 20ffbcc86e98794ff40d97dff6a3d9b9859bbc15 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 17 Oct 2012 21:44:53 -0400 Subject: [PATCH 07/54] RR optimization: profiling was showing that the BaseCounts class was a major bottleneck because the underlying implementation was a HashMap. Given that the map index was an indexable Enum anyways, it makes a lot more sense to implement as a native array. Knocks 30% off the runtime in bad regions. --- .../reducereads/BaseAndQualsCounts.java | 41 ++--- .../compression/reducereads/BaseCounts.java | 149 ++++++++---------- .../compression/reducereads/BaseIndex.java | 6 +- .../reducereads/HeaderElement.java | 4 +- 4 files changed, 89 insertions(+), 111 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index d5afc5722..654e0af09 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -1,8 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import java.util.HashMap; -import java.util.Map; - /** * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. 
* @@ -10,35 +7,31 @@ import java.util.Map; * @since 6/15/12 */ public class BaseAndQualsCounts extends BaseCounts { - private final Map sumInsertionQuals; - private final Map sumDeletionQuals; + private final long[] sumInsertionQuals; + private final long[] sumDeletionQuals; public BaseAndQualsCounts() { super(); - this.sumInsertionQuals = new HashMap(); - this.sumDeletionQuals = new HashMap(); - for (BaseIndex i : BaseIndex.values()) { - sumInsertionQuals.put(i, 0L); - sumDeletionQuals.put(i, 0L); + this.sumInsertionQuals = new long[BaseIndex.values().length]; + this.sumDeletionQuals = new long[BaseIndex.values().length]; + for (final BaseIndex i : BaseIndex.values()) { + sumInsertionQuals[i.index] = 0L; + sumDeletionQuals[i.index] = 0L; } } public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { - super.incr(base, baseQual); - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // do not allow Ns - sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual); - sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual); - } + final BaseIndex i = BaseIndex.byteToBase(base); + super.incr(i, baseQual); + sumInsertionQuals[i.index] += insQual; + sumDeletionQuals[i.index] += delQual; } public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { - super.decr(base, baseQual); - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // do not allow Ns - sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual); - sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual); - } + final BaseIndex i = BaseIndex.byteToBase(base); + super.decr(i, baseQual); + sumInsertionQuals[i.index] -= insQual; + sumDeletionQuals[i.index] -= delQual; } public byte averageInsertionQualsOfBase(final BaseIndex base) { @@ -49,7 +42,7 @@ public class BaseAndQualsCounts extends BaseCounts { return getGenericAverageQualOfBase(base, sumDeletionQuals); } - private byte getGenericAverageQualOfBase(final 
BaseIndex base, final Map sumQuals) { - return (byte) (sumQuals.get(base) / getCount(base)); + private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) { + return (byte) (sumQuals[base.index] / countOfBase(base)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index fb76ef291..3a3905710 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import java.util.EnumMap; -import java.util.Map; /** * An object to keep track of the number of occurrences of each base and it's quality. 
@@ -18,25 +16,25 @@ import java.util.Map; public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - private final Map counts; // keeps track of the base counts - private final Map sumQuals; // keeps track of the quals of each base - private int totalCount = 0; // keeps track of total count since this is requested so often + private final int[] counts; // keeps track of the base counts + private final long[] sumQuals; // keeps track of the quals of each base + private int totalCount = 0; // keeps track of total count since this is requested so often public BaseCounts() { - counts = new EnumMap(BaseIndex.class); - sumQuals = new EnumMap(BaseIndex.class); - for (BaseIndex i : BaseIndex.values()) { - counts.put(i, 0); - sumQuals.put(i, 0L); + counts = new int[BaseIndex.values().length]; + sumQuals = new long[BaseIndex.values().length]; + for (final BaseIndex i : BaseIndex.values()) { + counts[i.index] = 0; + sumQuals[i.index] = 0L; } } public static BaseCounts createWithCounts(int[] countsACGT) { BaseCounts baseCounts = new BaseCounts(); - baseCounts.counts.put(BaseIndex.A, countsACGT[0]); - baseCounts.counts.put(BaseIndex.C, countsACGT[1]); - baseCounts.counts.put(BaseIndex.G, countsACGT[2]); - baseCounts.counts.put(BaseIndex.T, countsACGT[3]); + baseCounts.counts[BaseIndex.A.index] = countsACGT[0]; + baseCounts.counts[BaseIndex.C.index] = countsACGT[1]; + baseCounts.counts[BaseIndex.G.index] = countsACGT[2]; + baseCounts.counts[BaseIndex.T.index] = countsACGT[3]; baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; return baseCounts; } @@ -44,8 +42,8 @@ import java.util.Map; @Requires("other != null") public void add(final BaseCounts other) { for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts.get(i); - counts.put(i, counts.get(i) + otherCount); + final int otherCount = other.counts[i.index]; 
+ counts[i.index] += otherCount; totalCount += otherCount; } } @@ -53,8 +51,8 @@ import java.util.Map; @Requires("other != null") public void sub(final BaseCounts other) { for (final BaseIndex i : BaseIndex.values()) { - final int otherCount = other.counts.get(i); - counts.put(i, counts.get(i) - otherCount); + final int otherCount = other.counts[i.index]; + counts[i.index] -= otherCount; totalCount -= otherCount; } } @@ -62,49 +60,29 @@ import java.util.Map; @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") public void incr(final byte base) { final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) + 1); - totalCount++; - } + counts[i.index]++; + totalCount++; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final byte base, final byte qual) { - final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) + 1); - totalCount++; - sumQuals.put(i, sumQuals.get(i) + qual); - } + public void incr(final BaseIndex base, final byte qual) { + counts[base.index]++; + totalCount++; + sumQuals[base.index] += qual; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") public void decr(final byte base) { final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) - 1); - totalCount--; - } + counts[i.index]--; + totalCount--; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final byte base, final byte qual) { - final BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) - 1); - totalCount--; - sumQuals.put(i, sumQuals.get(i) - qual); - } - } - - @Ensures("result >= 0") - public int getCount(final byte base) { - return getCount(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public int 
getCount(final BaseIndex base) { - return counts.get(base); + public void decr(final BaseIndex base, final byte qual) { + counts[base.index]--; + totalCount--; + sumQuals[base.index] -= qual; } @Ensures("result >= 0") @@ -114,27 +92,32 @@ import java.util.Map; @Ensures("result >= 0") public long getSumQuals(final BaseIndex base) { - return sumQuals.get(base); + return sumQuals[base.index]; } @Ensures("result >= 0") public byte averageQuals(final byte base) { - return (byte) (getSumQuals(base) / getCount(base)); + return (byte) (getSumQuals(base) / countOfBase(base)); } @Ensures("result >= 0") public byte averageQuals(final BaseIndex base) { - return (byte) (getSumQuals(base) / getCount(base)); + return (byte) (getSumQuals(base) / countOfBase(base)); + } + + @Ensures("result >= 0") + public int countOfBase(final byte base) { + return countOfBase(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") public int countOfBase(final BaseIndex base) { - return counts.get(base); + return counts[base.index]; } @Ensures("result >= 0") public long sumQualsOfBase(final BaseIndex base) { - return sumQuals.get(base); + return sumQuals[base.index]; } @Ensures("result >= 0") @@ -151,7 +134,7 @@ import java.util.Map; /** * Given a base , it returns the proportional count of this base compared to all other bases * - * @param base + * @param base base * @return the proportion of this base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) @@ -162,19 +145,19 @@ import java.util.Map; /** * Given a base , it returns the proportional count of this base compared to all other bases * - * @param baseIndex + * @param baseIndex base * @return the proportion of this base over all other bases */ @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportion(final BaseIndex baseIndex) { - return (totalCount == 0) ? 0.0 : (double)counts.get(baseIndex) / (double)totalCount; + return (totalCount == 0) ? 
0.0 : (double)counts[baseIndex.index] / (double)totalCount; } @Ensures("result != null") public String toString() { StringBuilder b = new StringBuilder(); - for (Map.Entry elt : counts.entrySet()) { - b.append(elt.toString()).append("=").append(elt.getValue()).append(","); + for (final BaseIndex i : BaseIndex.values()) { + b.append(i.toString()).append("=").append(counts[i.index]).append(","); } return b.toString(); } @@ -186,9 +169,9 @@ import java.util.Map; @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (Map.Entry entry : counts.entrySet()) { - if (entry.getValue() > counts.get(maxI)) - maxI = entry.getKey(); + for (final BaseIndex i : BaseIndex.values()) { + if (counts[i.index] > counts[maxI.index]) + maxI = i; } return maxI; } @@ -196,17 +179,17 @@ import java.util.Map; @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (Map.Entry entry : counts.entrySet()) { - if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI)) - maxI = entry.getKey(); + for (final BaseIndex i : BaseIndex.values()) { + if (i.isNucleotide() && counts[i.index] > counts[maxI.index]) + maxI = i; } return maxI; } private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) { - final int targetCount = counts.get(targetIndex); - final int testCount = counts.get(testIndex); - return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) ); + final int targetCount = counts[targetIndex.index]; + final int testCount = counts[testIndex.index]; + return ( targetCount > testCount || (targetCount == testCount && sumQuals[targetIndex.index] > sumQuals[testIndex.index]) ); } public byte baseWithMostProbability() { @@ -216,42 +199,42 @@ import java.util.Map; @Ensures("result != null") public BaseIndex baseIndexWithMostProbability() { BaseIndex 
maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (Map.Entry entry : sumQuals.entrySet()) { - if (entry.getValue() > sumQuals.get(maxI)) - maxI = entry.getKey(); + for (final BaseIndex i : BaseIndex.values()) { + if (sumQuals[i.index] > sumQuals[maxI.index]) + maxI = i; } - return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts()); + return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts()); } @Ensures("result != null") public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (Map.Entry entry : sumQuals.entrySet()) { - if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI)) - maxI = entry.getKey(); + for (final BaseIndex i : BaseIndex.values()) { + if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index]) + maxI = i; } - return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); + return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); } @Ensures("result >=0") public int totalCountWithoutIndels() { - return totalCount - counts.get(BaseIndex.D) - counts.get(BaseIndex.I); + return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index]; } /** * Calculates the proportional count of a base compared to all other bases except indels (I and D) * - * @param index + * @param base base * @return the proportion of this base over all other bases except indels */ @Requires("index.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(final BaseIndex index) { + public double baseCountProportionWithoutIndels(final BaseIndex base) { final int total = totalCountWithoutIndels(); - return (total == 0) ? 0.0 : (double)counts.get(index) / (double)total; + return (total == 0) ? 
0.0 : (double)counts[base.index] / (double)total; } - public Object[] countsArray() { - return counts.values().toArray(); + public int[] countsArray() { + return counts.clone(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java index a64db5874..02f867bcb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + /** * Simple byte / base index conversions * @@ -56,7 +58,7 @@ public enum BaseIndex { case 'N': case 'n': return N; - default: return null; + default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base); } } @@ -68,7 +70,7 @@ public enum BaseIndex { * @return whether or not it is a nucleotide, given the definition above */ public boolean isNucleotide() { - return this == A || this == C || this == G || this == T || this == N; + return !isIndel(); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 272512bdb..3097c2ee9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -213,11 +213,11 @@ public class HeaderElement { if (totalCount == 0) return 0; - Object[] countsArray = consensusBaseCounts.countsArray(); + int[] countsArray = consensusBaseCounts.countsArray(); Arrays.sort(countsArray); for (int i = 
countsArray.length-1; i>=0; i--) { nHaplotypes++; - runningCount += (Integer) countsArray[i]; + runningCount += countsArray[i]; if (runningCount/totalCount > minVariantProportion) break; } From 54f698422cd978e2c11960de497e4938c9136f8f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 18 Oct 2012 09:01:51 -0400 Subject: [PATCH 08/54] Better implementation for getSoftEnd() in GATKSAMRecord --- .../sting/utils/sam/GATKSAMRecord.java | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 53e6dc0dc..c6df449a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -406,24 +407,20 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - if (softEnd < 0) { - int stop = this.getUnclippedStart(); + if ( softEnd < 0 ) { + softEnd = getAlignmentEnd(); + final List cigs = getCigar().getCigarElements(); + for (int i=cigs.size() - 1; i>=0; --i) { + final CigarElement cig = cigs.get(i); + final CigarOperator op = cig.getOperator(); - if (ReadUtils.readIsEntirelyInsertion(this)) - return stop; - - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - 
shift = 0; + if (op == CigarOperator.SOFT_CLIP) + softEnd += cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) + break; } - softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + return softEnd; } From 97abb98c0bb98e27337449f1d3125c97554e56ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 10:19:21 -0400 Subject: [PATCH 09/54] Bugfix for bad nt / nct argument detection in MicroScheduler --- .../sting/gatk/executive/MicroScheduler.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 07d9df79a..223e11680 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -157,18 +157,22 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if ( ! (walker instanceof TreeReducible) ) { throw badNT("nt", engine, walker); - } else { - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } + } + + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) { + throw badNT("nct", engine, walker); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } else { - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) - throw badNT("nct", engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue("nt", + throw new UserException.BadArgumentValue(parallelArg, String.format("The analysis %s currently does not support parallel execution with %s. " + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); } From f20fa9d0823b031f7c7a6d952d54da3804954598 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 10:19:34 -0400 Subject: [PATCH 10/54] SelectVariants is actually NanoSchedulable --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 15c17988c..98f56be1f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCounts; @@ -42,11 +43,11 @@ import org.broadinstitute.sting.utils.MendelianViolation; import 
org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.*; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.io.FileNotFoundException; @@ -188,7 +189,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) -public class SelectVariants extends RodWalker implements TreeReducible { +public class SelectVariants extends RodWalker implements TreeReducible, NanoSchedulable { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** From d3fc797cfe2af278d88f06f3580f3247818a8918 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 10:42:20 -0400 Subject: [PATCH 11/54] SelectVariants is actually *NOT* NanoSchedulable --- .../gatk/walkers/variantutils/SelectVariants.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 98f56be1f..c7b1d0fc7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -31,7 +31,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCounts; @@ -189,7 +188,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) -public class SelectVariants extends RodWalker implements TreeReducible, NanoSchedulable { +public class SelectVariants extends RodWalker implements TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -543,9 +542,11 @@ public class SelectVariants extends RodWalker implements TreeR VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { - final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull()); - addAnnotations(builder, sub); - sub = builder.make(); + synchronized (UG_engine) { + final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull()); + addAnnotations(builder, sub); + sub = builder.make(); + } } if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { From 3504f71b6b6be2ac7b97a5886c3609ca4c2caee6 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 18 Oct 2012 13:58:38 -0400 Subject: [PATCH 13/54] Fixing a null pointer exception bug for DEV-10 --- .../gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java index 2ea4bdfb0..00c6ddae8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -123,13 +123,13 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; // therefore, the user must have failed to specify a type default - if(writerFileName.asFile() == null && generateMD5) + if(writerFileName != null && writerFileName.asFile() == null && generateMD5) throw new ArgumentException("MD5 generation specified, but no output file specified. If md5 generation is desired, please specify a BAM output file and an md5 file will be written alongside."); // Create the stub and set parameters. SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream); - if ( writerFileName.asFile() != null ) { + if (writerFileName != null && writerFileName.asFile() != null ) { stub = new SAMFileWriterStub(engine, writerFileName.asFile()); if ( compressionLevel != null ) From b4e69239dd0504ac7dc088c5a7726bb6b66e63a6 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 18 Oct 2012 14:31:15 -0400 Subject: [PATCH 14/54] In order to be considered an informative read in the PerReadAlleleLikelihoodMap it has to be informative compared to all other alleles not just the worst allele. Also, fixing a bug when there is only one allele in the map. 
--- ...GenotyperGeneralPloidyIntegrationTest.java | 4 ++-- .../HaplotypeCallerIntegrationTest.java | 4 ++-- .../genotyper/PerReadAlleleLikelihoodMap.java | 19 ++++++++----------- .../UnifiedGenotyperIntegrationTest.java | 11 +++++------ 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 219c36a05..989f06ec5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","7d6f319b9edcb1ff8c290fef150a2df8"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9acfe0019efdc91217ee070acb071228"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","dd02890123e07e7412a49475cb6280f1"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c1d4dd793f61710a1b1fc5d82803210f"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index be8fd2fb2..a8ea4b7da 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "8c52c0955099cca3215a0d78fd455894"); + HCTest(CEUTRIO_BAM, "", "75013fa6a884104f0b1797502b636698"); } @Test @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "61c1a0fb62d909229af6b5a91dad8b35"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "3cd3363976b1937d801f9f82996f4abe"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java index 9c0062876..a83adc275 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java @@ -113,23 +113,20 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } - public static Allele getMostLikelyAllele(Map alleleMap) { - double minLike = Double.POSITIVE_INFINITY, maxLike = Double.NEGATIVE_INFINITY; + public static Allele getMostLikelyAllele( final Map alleleMap ) { + double maxLike = Double.NEGATIVE_INFINITY; + double prevMaxLike = Double.NEGATIVE_INFINITY; Allele mostLikelyAllele = Allele.NO_CALL; - for (Map.Entry el : alleleMap.entrySet()) { + for (final Map.Entry el : 
alleleMap.entrySet()) { if (el.getValue() > maxLike) { + prevMaxLike = maxLike; maxLike = el.getValue(); mostLikelyAllele = el.getKey(); + } else if( el.getValue() > prevMaxLike ) { + prevMaxLike = el.getValue(); } - - if (el.getValue() < minLike) - minLike = el.getValue(); - } - if (maxLike-minLike > INDEL_LIKELIHOOD_THRESH) - return mostLikelyAllele; - else - return Allele.NO_CALL; + return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL ); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index df088a4ad..72724e46a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("26af30187316f742878c85f0ed091837")); + Arrays.asList("48b4f4b05461be276bffc91350f08cbc")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("aa9cf96ab8f5aa844387e3aef1f27249")); + 
Arrays.asList("04affcc9d720ee17bc221759707e0cd2")); executeTest("test reverse trim", spec); } @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a")); + Arrays.asList("112e7bedfd284d4d9390aa006118c733")); executeTest("test mismatched PLs", spec); } @@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("7fc488fe16dea9f023bfcfdaa908a548")); + Arrays.asList("863ee56b3594f09795644127f2f9539f")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("f3ff7fe0f15f31eadd726c711d6bf3de")); + Arrays.asList("503ca1b75cc7b2679eaa80f7b5e7ef1c")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -452,5 +452,4 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); executeTest("test calling on a ReducedRead BAM", spec); } - } From 3db38c5a93d80ede80cb8eb53bb630a1a6ee9490 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 18 Oct 2012 15:42:14 -0400 Subject: [PATCH 16/54] Bug fix: inbreeding coeff shouldn't be computed in ref-only sites --- 
.../sting/gatk/walkers/annotator/InbreedingCoeff.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 64be64afa..9a4de3c36 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -48,7 +48,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) + if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; int idxAA = 0, idxAB = 1, idxBB = 2; From 403654d40aa8cafc05061ce09f0678e3ce55e768 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 18 Oct 2012 16:57:15 -0400 Subject: [PATCH 17/54] Fixed null checks in ArgumentTypeDescriptor due to ArgumentMatchValue updates. Fixed @Arguments such as scatter count that were labeled as java.io.File via incorrect @Input annotation. 
--- .../commandline/ArgumentTypeDescriptor.java | 6 ++--- .../qscripts/DataProcessingPipeline.scala | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 4b9774806..54ade61f6 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -532,7 +532,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { Object[] vals = type.getEnumConstants(); Object defaultEnumeration = null; // as we look at options, record the default option if it exists for (Object val : vals) { - if (String.valueOf(val).equalsIgnoreCase(value.asString())) return val; + if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val; try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } } @@ -546,10 +546,10 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { else throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); } else if (type.equals(File.class)) { - result = value.asFile(); + result = value == null ? null : value.asFile(); } else { Constructor ctor = type.getConstructor(String.class); - result = ctor.newInstance(value.asString()); + result = ctor.newInstance(value == null ? 
null : value.asString()); } } catch (UserException e) { throw e; diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 56f6460fb..165e6a4e9 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -13,6 +13,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.util.QScriptUtils import org.broadinstitute.sting.queue.function.ListWriterFunction import org.broadinstitute.sting.commandline.Hidden +import org.broadinstitute.sting.commandline class DataProcessingPipeline extends QScript { qscript => @@ -41,34 +42,34 @@ class DataProcessingPipeline extends QScript { @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ - @Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) + @Argument(doc="the project name determines the final output (BAM file) base name. 
Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) var projectName: String = "project" - @Input(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false) + @Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false) var outputDir: String = "" - @Input(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false) + @Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false) var intervalString: String = "" @Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false) var intervals: File = _ - @Input(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false) + @Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false) var cleaningModel: String = "USE_READS" - @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false) var useBWAse: Boolean = false - @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) var useBWApe: Boolean = false - @Input(doc="Decompose input BAM file and 
fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false) var useBWAsw: Boolean = false - @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) + @Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) var bwaThreads: Int = 1 - @Input(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false) + @Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false) var validation: Boolean = false @@ -76,15 +77,15 @@ class DataProcessingPipeline extends QScript { * Hidden Parameters ****************************************************************************/ @Hidden - @Input(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false) + @Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false) var nContigs: Int = -1 @Hidden - @Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) + @Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) var defaultPlatform: String = "" @Hidden - @Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) + @Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) var testMode: Boolean = false From 27d8d3f51e67699feaafc7aab40a8ed40bc11d4c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 11:59:34 -0400 Subject: [PATCH 18/54] RR optimization: don't recalculate the entire bitset of 
variant sites for every read added to the sliding window. Instead, reuse as much of the previously calculated bitset as you can (basically from the window start until the start of the new read minus the context size). In some awfully performing regions this cuts down the runtime in half, although in others this doesn't seem to help much (so clearly something else is going on). Note that I still need to fix one last bug here, but it's almost done. --- .../reducereads/SlidingWindow.java | 108 +++++++++++++----- 1 file changed, 81 insertions(+), 27 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 6fdf85317..63524ae82 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -24,7 +24,7 @@ import java.util.*; public class SlidingWindow { // Sliding Window data - final private LinkedList readsInWindow; + final private TreeSet readsInWindow; final private LinkedList windowHeader; protected int contextSize; // the largest context size (between mismatches and indels) protected String contig; @@ -97,7 +97,13 @@ public class SlidingWindow { this.MIN_MAPPING_QUALITY = minMappingQuality; this.windowHeader = new LinkedList(); - this.readsInWindow = new LinkedList(); + this.readsInWindow = new TreeSet(new Comparator() { + @Override + public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { + final int difference = read1.getSoftEnd() - read2.getSoftEnd(); + return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName()); + } + }); this.contig = contig; this.contigIndex = contigIndex; @@ -195,55 +201,102 @@ public class SlidingWindow { * @param incomingReadUnclippedStart the incoming read's start position. 
Must be the unclipped start! * @return all reads that have fallen to the left of the sliding window after the slide */ - protected List slideWindow(int incomingReadUnclippedStart) { + protected List slideWindow(final int incomingReadUnclippedStart) { List finalizedReads = new LinkedList(); - if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) { - int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader); - boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex); + final int windowHeaderStartLocation = getStartLocation(windowHeader); + + if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { + markSites(incomingReadUnclippedStart); + int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - List> regions = getAllVariantRegions(0, breakpoint, variantSite); + List> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet()); finalizedReads = closeVariantRegions(regions, false); - List readsToRemove = new LinkedList(); - final int windowHeaderStartLoc = getStartLocation(windowHeader); - for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) 
- if (read.getSoftEnd() < windowHeaderStartLoc) { - readsToRemove.add(read); - } - } - for (GATKSAMRecord read : readsToRemove) { - readsInWindow.remove(read); + while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { + readsInWindow.pollFirst(); } } return finalizedReads; } + + private final class MarkedSites { + + private boolean[] siteIsVariant = new boolean[0]; + private int startLocation = 0; + + public MarkedSites() {} + + public boolean[] getVariantSiteBitSet() { return siteIsVariant; } + + /** + * Updates the variant site bitset given the new startlocation and size of the region to mark. + * + * @param newStartLocation the new start location of the bitset + * @param sizeOfRegion the new size of the region to be represented + * + * @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder. + */ + public int updateRegion(final int newStartLocation, final int sizeOfRegion) { + int lastPositionMarked = sizeOfRegion; + + // if this is the first time we set the array and we can't reuse anything, just create a new array from scratch + if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) { + siteIsVariant = new boolean[sizeOfRegion]; + lastPositionMarked = 0; + } + // if the dimensions change, copy what we can and continue + else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) { + final boolean[] tempArray = new boolean[sizeOfRegion]; + final int differenceInStartPositions = newStartLocation - this.startLocation; + lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion); + System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked); + siteIsVariant = null; // explicitly allow garbage collection + siteIsVariant = tempArray; + } + + this.startLocation = newStartLocation; + + 
return lastPositionMarked + newStartLocation; + } + } + + private final MarkedSites markedSites = new MarkedSites(); + /** * returns an array marked with variant and non-variant regions (it uses * markVariantRegions to make the marks) * * @param stop check the window from start to stop (not-inclusive) - * @return a boolean array with 'true' marking variant regions and false marking consensus sites */ - protected boolean[] markSites(int stop) { + protected void markSites(final int stop) { - boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1]; + final int windowHeaderStartLocation = getStartLocation(windowHeader); + final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; + final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion); + final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize); + // update the iterator to the correct position Iterator headerElementIterator = windowHeader.iterator(); - for (int i = getStartLocation(windowHeader); i < stop; i++) { + for (int i = windowHeaderStartLocation; i < locationToProcess; i++) { + if (headerElementIterator.hasNext()) + headerElementIterator.next(); + } + + // process a contextSize worth of region from scratch in case there's a variant there + for (int i = locationToProcess; i < stop; i++) { if (headerElementIterator.hasNext()) { HeaderElement headerElement = headerElementIterator.next(); if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(markedSites, i - getStartLocation(windowHeader)); + markVariantRegion(markedSites, i - windowHeaderStartLocation); } else break; } - return markedSites; } /** @@ -252,11 +305,11 @@ public class SlidingWindow { * @param markedSites the boolean array to bear the marks * @param variantSiteLocation the location where a variant site was found */ - protected void 
markVariantRegion(boolean[] markedSites, int variantSiteLocation) { + protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) { int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; - int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1; + int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1; for (int i = from; i < to; i++) - markedSites[i] = true; + markedSites.getVariantSiteBitSet()[i] = true; } /** @@ -625,8 +678,8 @@ public class SlidingWindow { List finalizedReads = new LinkedList(); if (!windowHeader.isEmpty()) { - boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1); - List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); + markSites(getStopLocation(windowHeader) + 1); + List> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet()); finalizedReads = closeVariantRegions(regions, true); if (!windowHeader.isEmpty()) { @@ -635,6 +688,7 @@ public class SlidingWindow { } } + return finalizedReads; } From d3cf37dfaf47200bbddb8aaaa9e099e8658ce72c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 12:01:45 -0400 Subject: [PATCH 19/54] Bug fix for general ploidy model: when choosing the most likely alternate allele(s), you need to weight the likelihood mass by the ploidy of the specific alleles (otherwise all alt alleles will have the same probability). This fixes Yossi's issue with pooled validation calling. This may break integration tests, but I will leave that to GdA to handle. 
--- .../walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 1a864d3d8..903e733ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -128,6 +128,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @return list of numAllelesToChoose most likely alleles */ + private static final int PL_INDEX_OF_HOM_REF = 0; private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; @@ -143,7 +144,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele for (int k=1; k < acCount.length;k++) { if (acCount[k] > 0) - likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL]; + likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); } } From f08e5a44daa2584538acb39f5bdba445b941e6f5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 12:11:18 -0400 Subject: [PATCH 20/54] Better implementation of GATKSAMRecord.getSoftStart() --- .../sting/utils/sam/GATKSAMRecord.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c6df449a3..8c3d83874 100755 --- 
a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -387,14 +387,10 @@ public class GATKSAMRecord extends BAMRecord { */ public int getSoftStart() { if (softStart < 0) { - int start = this.getUnclippedStart(); - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; - } - softStart = start; + softStart = getAlignmentStart(); + final CigarElement firstCig = getCigar().getCigarElement(0); + if (firstCig.getOperator() == CigarOperator.HARD_CLIP) + softStart -= firstCig.getLength(); } return softStart; } From f7bd4998fc42560d3c1561772c2e5d48b34dbe7f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 12:13:59 -0400 Subject: [PATCH 21/54] No need for dummy GLs --- .../gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 903e733ac..0e97c090c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -136,7 +136,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes(), true); + final ArrayList GLs = getGLs(vc.getGenotypes(), false); for ( final double[] likelihoods : GLs ) { final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); From 
9c088fe3fee42a9a4b81e4b449bd4105cce53b9d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 12:41:24 -0400 Subject: [PATCH 22/54] Actually a better implementation of GATKSAMRecord.getSoftStart(). Last commit was all wrong. Oops. --- .../broadinstitute/sting/utils/sam/GATKSAMRecord.java | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 8c3d83874..1feb76517 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -388,9 +388,14 @@ public class GATKSAMRecord extends BAMRecord { public int getSoftStart() { if (softStart < 0) { softStart = getAlignmentStart(); - final CigarElement firstCig = getCigar().getCigarElement(0); - if (firstCig.getOperator() == CigarOperator.HARD_CLIP) - softStart -= firstCig.getLength(); + for (final CigarElement cig : getCigar().getCigarElements()) { + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP) + softStart -= cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) + break; + } } return softStart; } From 4622896312de494ed876a8c29dd00b8d5f22351b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 19 Oct 2012 13:04:05 -0400 Subject: [PATCH 23/54] Oops, killed contracts --- .../sting/gatk/walkers/compression/reducereads/BaseCounts.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 3a3905710..778b8300a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -227,7 +227,7 @@ import com.google.java.contract.Requires; * @param base base * @return the proportion of this base over all other bases except indels */ - @Requires("index.isNucleotide()") + @Requires("base.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) public double baseCountProportionWithoutIndels(final BaseIndex base) { final int total = totalCountWithoutIndels(); From 2ef456d51a024b551e8a6ba761b3e36191b6f20f Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Fri, 19 Oct 2012 13:19:56 -0400 Subject: [PATCH 24/54] Added explicit @ClassType annotations to @Argument for Option[Int] or Option[Double] since scala seems to change the reflected type to Option[Object] on some systems. Changed ReflectionUtils.getGenericTypes' order of looking for @ClassType since the primitive generic wasn't completely erased, only changed to Object which is incorrect. More fixes to @Arguments labeled as java.io.File via incorrect @Input annotation. Put in a default undocumented implementation of @Argument doc() to match the one added to @Input. --- .../broadinstitute/sting/commandline/Argument.java | 2 +- .../queue/qscripts/PacbioProcessingPipeline.scala | 12 ++++++------ .../org/broadinstitute/sting/queue/QSettings.scala | 7 ++++++- .../sting/queue/util/ReflectionUtils.scala | 9 ++++----- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java index 33592287d..67ce8a863 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/Argument.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Argument.java @@ -62,7 +62,7 @@ public @interface Argument { * --help argument is specified. * @return Doc string associated with this command-line argument. 
*/ - String doc(); + String doc() default "Undocumented option"; /** * Is this argument required. If true, the command-line argument system will diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index a4a6636fe..ef73840b3 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -27,28 +27,28 @@ class PacbioProcessingPipeline extends QScript { @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) var dbSNP: File = _ - @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + @Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) var threads: Int = 0 - @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) var sample: String = "NA" @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ - @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) + @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) var BLASR_BAM: Boolean = false @Hidden - @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." 
, shortName = "dbq", required=false) var dbq: Int = 20 @Hidden - @Input(shortName="bwastring", required=false) + @Argument(shortName="bwastring", required=false) var bwastring: String = "" @Hidden - @Input(shortName = "test", fullName = "test_mode", required = false) + @Argument(shortName = "test", fullName = "test_mode", required = false) var testMode: Boolean = false val queueLogDir: String = ".qlog/" diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 429428c4c..2c0f43bac 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -25,7 +25,7 @@ package org.broadinstitute.sting.queue import java.io.File -import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.commandline.{ClassType, Argument} /** * Default settings settable on the command line and passed to CommandLineFunctions. @@ -41,6 +41,7 @@ class QSettings { var jobQueue: String = _ @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) + @ClassType(classOf[Int]) var jobPriority: Option[Int] = None @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) @@ -53,15 +54,19 @@ class QSettings { var jobEnvironmentNames: Seq[String] = Nil @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. 
If not set defaults to 2GB.", required=false) + @ClassType(classOf[Double]) var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var memoryLimitThreshold: Option[Double] = None @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentLimit: Option[Double] = None @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentRequest: Option[Double] = None @Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala index 980a22e8e..15101fd75 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala @@ -159,12 +159,11 @@ object ReflectionUtils { private def getGenericTypes(field: Field): Option[Array[Class[_]]] = { // TODO: Refactor: based on java code in org.broadinstitute.sting.commandline.ArgumentTypeDescriptor // If this is a parameterized collection, find the contained type. If blow up if only one type exists. 
- if (field.getGenericType.isInstanceOf[ParameterizedType]) { + if (hasAnnotation(field, classOf[ClassType])) { + Some(Array(getAnnotation(field, classOf[ClassType]).value)) + } else if (field.getGenericType.isInstanceOf[ParameterizedType]) { val parameterizedType = field.getGenericType.asInstanceOf[ParameterizedType] Some(parameterizedType.getActualTypeArguments.map(_.asInstanceOf[Class[_]])) - } else if (hasAnnotation(field, classOf[ClassType])) { - Some(Array(getAnnotation(field, classOf[ClassType]).value)) - } - else None + } else None } } From 637e0cf1512c68e8964b3e1d5bce6d0755cad61b Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 11 Oct 2012 13:59:21 -0400 Subject: [PATCH 27/54] CountReads does not permit the use of output files --- .../src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 301fa5b9b..1d2c6c9cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T CountReads \ - * -o output.txt \ * -I input.bam \ * [-L input.intervals] * From 45f64425a3f4401761845791e25693476c065afe Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 19 Oct 2012 14:29:02 -0400 Subject: [PATCH 28/54] Update read metrics per shard rather than locus --- .../sting/gatk/traversals/TraverseActiveRegions.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 2b7b2f9f5..5d38df0f5 100644 --- 
a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -104,10 +104,11 @@ public class TraverseActiveRegions extends TraversalEngine Date: Sat, 20 Oct 2012 16:38:18 -0400 Subject: [PATCH 30/54] Refactoring the PairHMM util class to allow for multiple implementations which can be specified by the callers via an enum argument. Adding an optimized PairHMM implementation which caches per-read calculations as well as a logless implementation which drastically reduces the runtime of the HMM while also increasing the precision of the result. In the HaplotypeCaller we now lexicographically sort the haplotypes to take maximal benefit of the haplotype offset optimization which only recalculates the HMM matrices after the first differing base in the haplotype. Many thanks to Mauricio for all the initial groundwork for these optimizations. The change to the one HC integration test is in the fourth decimal of HaplotypeScore. 
--- .../gatk/walkers/genotyper/ErrorModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../haplotypecaller/HaplotypeCaller.java | 12 +- .../LikelihoodCalculationEngine.java | 53 ++-- .../sting/utils/pairhmm/CachingPairHMM.java | 181 ++++++++++++ .../utils/pairhmm/LoglessCachingPairHMM.java | 187 +++++++++++++ .../HaplotypeCallerIntegrationTest.java | 2 +- .../sting/utils/pairhmm}/PairHMMUnitTest.java | 242 ++++++---------- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 13 +- .../indels/PairHMMIndelErrorModel.java | 42 +-- .../broadinstitute/sting/utils/Haplotype.java | 16 ++ .../broadinstitute/sting/utils/PairHMM.java | 259 ------------------ .../sting/utils/pairhmm/ExactPairHMM.java | 107 ++++++++ .../sting/utils/pairhmm/OriginalPairHMM.java | 105 +++++++ .../sting/utils/pairhmm/PairHMM.java | 45 +++ 16 files changed, 813 insertions(+), 457 deletions(-) create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java create mode 100644 protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java rename {public/java/test/org/broadinstitute/sting/utils => protected/java/test/org/broadinstitute/sting/utils/pairhmm}/PairHMMUnitTest.java (56%) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/PairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index f76225134..8042c15d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -72,7 +72,7 @@ public class ErrorModel { haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index fc0c526bc..f09a1ea3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); haplotypeMap = new LinkedHashMap(); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 71e4f5f8a..5f2b5775c 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -52,6 +52,7 @@ import 
org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; @@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); + likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } @@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from 
genotyping which fail mapping quality based criteria if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! + // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM + Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() ); + // evaluate each sample's reads against all haplotypes final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 14c1cd59d..62554c4ab 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -30,6 +30,9 @@ import com.google.java.contract.Requires; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine { private final boolean DEBUG; private final PairHMM pairHMM; - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { - pairHMM = new PairHMM( noBanded ); + public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final 
PairHMM.HMM_IMPLEMENTATION hmmType ) { + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + pairHMM = new CachingPairHMM(); + break; + case LOGLESS_CACHING: + pairHMM = new LoglessCachingPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); + } + this.constantGCP = constantGCP; DEBUG = debug; } @@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine { X_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2; - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() ); } } - private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample, - final double[][] matchMetricArray, final double[][] XMetricArray, 
final double[][] YMetricArray ) { + private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) { final int numHaplotypes = haplotypes.size(); final int numReads = reads.size(); @@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine { final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype; - readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), - readQuals, readInsQuals, readDelQuals, overallGCP, - haplotypeStart, matchMetricArray, XMetricArray, YMetricArray); + readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(), + readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0); readCounts[jjj][iii] = readCount; } } @@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine { return iii; } } - return b1.length; + return Math.min(b1.length, b2.length); } @Requires({"haplotypes.size() > 0"}) @@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype + bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype // set up the default 1-to-1 haplotype mapping object final ArrayList> haplotypeMapping = new ArrayList>(); for( final Haplotype h : haplotypes ) { @@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } + public static int findReferenceIndex( final List haplotypes ) { + for( final Haplotype h : haplotypes ) { + if( h.isReference() ) { return 
haplotypes.indexOf(h); } + } + throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); + } + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java new file mode 100644 index 000000000..282db45d5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class CachingPairHMM extends OriginalPairHMM { + + double[][] constantMatrix = null; // The cache in the CachingPairHMM + double[][] distanceMatrix = null; // The cache in the CachingPairHMM + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProbLog10(DEFAULT_GCP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GOP), + QualityUtils.qualToErrorProbLog10(DEFAULT_GCP), + 0.0, + 0.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, 
hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? 
+ QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. + * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]); + } + constantMatrix[l+1][4] = 0.0; + constantMatrix[l+1][5] = 0.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, 
final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior + + MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0], + XMetricArray[indI - 1][indJ - 1] + constants[1], + YMetricArray[indI - 1][indJ - 1] + constants[1] ); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2], + XMetricArray[indI - 1][indJ] + constants[3]); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4], + YMetricArray[indI][indJ - 1] + constants[5]); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java new file mode 100644 index 000000000..d2aef5bb5 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ + +public class LoglessCachingPairHMM extends CachingPairHMM { + + protected static final double SCALE_FACTOR_LOG10 = 300.0; + + protected static final double [] firstRowConstantMatrix = { + QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)), + QualityUtils.qualToProb(DEFAULT_GCP), + QualityUtils.qualToErrorProb(DEFAULT_GOP), + QualityUtils.qualToErrorProb(DEFAULT_GCP), + 1.0, + 1.0 + }; + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], 0.0); + Arrays.fill(XMetricArray[iii], 0.0); + Arrays.fill(YMetricArray[iii], 0.0); + } + + // the initial condition + matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0); + + constantMatrix = new double[X_METRIC_LENGTH][6]; + distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + // fill in the first row + for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) { + updateCell(1, jjj, 1.0, 
firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray); + } + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + if( recacheReadValues ) { + initializeConstants( insertionGOP, deletionGOP, overallGCP ); + } + initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex ); + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + for (int i = 2; i < X_METRIC_LENGTH; i++) { + for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) { + updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. 
+ * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) + */ + public void initializeDistanceMatrix( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final int startIndex ) { + + // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases + // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + public void initializeConstants( final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP ) { + + final int l = insertionGOP.length; + constantMatrix[1] = firstRowConstantMatrix; + for (int i = 0; i < l; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP); + constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]); + constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]); + constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]); + constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]); + constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + constantMatrix[l+1][4] = 1.0; + constantMatrix[l+1][5] = 1.0; + } + + /** + * Updates a cell in the HMM matrix + * + * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the + * initial conditions + + * @param indI row index in the matrices to update + * @param indJ column index in the matrices to update + * @param prior the likelihood editing distance matrix for the read x haplotype + * @param constants an array with the six constants relevant to this location + * @param matchMetricArray the matches likelihood matrix + * @param XMetricArray the insertions likelihood matrix + * @param YMetricArray the deletions likelihood matrix + */ + private void updateCell( final int indI, final int indJ, final double prior, final double[] constants, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] + + XMetricArray[indI - 1][indJ - 1] * constants[1] + + 
YMetricArray[indI - 1][indJ - 1] * constants[1] ); + XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3]; + YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5]; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a8ea4b7da..a441e6c77 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -70,7 +70,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java similarity index 56% rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 22bcb1bbf..6281054b1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -23,24 +23,26 @@ */ // our package 
-package org.broadinstitute.sting.utils; +package org.broadinstitute.sting.utils.pairhmm; // the imports for unit testing. - import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - public class PairHMMUnitTest extends BaseTest { final static boolean EXTENSIVE_TESTING = true; - PairHMM hmm = new PairHMM( false ); // reference implementation - PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding + PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation + PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation + PairHMM cachingHMM = new CachingPairHMM(); + PairHMM loglessHMM = new LoglessCachingPairHMM(); // -------------------------------------------------------------------------------- // @@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { + public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } @@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest { } public double expectedLogL() { - return expectedQual / -10.0; + return (expectedQual / -10.0) + 0.03 ; } - public double tolerance() { - return 0.1; // TODO FIXME arbitrary + public double toleranceFromTheoretical() { + return 0.2; } - public double calcLogL() { + public double toleranceFromReference() { + return 1E-4; + } - double logL 
= hmm.computeReadLikelihoodGivenHaplotype( + public double toleranceFromExact() { + return 1E-9; + } + + public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) { + pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length); + return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, - qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), - qualAsBytes(gcp, false)); - - return logL; + qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), + qualAsBytes(gcp, false, anchorIndel), 0, true); } private final byte[] asBytes(final String bases, final boolean left, final boolean right) { return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } - private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { + private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { final byte phredQuals[] = new byte[readBasesWithContext.length]; - // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM - Arrays.fill(phredQuals, (byte)100); - // update just the bases corresponding to the provided micro read with the quality scores - if( doGOP ) { - phredQuals[0 + CONTEXT.length()] = (byte)phredQual; - } else { - for ( int i = 0; i < read.length(); i++) - phredQuals[i + CONTEXT.length()] = (byte)phredQual; - } + if( anchorIndel ) { + // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM + Arrays.fill(phredQuals, (byte)100); - return phredQuals; - } - } - - final Random random = new Random(87865573); - private class BandedLikelihoodTestProvider extends TestDataProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; - final int baseQual, insQual, delQual, gcp; - final int expectedQual; - final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC"; - 
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA"; - final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT"; - final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA"; - final byte[] baseQuals, insQuals, delQuals, gcps; - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); - } - - public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { - super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual)); - this.baseQual = baseQual; - this.delQual = delQual; - this.insQual = insQual; - this.gcp = gcp; - this.read = read; - this.ref = ref; - this.expectedQual = expectedQual; - - refBasesWithContext = asBytes(ref, left, right); - readBasesWithContext = asBytes(read, false, false); - baseQuals = qualAsBytes(baseQual); - insQuals = qualAsBytes(insQual); - delQuals = qualAsBytes(delQual); - gcps = qualAsBytes(gcp, false); - } - - public double expectedLogL() { - double logL = hmm.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - public double tolerance() { - return 0.2; // TODO FIXME arbitrary - } - - public double calcLogL() { - - double logL = bandedHMM.computeReadLikelihoodGivenHaplotype( - refBasesWithContext, readBasesWithContext, - baseQuals, insQuals, delQuals, gcps); - - return logL; - } - - private final byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? 
LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); - } - - private byte[] qualAsBytes(final int phredQual) { - return qualAsBytes(phredQual, true); - } - - private byte[] qualAsBytes(final int phredQual, final boolean addRandom) { - final byte phredQuals[] = new byte[readBasesWithContext.length]; - Arrays.fill(phredQuals, (byte)phredQual); - if(addRandom) { - for( int iii = 0; iii < phredQuals.length; iii++) { - phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3)); + // update just the bases corresponding to the provided micro read with the quality scores + if( doGOP ) { + phredQuals[0 + CONTEXT.length()] = (byte)phredQual; + } else { + for ( int i = 0; i < read.length(); i++) + phredQuals[i + CONTEXT.length()] = (byte)phredQual; } + } else { + Arrays.fill(phredQuals, (byte)phredQual); } + return phredQuals; } } @@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest { // test all combinations final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { @@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest { for ( boolean insertionP : Arrays.asList(true, false)) { final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); + final String big = Utils.dupString((char) base, size); final String ref = insertionP ? 
small : big; final String read = insertionP ? big : small; @@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest { return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) - public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); - double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); - } - - @DataProvider(name = "BandedLikelihoodTestProvider") - public Object[][] makeBandedLikelihoodTests() { + final Random random = new Random(87860573); + @DataProvider(name = "OptimizedLikelihoodTestProvider") + public Object[][] makeOptimizedLikelihoodTests() { // context on either side is ACGTTGCA REF ACGTTGCA // test all combinations - final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); - final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); - final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); - final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); + final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); + final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); + final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); + final List sizes = EXTENSIVE_TESTING ? 
Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2); for ( final int baseQual : baseQuals ) { for ( final int indelQual : indelQuals ) { for ( final int gcp : gcps ) { - - // test substitutions - for ( final byte refBase : BaseUtils.BASES ) { - for ( final byte readBase : BaseUtils.BASES ) { - final String ref = new String(new byte[]{refBase}); - final String read = new String(new byte[]{readBase}); - final int expected = refBase == readBase ? 0 : baseQual; - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - } - } - - // test insertions and deletions - for ( final int size : sizes ) { - for ( final byte base : BaseUtils.BASES ) { - final int expected = indelQual + (size - 2) * gcp; - - for ( boolean insertionP : Arrays.asList(true, false)) { - final String small = Utils.dupString((char)base, 1); - final String big = Utils.dupString((char)base, size); - - final String ref = insertionP ? small : big; - final String read = insertionP ? big : small; - - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true); - new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true); + for ( final int refSize : sizes ) { + for ( final int readSize : sizes ) { + String ref = ""; + String read = ""; + for( int iii = 0; iii < refSize; iii++) { + ref += (char) BaseUtils.BASES[random.nextInt(4)]; } + for( int iii = 0; iii < readSize; iii++) { + read += (char) BaseUtils.BASES[random.nextInt(4)]; + } + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, 
indelQual, -0, gcp, false, true); + new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true); } } } } } - return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); + return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); } - @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) - public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { - double calculatedLogL = cfg.calcLogL(); + @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) + public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, true ); + double calculatedLogL = cfg.calcLogL( originalHMM, true ); + double optimizedLogL = cfg.calcLogL( cachingHMM, true ); + double loglessLogL = cfg.calcLogL( loglessHMM, true ); double expectedLogL = cfg.expectedLogL(); - logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); - Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); + //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical()); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); + } + + @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true) + public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) { + double exactLogL = cfg.calcLogL( exactHMM, false ); + double calculatedLogL = cfg.calcLogL( originalHMM, false ); + double optimizedLogL = cfg.calcLogL( cachingHMM, false ); + double loglessLogL = cfg.calcLogL( loglessHMM, false ); + 
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString())); + Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference()); + Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact()); } @Test @@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); @@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest { byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); - double res1 = hmm.computeReadLikelihoodGivenHaplotype( + originalHMM.initialize(mread.length, haplotype1.length); + double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10( haplotype1, mread, quals, gop, gop, - gcp); - + gcp, 0, false); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ebfbc49fe..e0ffb2ba6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 885463fcb..3eda2017c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import 
org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -65,6 +66,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. 
@@ -112,10 +119,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -221,10 +224,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; uac.exactCallsLog = exactCallsLog; + uac.pairHMM = pairHMM; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 9234a9fe8..3d287057c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import 
org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables ///////////////////////////// @@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. 
Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel { final int X_METRIC_LENGTH = readBases.length+2; final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { + if (previousHaplotypeSeen == null) { //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); } int startIndexInHaplotype = 0; @@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel { 
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); if (DEBUG) { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index befd24307..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List 
alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
- * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] 
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). 
Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off 
the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, 
haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? 
MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? 
MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. 
Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} From 2c624f76c83f8cfec61001f89928ebe39ab8d713 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 20:35:54 -0400 Subject: [PATCH 31/54] Refactoring the Unified (and Standard) Argument Collections because it was really ugly that the subclass had to do all the cloning for the super class. The clone() method is really not recommended best practice in Java anyways, so I changed it so that we use standard overloaded constructors. Confirmed that the Haplotype Caller --help docs do not include UG-specific arguments. 
--- .../haplotypecaller/HaplotypeCaller.java | 4 +- .../StandardCallerArgumentCollection.java | 26 ++++++ .../genotyper/UnifiedArgumentCollection.java | 90 +++++++------------ 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5f2b5775c..6d6351fc5 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -241,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker implem samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested - UnifiedArgumentCollection 
simpleUAC = UAC.clone(); + UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 085a60191..9b9f04228 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -69,7 +69,33 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + /** + * If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Hidden + @Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_PERCENTAGE = 0.0; + @Hidden @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; + + + public StandardCallerArgumentCollection() { } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
+ public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) { + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; + this.exactCallsLog = SCAC.exactCallsLog; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 3eda2017c..17137c5e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -186,63 +186,41 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! 
- public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; - uac.exactCallsLog = exactCallsLog; - uac.pairHMM = pairHMM; - - // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - return uac; - } - public UnifiedArgumentCollection() { } - public UnifiedArgumentCollection( final 
StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; - this.exactCallsLog = SCAC.exactCallsLog; + public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { + super(SCAC); + } + + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.NO_SLOD = uac.NO_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower 
= uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; + + // todo- arguments to remove + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } From 841a906f210ac363817fe3e84794281501e4b30f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sat, 20 Oct 2012 23:31:56 -0400 Subject: [PATCH 32/54] Adding a hidden (for now) argument to UG (and HC) that tells the caller that the incoming samples are contaminated by N% and to fix it by aggressively down-sampling all alleles. This actually works. Yes, you read that right: given that we know what N is, we can make good calls on bams that have N% contamination. Only hooked up for SNPS right now. No tests added yet. --- ...NPGenotypeLikelihoodsCalculationModel.java | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 76ba72017..d053e13a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + private final ArrayList[] alleleStratifiedElements 
= new ArrayList[4]; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 ) + pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); @@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC return allelesToUse; } + public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) { + // special case removal of all reads + if ( contaminationPercentage >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + final int baseIndex = 
BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. + int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage); + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() > numReadsToRemove ) + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { final List BAQedElements = new ArrayList(); for( final PileupElement PE : pileup ) { @@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } } + private List downsampleElements(final ArrayList elements, final int numElementsToRemove) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList 
elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( !itemsToRemove.get(i) ) + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + private static class SampleGenotypeData { public final String name; From d44d5b827538789bae68c47c1634080e6c7d04c9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 01:29:59 -0400 Subject: [PATCH 33/54] Fix RawHapMapCodec so that it can build indexes. Minor fixes to VCF codec. --- .../sting/utils/codecs/hapmap/RawHapMapCodec.java | 9 +++++++++ .../broadinstitute/sting/utils/codecs/vcf/VCFCodec.java | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java 
index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); From 0616b98551e7b3dbfa571ff93c65c9a1ef645029 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 21 Oct 2012 08:26:26 -0400 Subject: [PATCH 34/54] Not sure why we were setting the UAC variables instead of the simpleUAC ones when that's what we wanted. 
--- .../gatk/walkers/haplotypecaller/HaplotypeCaller.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6d6351fc5..a08614deb 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -242,13 +242,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); - UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.GenotypingMode = 
GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); From 67b9e7319e9be26d49ae1d75ea68aedbb0a0cfd0 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 21 Oct 2012 12:38:33 -0400 Subject: [PATCH 35/54] Fix for integration tests: new criterion in AF exact calculation model to trim alleles based on likelihoods does produce better results and resulting alleles changed in 2 sites at integration tests (and all subsequent sites after this had minor annotation differences due to RankSum dithering) --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 989f06ec5..652489a71 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9acfe0019efdc91217ee070acb071228"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 
3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c1d4dd793f61710a1b1fc5d82803210f"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); } @Test(enabled = true) From 9c63cee9fcdb69a7a8e8d77a771ddb2afa18f7cd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 18:36:14 -0400 Subject: [PATCH 36/54] Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller --- .../arguments/StandardCallerArgumentCollection.java | 10 +++++++++- .../walkers/genotyper/UnifiedArgumentCollection.java | 11 +++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 9b9f04228..a511364f9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; @@ -82,7 +83,6 @@ public class StandardCallerArgumentCollection { @Argument(shortName = "logExactCalls", doc="x", required=false) public File exactCallsLog = null; - public StandardCallerArgumentCollection() { } // Developers must remember to add any newly 
added arguments to the list here as well otherwise they won't get changed from their default value! @@ -97,5 +97,13 @@ public class StandardCallerArgumentCollection { this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE; this.exactCallsLog = SCAC.exactCallsLog; + this.AFmodel = SCAC.AFmodel; } + + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ + @Advanced + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 17137c5e9..abf0b4420 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +<<<<<<< HEAD import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; +======= +>>>>>>> 19181ee... 
Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -38,13 +41,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -219,7 +215,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; - // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From 99c9031cb4c20cf996202329ca17978ea1fde59e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 17 Oct 2012 20:41:33 -0400 Subject: [PATCH 37/54] Merge AFCalcResultTracker into StateTracker, cleanup -- These two classes were really the same, and now they are actually the same! 
-- Cleaned up the interfaces, removed duplicate data -- Added lots of contracts, some of which found numerical issues with GeneralPloidyExactAFCalc (which have been patched over but not fixed) -- Moved goodProbability and goodProbabilityVector utilities to MathUtils. Very useful for contracts! --- .../afcalc/GeneralPloidyExactAFCalc.java | 320 ++++++------------ ...neralPloidyAFCalculationModelUnitTest.java | 9 +- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 30 +- .../genotyper/afcalc/AFCalcResult.java | 55 +-- .../genotyper/afcalc/AFCalcResultTracker.java | 256 -------------- .../genotyper/afcalc/DiploidExactAFCalc.java | 55 ++- .../afcalc/OriginalDiploidExactAFCalc.java | 4 - .../genotyper/afcalc/StateTracker.java | 304 ++++++++++++++--- .../broadinstitute/sting/utils/MathUtils.java | 33 ++ 9 files changed, 440 insertions(+), 626 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 0e97c090c..3916c2549 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; -import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -69,8 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { @Override public AFCalcResult
computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { - combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker()); - return resultFromTracker(vc, log10AlleleFrequencyPriors); + combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } /** @@ -171,13 +170,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alternate alleles * @param ploidyPerPool Number of samples per pool * @param log10AlleleFrequencyPriors Frequency priors - * @param resultTracker object to fill with output values */ - protected static void combineSinglePools(final GenotypesContext GLs, - final int numAlleles, - final int ploidyPerPool, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + protected void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors) { final ArrayList genotypeLikelihoods = getGLs(GLs, true); @@ -196,24 +193,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { if ( genotypeLikelihoods.size() <= 1 ) { // no meaningful GLs at all, just set the tracker to non poly values - resultTracker.reset(); // just mimic-ing call below - resultTracker.setLog10LikelihoodOfAFzero(0.0); + getStateTracker().reset(); // just mimic-ing call below + getStateTracker().setLog10LikelihoodOfAFzero(0.0); } else { for (int p=1; p ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects final HashMap indexesToACset = new HashMap(); @@ -230,16 +227,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - StateTracker stateTracker = new StateTracker(); while ( !ACqueue.isEmpty() 
) { - resultTracker.incNEvaluations(); + getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset); - - // adjust max likelihood seen if needed - if ( log10LofKs > stateTracker.getMaxLog10L()) - stateTracker.update(log10LofKs, ACset.getACcounts()); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -260,39 +252,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param log10AlleleFrequencyPriors Prior object * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector - * @param resultTracker AFResult object - * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood */ - private static double calculateACConformationAndUpdateQueue(final ExactACset set, - final CombinedPoolLikelihoods newPool, - final CombinedPoolLikelihoods originalPool, - final double[] newGL, - final double[] log10AlleleFrequencyPriors, - final int originalPloidy, - final int newGLPloidy, - final AFCalcResultTracker resultTracker, - final StateTracker stateTracker, - final LinkedList ACqueue, - final HashMap indexesToACset) { + private double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final LinkedList ACqueue, + final HashMap 
indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); - final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy); // add to new pool if (!Double.isInfinite(log10LofK)) newPool.add(set); - // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { - if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); + // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) + if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) { return log10LofK; } @@ -323,67 +308,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } - /** - * Naive combiner of two multiallelic pools - number of alt alleles must be the same. - * Math is generalization of biallelic combiner. 
- * - * For vector K representing an allele count conformation, - * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) - * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) - * @param originalPool First log-likelihood pool GL vector - * @param yy Second pool GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param numAlleles Number of alleles - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - */ - public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { -/* - final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); - final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); - - if (dim1 != originalPool.getLength() || dim2 != yy.length) - throw new ReviewedStingException("BUG: Inconsistent vector length"); - - if (ploidy2 == 0) - return; - - final int newPloidy = ploidy1 + ploidy2; - - // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) - // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); - final double[] x = originalPool.getLikelihoodsAsVector(true); - while(firstIterator.hasNext()) { - x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); - firstIterator.next(); - } - - GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - final double[] y = yy.clone(); - while(secondIterator.hasNext()) { - y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); - 
secondIterator.next(); - } - - // initialize output to -log10(choose(m1+m2,[k1 k2...]) - final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); - - - // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K - while(outputIterator.hasNext()) { - final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); - double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); - - originalPool.add(likelihood, set, outputIterator.getLinearIndex()); - outputIterator.next(); - } -*/ - } +// /** +// * Naive combiner of two multiallelic pools - number of alt alleles must be the same. +// * Math is generalization of biallelic combiner. +// * +// * For vector K representing an allele count conformation, +// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) +// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) +// * @param originalPool First log-likelihood pool GL vector +// * @param yy Second pool GL vector +// * @param ploidy1 Ploidy of first pool (# of chromosomes in it) +// * @param ploidy2 Ploidy of second pool +// * @param numAlleles Number of alleles +// * @param log10AlleleFrequencyPriors Array of biallelic priors +// * @param resultTracker Af calculation result object +// */ +// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, +// final double[] log10AlleleFrequencyPriors, +// final AFCalcResultTracker resultTracker) { +///* +// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); +// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); +// +// if (dim1 != originalPool.getLength() || dim2 != yy.length) +// 
throw new ReviewedStingException("BUG: Inconsistent vector length"); +// +// if (ploidy2 == 0) +// return; +// +// final int newPloidy = ploidy1 + ploidy2; +// +// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) +// // and L2(K) = Pr(D|AC2=K) * choose(m2,K) +// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); +// final double[] x = originalPool.getLikelihoodsAsVector(true); +// while(firstIterator.hasNext()) { +// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); +// firstIterator.next(); +// } +// +// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); +// final double[] y = yy.clone(); +// while(secondIterator.hasNext()) { +// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); +// secondIterator.next(); +// } +// +// // initialize output to -log10(choose(m1+m2,[k1 k2...]) +// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); +// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); +// +// +// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K +// while(outputIterator.hasNext()) { +// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); +// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); +// +// originalPool.add(likelihood, set, outputIterator.getLinearIndex()); +// outputIterator.next(); +// } +//*/ +// } /** * Compute likelihood of a particular AC conformation and update AFresult object @@ -394,15 +379,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param numAlleles Number of alleles 
(including ref) * @param ploidy1 Ploidy of original pool (combined) * @param ploidy2 Ploidy of new pool - * @param resultTracker AFResult object * @return log-likehood of requested conformation */ - private static double computeLofK(final ExactACset set, - final CombinedPoolLikelihoods firstGLs, - final double[] secondGL, - final double[] log10AlleleFrequencyPriors, - final int numAlleles, final int ploidy1, final int ploidy2, - final AFCalcResultTracker resultTracker) { + private double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2) { final int newPloidy = ploidy1 + ploidy2; @@ -420,8 +403,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; set.getLog10Likelihoods()[0] = log10Lof0; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return log10Lof0; } else { @@ -464,14 +447,14 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - resultTracker.updateMLEifNeeded(log10LofK, altCounts); + getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, altCounts); + getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? 
MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); return log10LofK; } @@ -494,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { return (sum == ploidy); } - /** - * Combines naively two biallelic pools (of arbitrary size). - * For two pools of size m1 and m2, we can compute the combined likelihood as: - * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) - * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i - * @param newPLVector Second GL vector - * @param ploidy1 Ploidy of first pool (# of chromosomes in it) - * @param ploidy2 Ploidy of second pool - * @param log10AlleleFrequencyPriors Array of biallelic priors - * @param resultTracker Af calculation result object - * @return Combined likelihood vector - */ - public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, - final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { - - final int newPloidy = ploidy1 + ploidy2; - - final double[] combinedLikelihoods = new double[1+newPloidy]; - - /** Pre-fill result array and incorporate weights into input vectors - * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) - * and L2(k) = Pr(D|AC2=k) * choose(m2,k) - * equation reduces to - * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) - * which is just plain convolution of L1 and L2 (with pre-existing vector) - */ - - // intialize result vector to -infinity - Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); - - final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); - for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) - x[k] = Double.NEGATIVE_INFINITY; - - final double[] y = newPLVector.clone(); - - - final double log10Lof0 = x[0]+y[0]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - 
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - - double maxElement = log10Lof0; - int maxElementIdx = 0; - int[] alleleCounts = new int[1]; - for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { - double[] acc = new double[k+1]; - Arrays.fill(acc,Double.NEGATIVE_INFINITY); - double innerMax = Double.NEGATIVE_INFINITY; - - for (int j=0; j <=k; j++) { - double x1,y1; - - - if (k-j>=0 && k-j < y.length) - y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); - else - continue; - - if (j < x.length) - x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); - else - continue; - - if (Double.isInfinite(x1) || Double.isInfinite(y1)) - continue; - acc[j] = x1 + y1; - if (acc[j] > innerMax) - innerMax = acc[j]; - else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) - break; - } - combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); - maxElementIdx = k; - double maxDiff = combinedLikelihoods[k] - maxElement; - if (maxDiff > 0) - maxElement = combinedLikelihoods[k]; - else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { - break; - } - - alleleCounts[0] = k; - resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); - resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); - - - } - - - return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); - } - - /** * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * including updating the PL's, and assign genotypes accordingly @@ -675,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * * @return genotype */ - private static void assignGenotype(final GenotypeBuilder gb, - final double[] newLikelihoods, - final List allelesToUse, - final int numChromosomes) { + private void assignGenotype(final 
GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { final int numNewAltAlleles = allelesToUse.size() - 1; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 48f282901..1b3a4c0c0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - - final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele]; - -// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele]; Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 07f88c9e3..927fadd94 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -50,7 +50,7 @@ public abstract class AFCalc implements Cloneable { protected Logger logger = defaultLogger; private SimpleTimer callTimer = new SimpleTimer(); - private final AFCalcResultTracker resultTracker; + private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { @@ -62,7 +62,7 @@ public abstract class AFCalc implements Cloneable { this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } public void enableProcessLog(final File exactCallsLog) { @@ -83,10 +83,10 @@ public abstract class AFCalc implements Cloneable { public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); - if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); // reset the result, so we can store our new result there - resultTracker.reset(); + stateTracker.reset(); final VariantContext vcWorking = reduceScope(vc); @@ 
-100,10 +100,20 @@ public abstract class AFCalc implements Cloneable { return result; } - @Deprecated - protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { - resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); - return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors); + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() > 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); } // --------------------------------------------------------------------------- @@ -162,8 +172,8 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public AFCalcResultTracker getResultTracker() { - return resultTracker; + public StateTracker getStateTracker() { + return stateTracker; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index 7cacb2060..a65772444 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -83,8 +83,8 @@ public class AFCalcResult { if ( log10pNonRefByAllele == null ) throw 
new IllegalArgumentException("log10pNonRefByAllele cannot be null"); if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); - if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); - if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); this.alleleCountsOfMLE = alleleCountsOfMLE; this.nEvaluations = nEvaluations; @@ -147,7 +147,7 @@ public class AFCalcResult { * Due to computational / implementation constraints this may be smaller than * the actual list of alleles requested * - * @return a non-empty list of alleles used during genotyping + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele */ @Ensures({"result != null", "! 
result.isEmpty()"}) public List getAllelesUsedInGenotyping() { @@ -159,7 +159,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFEq0() { return log10PosteriorsOfAC[AF0]; } @@ -169,7 +169,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PosteriorOfAFGT0() { return log10PosteriorsOfAC[AF1p]; } @@ -179,7 +179,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFEq0() { return log10LikelihoodsOfAC[AF0]; } @@ -189,7 +189,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10LikelihoodOfAFGT0() { return log10LikelihoodsOfAC[AF1p]; } @@ -199,7 +199,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFEq0() { return log10PriorsOfAC[AF0]; } @@ -209,7 +209,7 @@ public class AFCalcResult { * * @return */ - @Ensures({"goodLog10Probability(result)"}) + @Ensures({"MathUtils.goodLog10Probability(result)"}) public double getLog10PriorOfAFGT0() { return log10PriorsOfAC[AF1p]; } @@ -263,7 +263,7 @@ public class AFCalcResult { * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping * @return the log10 probability that allele is segregating at this site */ - @Ensures("goodLog10Probability(result)") + @Ensures("MathUtils.goodLog10Probability(result)") public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { final Double log10pNonRef = log10pNonRefByAllele.get(allele); if ( log10pNonRef == null ) throw new 
IllegalArgumentException("Unknown allele " + allele); @@ -279,7 +279,7 @@ public class AFCalcResult { * @return freshly allocated log10 normalized posteriors vector */ @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") - @Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) @@ -287,29 +287,6 @@ public class AFCalcResult { return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); } - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - /** * Computes the offset into linear vectors indexed by alt allele for allele * @@ -331,14 +308,4 @@ public class AFCalcResult { else return index - 1; } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @return true if result is really well formed - */ - private static boolean goodLog10Probability(final double result) { - return result <= 0.0 && ! Double.isInfinite(result) && ! 
Double.isNaN(result); - } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java deleted file mode 100644 index 5c926a4d8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultTracker.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; - -import com.google.java.contract.Ensures; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - * - * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF? - */ -class AFCalcResultTracker { - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - protected double log10MLE; - protected double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int LIKELIHOODS_CACHE_SIZE = 5000; - private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE]; - private int currentLikelihoodsCacheIndex = 0; - protected Double log10LikelihoodsMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - private int[] AClimits; - - int nEvaluations = 0; - - /** - * The list of alleles actually used in computing the AF - */ - private List allelesUsedInGenotyping = null; - - /** - * Create a results object capability of storing results for calls with up to maxAltAlleles - * - * @param maxAltAlleles an integer >= 1 - */ - public AFCalcResultTracker(final int maxAltAlleles) { - if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); - - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - - reset(); - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MLE - * - * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, - * starting from index 0 (i.e., the first alt allele is at 0). 
The vector is always - * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values - * are meaningful. - * - * @return a vector with allele counts, not all of which may be meaningful - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - /** - * Returns a vector with maxAltAlleles values containing AC values at the MAP - * - * @see #getAlleleCountsOfMLE() for the encoding of results in this vector - * - * @return a non-null vector of ints - */ - @Ensures("result != null") - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return - */ - public double getLog10LikelihoodOfAFNotZero() { - if ( log10LikelihoodsMatrixSum == null ) { - if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have - log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO; - else - log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - } - return log10LikelihoodsMatrixSum; - } - - public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { - return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY); - } - - /** - * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should - * - * @return - */ - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - /** - * TODO -- eric what is this supposed to return? 
my unit tests don't do what I think they should - * - * @return - */ - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { - final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; - final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation - final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); - for ( int i = 0; i < subACOfMLE.length; i++ ) { - final Allele allele = allelesUsedInGenotyping.get(i+1); - final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was - log10pNonRefByAllele.put(allele, log10PNonRef); - } - - return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); - } - - // -------------------------------------------------------------------------------- - // - // Protected mutational methods only for use within the calculation models themselves - // - // -------------------------------------------------------------------------------- - - /** - * Reset the data in this results object, so that it can be used in a subsequent AF calculation - * - * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer - */ - protected void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentLikelihoodsCacheIndex = 0; 
- log10LikelihoodsMatrixSum = null; - allelesUsedInGenotyping = null; - nEvaluations = 0; - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - } - - /** - * Tell this result we used one more evaluation cycle - */ - protected void incNEvaluations() { - nEvaluations++; - } - - protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToLikelihoodsCache(log10LofK); - - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToLikelihoodsCache(final double log10LofK) { - // add to the cache - log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) { - final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex); - Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY); - log10LikelihoodsMatrixValues[0] = temporarySum; - currentLikelihoodsCacheIndex = 1; - } - } - - protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - 
Arrays.fill(alleleCountsOfMAP, 0); - } - } - - protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { - if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) - throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); - - this.allelesUsedInGenotyping = allelesUsedInGenotyping; - } - - protected void setAClimits(int[] AClimits) { - this.AClimits = AClimits; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 49915c515..6b345dcf5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -36,10 +36,6 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } - protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { @@ -60,29 +56,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet); - // keep processing while we have AC conformations that need to be calculated - final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker()); - while ( !ACqueue.isEmpty() ) { - getResultTracker().incNEvaluations(); // keep track of the number of evaluations + getStateTracker().incNEvaluations(); // keep track of the number of evaluations // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( 
stateTracker.withinMaxACs(set.getACcounts()) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker()); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); - // adjust max likelihood seen if needed - stateTracker.update(log10LofKs, set.getACcounts()); - - // clean up memory - indexesToACset.remove(set.getACcounts()); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } - return resultFromTracker(vc, log10AlleleFrequencyPriors); + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } @Override @@ -153,23 +141,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker); + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? 
- if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { + if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -188,7 +174,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -213,9 +199,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -223,8 +209,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. 
- private void updateACset(final StateTracker stateTracker, - final int[] newSetCounts, + private void updateACset(final int[] newSetCounts, final int numChr, final ExactACset dependentSet, final int PLsetIndex, @@ -246,8 +231,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { private void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AFCalcResultTracker resultTracker) { + final double[] log10AlleleFrequencyPriors) { set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -258,8 +242,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; - resultTracker.setLog10LikelihoodOfAFzero(log10Lof0); - resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; } @@ -281,14 +265,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 093bf47d5..88f5e06e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -16,10 +16,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } - protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) { - return new StateTracker(); - } - @Override protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 19e253277..3eb32d35e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -1,35 +1,85 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + /** - * Keeps track of the best state seen by the exact model and the max states to visit - * allowing us to abort the search before we visit the entire matrix of AC x samples + * Keeps track of the state information during the 
exact model AF calculation. + * + * Tracks things like the MLE and MAP AC values, their corresponding likelihood and posterior + * values, the likelihood of the AF == 0 state, and the number of evaluations needed + * by the calculation to compute the P(AF == 0) */ final class StateTracker { - public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - final private int[] maxACsToConsider; - - private ExactACcounts ACsAtMax = null; - private double maxLog10L = Double.NEGATIVE_INFINITY; - - public StateTracker() { - this(null); - } - - public StateTracker(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; + protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 /** - * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state - * - * @param log10LofKs the likelihood of our current configuration state + * These variables are intended to contain the MLE and MAP (and their corresponding allele counts) + * of the site over all alternate alleles */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > getMaxLog10L()) { - this.setMaxLog10L(log10LofKs); - this.ACsAtMax = ACs; - } + protected double log10MLE; + protected double log10MAP; + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. 
+ */ + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; + + /** + * A vector of log10 likelihood values seen, for future summation. When the size of the + * vector is exceeded -- because we've pushed more posteriors than there's space to hold + * -- we simply sum up the existing values, make that the first value, and continue. + */ + private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE]; + private static final int LIKELIHOODS_CACHE_SIZE = 5000; + private int log10LikelihoodsForAFGt0CacheIndex = 0; + + /** + * The actual sum of the likelihoods. Null if the sum hasn't been computed yet + */ + protected Double log10LikelihoodsForAFGt0Sum = null; + + /** + * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + */ + private double log10LikelihoodOfAFzero = 0.0; + + /** + * The number of evaluations we've gone through in the AFCalc + */ + private int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capable of storing results for calls with up to maxAltAlleles + * + * @param maxAltAlleles an integer >= 1 + */ + public StateTracker(final int maxAltAlleles) { + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 0, saw " + maxAltAlleles); + + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + + reset(); } /** @@ -39,58 +89,200 @@ final class StateTracker { * @param log10LofK the log10 likelihood of the configuration we're considering analyzing * @return true if the configuration cannot meaningfully contribute to our likelihood sum */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + private boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < log10MLE - 
MAX_LOG10_ERROR_TO_STOP_EARLY; } /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + private boolean isLowerAC(final ExactACcounts otherACs) { + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < otherACcounts.length; i++ ) { + if ( alleleCountsOfMLE[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + /** + * Should we stop exploring paths from ACs, given its log10LofK * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise + * @param log10LofK the log10LofK of these ACs + * @param ACs the ACs of this state + * @return true if there's no reason to continue with subpaths of AC, or false otherwise */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; + protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) { + return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs)); + } - final int[] otherACcounts = otherACs.getCounts(); + @Ensures("result != null") + protected int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; + } - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; + @Ensures("result >= 0") + protected int getnEvaluations() { + return nEvaluations; } /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + * Returns the likelihoods summed across all AC values for AC > 0 + * + * @return */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null 
) - return true; + private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + if ( log10LikelihoodsForAFGt0Sum == null ) { + if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have + log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; + else + log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + } + return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 0.0 : Double.POSITIVE_INFINITY); + } - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); + /** + * @return + */ + private double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; + /** + * Convert this state to a corresponding AFCalcResult. + * + * Assumes that the values in this state have been filled in with meaningful values during the calculation. + * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful + * values, etc. 
+ * + * @param log10PriorsByAC + * + * @return + */ + @Requires("allelesUsedInGenotyping != null") + protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { + final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); + final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); + + // TODO -- replace with more meaningful computation + // TODO -- refactor this calculation into the ref calculation + final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); + for ( int i = 0; i < subACOfMLE.length; i++ ) { + final Allele allele = allelesUsedInGenotyping.get(i+1); + final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was + log10pNonRefByAllele.put(allele, log10PNonRef); } - return true; + return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele); } - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + // -------------------------------------------------------------------------------- + // + // Protected mutational methods only for use within the calculation models themselves + // + // -------------------------------------------------------------------------------- + + /** + * Reset the data in this results object, so that it can be used in a subsequent AF calculation + * + * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer + */ + protected void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED; + log10LikelihoodsForAFGt0CacheIndex = 0; + log10LikelihoodsForAFGt0Sum = null; + 
allelesUsedInGenotyping = null; + nEvaluations = 0; + Arrays.fill(alleleCountsOfMLE, 0); + Arrays.fill(alleleCountsOfMAP, 0); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); } - public double getMaxLog10L() { - return maxLog10L; + /** + * Tell this result we used one more evaluation cycle + */ + protected void incNEvaluations() { + nEvaluations++; } - public void setMaxLog10L(double maxLog10L) { - this.maxLog10L = maxLog10L; + /** + * Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state + * + * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") + protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + addToLikelihoodsCache(log10LofK); + + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length); + } + } + + /** + * Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state + * + * @param log10PofK the posterior of our current configuration state + * @param alleleCountsForK the allele counts for this state + */ + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") + protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { + if ( log10PofK > log10MAP ) { + log10MAP = log10PofK; + System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) + private void addToLikelihoodsCache(final double log10LofK) 
{ + // add to the cache + log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) { + final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); + Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); + log10LikelihoodsForAFGt0[0] = temporarySum; + log10LikelihoodsForAFGt0CacheIndex = 1; + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } + + /** + * Set the list of alleles used in genotyping + * + * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference + */ + @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"}) + protected void setAllelesUsedInGenotyping(List allelesUsedInGenotyping) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() ) + throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty"); + if ( allelesUsedInGenotyping.get(0).isNonReference() ) + throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele"); + + this.allelesUsedInGenotyping = allelesUsedInGenotyping; } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3740d5d7c..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1194,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. 
* The number of observations does not have to be known in advance, and can be also very big (so that From 695cf8367598d3cb312e590b2cb3fb6a0cd65faf Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 07:38:32 -0400 Subject: [PATCH 38/54] More docs and contracts for classes in genotyper.afcalc -- Future protection of the output of GeneralPloidyExactAFCalc, which produces in some cases bad likelihoods (positive values) --- .../afcalc/GeneralPloidyExactAFCalc.java | 6 ++-- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 28 +++++++++++++++++-- .../IndependentAllelesDiploidExactAFCalc.java | 14 ++++++++-- .../afcalc/OriginalDiploidExactAFCalc.java | 2 +- .../genotyper/afcalc/StateTracker.java | 12 +++----- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 3916c2549..9c7883ab8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -447,14 +447,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); - getStateTracker().updateMLEifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? 
log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - getStateTracker().updateMAPifNeeded(Double.isInfinite(log10LofK) ? MathUtils.LOG10_P_OF_ZERO : log10LofK, altCounts); + // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY + getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); return log10LofK; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index 927fadd94..e3abdeb24 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -53,6 +53,16 @@ public abstract class AFCalc implements Cloneable { private final StateTracker stateTracker; private ExactCallLogger exactCallLogger = null; + /** + * Create a new AFCalc object capable of calculating the prob. 
that alleles are + * segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels + * for indels for samples with ploidy + * + * @param nSamples number of samples, must be > 0 + * @param maxAltAlleles maxAltAlleles for SNPs + * @param maxAltAllelesForIndels for indels + * @param ploidy the ploidy, must be > 0 + */ protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); @@ -65,10 +75,20 @@ public abstract class AFCalc implements Cloneable { this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); } + /** + * Enable exact call logging to file + * + * @param exactCallsLog the destination file + */ public void enableProcessLog(final File exactCallsLog) { exactCallLogger = new ExactCallLogger(exactCallsLog); } + /** + * Use this logger instead of the default logger + * + * @param logger + */ public void setLogger(Logger logger) { this.logger = logger; } @@ -109,7 +129,7 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors the priors by AC vector * @return a AFCalcResult describing the result of this calculation */ - @Requires("stateTracker.getnEvaluations() > 0") + @Requires("stateTracker.getnEvaluations() >= 0") @Ensures("result != null") protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); @@ -144,11 +164,13 @@ public abstract class AFCalc implements Cloneable { * @param log10AlleleFrequencyPriors priors * @return a AFCalcResult object describing the results of this calculation */ - // TODO -- add consistent requires among args + @Requires({"vc != null", 
"log10AlleleFrequencyPriors != null"}) protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors); /** + * Subset VC to the just allelesToUse, updating genotype likelihoods + * * Must be overridden by concrete subclasses * * @param vc variant context with alleles and genotype likelihoods @@ -172,7 +194,7 @@ public abstract class AFCalc implements Cloneable { return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); } - public StateTracker getStateTracker() { + protected StateTracker getStateTracker() { return stateTracker; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index c0edee291..2f85a5246 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -303,7 +303,13 @@ import java.util.*; /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * - * TODO -- add more docs + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ @@ -350,8 +356,10 @@ import java.util.*; }; return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), - MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0 - 
MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index 88f5e06e6..dea38e46c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -11,7 +11,7 @@ import java.util.Map; /** * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ -public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { +class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 3eb32d35e..301891a99 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -129,9 +129,7 @@ final class StateTracker { } /** - * Returns the likelihoods summed across all AC values for AC > 0 - * - * @return + * @return the likelihoods summed across all AC values for AC > 0 */ private double getLog10LikelihoodOfAFNotZero(final boolean 
capAt0) { if ( log10LikelihoodsForAFGt0Sum == null ) { @@ -144,7 +142,7 @@ final class StateTracker { } /** - * @return + * @return the log10 likelihood of AF == 0 */ private double getLog10LikelihoodOfAFzero() { return log10LikelihoodOfAFzero; @@ -157,9 +155,9 @@ final class StateTracker { * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful * values, etc. * - * @param log10PriorsByAC + * @param log10PriorsByAC the priors by AC * - * @return + * @return an AFCalcResult summarizing the final results of this calculation */ @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { @@ -167,8 +165,6 @@ final class StateTracker { final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); - // TODO -- replace with more meaningful computation - // TODO -- refactor this calculation into the ref calculation final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); for ( int i = 0; i < subACOfMLE.length; i++ ) { final Allele allele = allelesUsedInGenotyping.get(i+1); From 326f42927050ad326ec41a64f6d8c72c17ecc22f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:06:41 -0400 Subject: [PATCH 39/54] Bugfixes to make new AFCalc system pass integrationtests -- GeneralPloidyExactAFCalc turns -Infinity values into -Double.MAX_VALUE, so our calculations pass unit tests -- Bugfix for GeneralPloidyGenotypeLikelihoodsCalculationModel, return a null VC when the only allele we get from our final alleles to use method is the reference base -- Fix calculation of reference posteriors when P(AF == 0) = 0.0 and P(AF == 0) = X for some meaningful value of X. 
Added unit test to ensure this behavior is correct -- Fix horrible sorting bug in IndependentAllelesDiploidExactAFCalc that applied the theta^N priors in the wrong order. Add contract to ensure this doesn't ever happen again -- Bugfix in GLBasedSampleSelector, where VCs without any polymorphic alleles were being sent to the exact model -- --- ...dyGenotypeLikelihoodsCalculationModel.java | 2 +- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +-- .../afcalc/AFCalcResultUnitTest.java | 5 ++++ .../genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../IndependentAllelesDiploidExactAFCalc.java | 28 +++++++++++++++++-- .../GLBasedSampleSelector.java | 3 ++ 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 4c20700ac..2522fc16e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G // find the alternate allele(s) that we should be using final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); - if (alleles == null || alleles.isEmpty()) + if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference())) return null; // start making the VariantContext final GenomeLoc loc = ref.getLocus(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 9c7883ab8..51b7fb633 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(MathUtils.goodLog10Probability(log10LofK) ? log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(MathUtils.goodLog10Probability(log10LofK) ? 
log10LofK : MathUtils.LOG10_P_OF_ZERO, altCounts); + getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index 1070642e9..cbe2eb268 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -56,6 +56,11 @@ public class AFCalcResultUnitTest extends BaseTest { tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})}); } + // test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost + for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) { + tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})}); + } + return tests.toArray(new Object[][]{}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 25df0f6d2..9c6c8e8ab 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -185,7 +185,7 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true, dataProvider = "badGLs") + @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") public void testBadGLs(GetGLsTest cfg) { testResultSimple(cfg); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 2f85a5246..ea89d3802 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -100,7 +100,7 @@ import java.util.*; private final static class CompareAFCalcResultsByPNonRef implements Comparator { @Override public int compare(AFCalcResult o1, AFCalcResult o2) { - return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); } } @@ -313,6 +313,7 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ + @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -321,8 +322,9 @@ import java.util.*; final double[] log10PriorsOfAC = new double[2]; final Map log10pNonRefByAllele = new HashMap(nAltAlleles); - // this value is a sum in log space + // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); @@ -337,6 +339,7 @@ import java.util.*; // the AF > 0 case requires us to store the normalized likelihood for later summation if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PosteriorOfACGt0Sum += 
sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); @@ -348,7 +351,16 @@ import java.util.*; // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 - final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + final double[] log10LikelihoodsOfAC = new double[] { // L + prior = posterior => L = poster - prior log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], @@ -362,4 +374,14 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), log10pNonRefByAllele, sortedResultsWithThetaNPriors); } + + private static boolean sortedByPosteriorGT(final List sortedVCs) { + double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); + for ( final AFCalcResult vc : sortedVCs ) { + if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + return false; + lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); + } + return true; + } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index f8c871e7d..48a2d2700 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -48,6 +48,9 @@ public class GLBasedSampleSelector extends SampleSelector { // first subset to the samples VariantContext subContext = vc.subContextFromSamples(samples); + if ( ! subContext.isPolymorphicInSamples() ) + return false; + // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { From eaffb814d381e19aa8d522338f726c481634ab98 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 18 Oct 2012 13:34:51 -0400 Subject: [PATCH 40/54] IndependentExactAFCalc is now the default EXACT model implementation -- Changed UG / HC to use this one via the StandardCallerArgumentCollection -- Update the AFCalcFactory.Calculation to have a getDefault() value instead of having a duplicate entry in the enums --- .../gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java | 2 +- .../arguments/StandardCallerArgumentCollection.java | 2 +- .../gatk/walkers/genotyper/afcalc/AFCalcFactory.java | 11 +++++------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index 9c6c8e8ab..ab967fbe1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ 
-495,7 +495,7 @@ public class AFCalcUnitTest extends BaseTest { // list of all high-quality models in the system final List models = Arrays.asList( - AFCalcFactory.Calculation.EXACT, + AFCalcFactory.Calculation.getDefaultModel(), AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT); diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index a511364f9..84dfa694b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -105,5 +105,5 @@ public class StandardCallerArgumentCollection { */ @Advanced @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT; + public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 7d67815cf..80de555ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -24,15 +24,12 @@ public class AFCalcFactory { * the needs of the request (i.e., considering ploidy). */ public enum Calculation { - /** The default implementation */ - EXACT(ReferenceDiploidExactAFCalc.class, 2, -1), - - /** reference implementation of multi-allelic EXACT model */ - EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), - /** expt. 
implementation -- for testing only */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1), + /** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */ + EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1), + /** original biallelic exact model, for testing only */ EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2), @@ -60,6 +57,8 @@ public class AFCalcFactory { return (requiredPloidy == -1 || requiredPloidy == requestedPloidy) && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles); } + + public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; } } private static final Map> afClasses; From 0fb82745077af0d1795253a1b3c2e5c03551645d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 17:11:31 -0400 Subject: [PATCH 41/54] Fix contract on sorting of AFCalcResults -- The thing that must be sorted is the pre-theta^N list, which is not checked in the routine that applies the theta^N prior. 
--- .../IndependentAllelesDiploidExactAFCalc.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index ea89d3802..804b560b4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -285,10 +285,14 @@ import java.util.*; // sort the results, so the most likely allele is first Collections.sort(sorted, compareAFCalcResultsByPNonRef); + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); for ( int i = 0; i < sorted.size(); i++ ) { - final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; @@ -313,7 +317,6 @@ import java.util.*; * * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently */ - @Requires("sortedByPosteriorGT(sortedResultsWithThetaNPriors)") protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, final List sortedResultsWithThetaNPriors) { int nEvaluations = 0; @@ -374,14 +377,4 @@ import java.util.*; MathUtils.normalizeFromLog10(log10PriorsOfAC, true), 
log10pNonRefByAllele, sortedResultsWithThetaNPriors); } - - private static boolean sortedByPosteriorGT(final List sortedVCs) { - double lastPosteriorGt0 = sortedVCs.get(0).getLog10PosteriorOfAFGT0(); - for ( final AFCalcResult vc : sortedVCs ) { - if ( vc.getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) - return false; - lastPosteriorGt0 = vc.getLog10PosteriorOfAFGT0(); - } - return true; - } } From 0fcd358ace4c232236e577d53627ea06959b4766 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 19:33:47 -0400 Subject: [PATCH 42/54] Original EXACT model implementation lives, providing another reference (bi-allelic only) EXACT model -- Potentially a very fast implementation (it's very clean) but restricted to the biallelic case -- A starting point for future bi-allelic only optimized (logless) or generalized (bi-allelic general ploidy) implementations -- Added systematic unit tests covering this implementation, and comparing it to others -- Uncovered a nasty normalization bug in StateTracker that was capping our likelihoods at 0, even after summing up multiple likelihoods, which is just not safe to do and was causing us to lose likelihood in some cases -- Removed the restriction that a likelihood be <= 0 in StateTracker, and the protection for these cases in GeneralPloidyExactAFCalc which just wasn't right --- .../afcalc/GeneralPloidyExactAFCalc.java | 4 +- .../genotyper/afcalc/AFCalcUnitTest.java | 149 ++++++++++++++---- .../afcalc/OriginalDiploidExactAFCalc.java | 38 +++-- .../genotyper/afcalc/StateTracker.java | 14 +- 4 files changed, 149 insertions(+), 56 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 51b7fb633..2b247430c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -448,7 +448,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { // update the MLE if necessary final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMLEifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); // apply the priors over each alternate allele for (final int ACcount : altCounts ) { @@ -456,7 +456,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { log10LofK += log10AlleleFrequencyPriors[ACcount]; } // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY - getStateTracker().updateMAPifNeeded(Math.max(Math.min(log10LofK, 0.0), -Double.MAX_VALUE), altCounts); + getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts); return log10LofK; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ab967fbe1..ef4318a40 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; @@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( 
final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest { new GetGLsTest(model, 1, genotypes, priors, priorName); // tri-allelic - if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) ) + if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) ) for ( List genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) ) new GetGLsTest(model, 2, genotypes, priors, priorName); } @@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest { return GetGLsTest.getTests(GetGLsTest.class); } - @DataProvider(name = "badGLs") - public Object[][] createBadGLs() { - final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); - final int nSamples = genotypes.size(); +// @DataProvider(name = "badGLs") +// public Object[][] createBadGLs() { +// final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2); +// final int nSamples = genotypes.size(); +// +// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); +// +// final int nPriorValues = 2*nSamples+1; +// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors +// for ( AFCalc model : Arrays.asList(indCalc) ) { +// final String priorName = "flat"; +// new GetGLsTest(model, 2, genotypes, priors, priorName); +// } +// +// return 
GetGLsTest.getTests(GetGLsTest.class); +// } - final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4); - - final int nPriorValues = 2*nSamples+1; - final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors - for ( AFCalc model : Arrays.asList(indCalc) ) { - final String priorName = "flat"; - new GetGLsTest(model, 2, genotypes, priors, priorName); - } - - return GetGLsTest.getTests(GetGLsTest.class); - } +// +// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") +// public void testBadGLs(GetGLsTest cfg) { +// testResultSimple(cfg); +// } @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs") public void testBiallelicGLs(GetGLsTest cfg) { @@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest { testResultSimple(cfg); } - @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs") - public void testBadGLs(GetGLsTest cfg) { - testResultSimple(cfg); - } - private static class NonInformativeData { final Genotype nonInformative; final List called; @@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs( - Arrays.asList( - AFCalcFactory.Calculation.EXACT_REFERENCE, - AFCalcFactory.Calculation.EXACT_INDEPENDENT, - AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY - ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors for ( AFCalc model : calcs ) { + if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) + continue; + final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat"); for ( int rotation = 0; rotation < nSamples; rotation++ 
) { @@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest { "Actual pNonRef not within tolerance " + tolerance + " of expected"); } + @DataProvider(name = "PNonRefBiallelicSystematic") + public Object[][] makePNonRefBiallelicSystematic() { + List tests = new ArrayList(); + + final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000); + final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); + + for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) { + + if ( false ) { // for testing only + tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))}); + } else { + if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK + + // test all combinations of PLs for 1 sample + for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + + + final List> smallDiploidPLs = new LinkedList>(); + for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) { + for ( int i = 0; i < 2; i++ ) { + List pls = new ArrayList(Collections.nCopies(3, nonRefPL)); + pls.set(i, 0); + smallDiploidPLs.add(pls); + } + } + + for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) { + tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + final List> removeBadPLs(List> listOfPLs) { + List> clean = new LinkedList>(); + + for ( final List PLs : listOfPLs ) { + int x = PLs.get(0); + boolean bad = false; + for ( int pl1 : PLs ) + if ( pl1 > x ) + bad = true; + else + x = pl1; + if ( ! 
bad ) clean.add(PLs); + } + + return clean; + } + + private List toGenotypes(final List> PLsPerSample) { + final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List genotypes = new ArrayList(PLsPerSample.size()); + + for ( final List PLs : PLsPerSample ) { + final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3])); + final int min = MathUtils.arrayMin(pls); + for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; + genotypes.add(makePL(nocall, pls)); + } + + return genotypes; + } + + @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic") + private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { + //logger.warn("Running " + modelType + " with " + genotypes); + final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human); + final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human); + + final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C)); + vcb.genotypes(genotypes); + + final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); + + final double tolerance = 1e-3; + Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance, + "Actual pNonRef not within tolerance " + tolerance + " of expected"); + Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(), + "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE())); + } + // -------------------------------------------------------------------------------- // // Test priors diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index dea38e46c..ac4634666 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -20,15 +21,22 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; - final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); - final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)}; + final double log10LikelihoodAFGt0 = lastK == 0 ? 
MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); - final double pNonRef = lastK > 0 ? 0.0 : -1000.0; - final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef); + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); - return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele); + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); } /** @@ -68,11 +76,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { } } - public int linearExact(final VariantContext vc, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyLikelihoods, - double[] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), false); + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -81,7 +89,7 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { double maxLog10L = Double.NEGATIVE_INFINITY; boolean done = false; - int lastK = -1; + int lastK = -1, mleK = -1; for (int k=0; k <= numChr && ! 
done; k++ ) { final double[] kMinus0 = logY.getkMinus0(); @@ -127,7 +135,11 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { // can we abort early? lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); done = true; @@ -136,6 +148,6 @@ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { logY.rotate(); } - return lastK; + return new Pair(lastK, mleK); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java index 301891a99..b82ec1d29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -131,14 +131,14 @@ final class StateTracker { /** * @return the likelihoods summed across all AC values for AC > 0 */ - private double getLog10LikelihoodOfAFNotZero(final boolean capAt0) { + private double getLog10LikelihoodOfAFNotZero() { if ( log10LikelihoodsForAFGt0Sum == null ) { if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO; else log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex); } - return Math.min(log10LikelihoodsForAFGt0Sum, capAt0 ? 
0.0 : Double.POSITIVE_INFINITY); + return log10LikelihoodsForAFGt0Sum; } /** @@ -162,7 +162,7 @@ final class StateTracker { @Requires("allelesUsedInGenotyping != null") protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) { final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1); - final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)}; + final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true); final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true); final Map log10pNonRefByAllele = new HashMap(allelesUsedInGenotyping.size()); @@ -210,7 +210,7 @@ final class StateTracker { * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10LofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MLE == Math.max(log10LofK, log10MLE)") protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { addToLikelihoodsCache(log10LofK); @@ -227,7 +227,7 @@ final class StateTracker { * @param log10PofK the posterior of our current configuration state * @param alleleCountsForK the allele counts for this state */ - @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0", "MathUtils.goodLog10Probability(log10PofK)"}) + @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"}) @Ensures("log10MAP == Math.max(log10PofK, log10MAP)") protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) { if ( log10PofK > log10MAP ) { @@ -236,7 +236,6 @@ final 
class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LofK)"}) private void addToLikelihoodsCache(final double log10LofK) { // add to the cache log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK; @@ -250,7 +249,6 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { @@ -259,7 +257,7 @@ final class StateTracker { } } - @Requires({"MathUtils.goodLog10Probability(log10LikelihoodOfAFzero)"}) + @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"}) protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { if ( log10PosteriorOfAFzero > log10MAP ) { log10MAP = log10PosteriorOfAFzero; From 6b6caf8e3a299553daa1dc5b64b33e0de14144a3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 19 Oct 2012 14:42:00 -0400 Subject: [PATCH 43/54] Bugfix for indel DP calculations using reduced reads -- Adding tests for SNP and indel calling on reduced BAM --- ...delGenotypeLikelihoodsCalculationModel.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index e0ffb2ba6..1761f9cec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if 
(p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 72724e46a..3bb630d18 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -452,4 +452,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); executeTest("test calling on a ReducedRead BAM", spec); } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", ""); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", ""); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } } From d21e42608ac43b58ade9984436d8c2b428eb6037 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:10:43 -0400 Subject: [PATCH 44/54] Updating integration tests for minor changes due to switching to EXACT_INDEPENDENT model by default --- ...GenotyperGeneralPloidyIntegrationTest.java | 6 +-- .../HaplotypeCallerIntegrationTest.java | 8 ++-- .../UnifiedGenotyperIntegrationTest.java | 44 +++++++++---------- .../SelectVariantsIntegrationTest.java | 4 +- .../NanoSchedulerIntegrationTest.java | 2 +- 5 files changed, 32 insertions(+), 32 
deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 652489a71..b447a1113 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -60,12 +60,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592"); } @Test(enabled = true) @@ -80,7 +80,7 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83"); } @Test(enabled 
= true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index a441e6c77..870967f09 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "75013fa6a884104f0b1797502b636698"); + HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9"); } @Test @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "3cd3363976b1937d801f9f82996f4abe"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,7 +64,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - 
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3bb630d18..044d70fc2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("b3abf320f7d02d0e3b2883833419130e")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc")); + Arrays.asList("afb8768f31ab57eb43f75c1115eadc99")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + 
"multiallelic.snps.intervals", 1, - Arrays.asList("48b4f4b05461be276bffc91350f08cbc")); + Arrays.asList("73c9b926c5e971a113de347a64fdcf20")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("04affcc9d720ee17bc221759707e0cd2")); + Arrays.asList("7f8d13690cb7d4173787afa00c496f12")); executeTest("test reverse trim", spec); } @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("112e7bedfd284d4d9390aa006118c733")); + Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19")); executeTest("test mismatched PLs", spec); } @@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2"; + private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10"; @Test public void testCompressedOutput() { @@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "360d1274c1072a1ae9868e4e106c2650"; + String md5 = "d408b4661b820ed86272415b8ea08780"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( 
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c")); + Arrays.asList("839ecd30d354a36b5dfa2b5e99859765")); executeTest("test min_base_quality_score 26", spec); } @@ -182,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8800c58715c2bb434b69e1873cb77de6"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "639df9f4c029792ac2e46069efc82b20"); } private void testOutputParameters(final String args, final String md5) { @@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" ); + testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" ); + testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" ); } private void testHeterozosity(final double arg, final String md5) { @@ -268,7 +268,7 @@ public class UnifiedGenotyperIntegrationTest 
extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("950fb032cc9902ae48bd21f272d2fd52")); + Arrays.asList("98058fc913b61c22d44875da1f5ea89c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("b3df138254ed141b61a758df87757e0d")); + Arrays.asList("650c53774afacfc07a595675e8cdde17")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("63fd9488daadd4baaef0a98f02916996")); + Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("52b5a432092995c92fe71e1942689ba8")); + Arrays.asList("5f2721c3323de5390d2d47446139f32b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("863ee56b3594f09795644127f2f9539f")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 
1, - Arrays.asList("503ca1b75cc7b2679eaa80f7b5e7ef1c")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("945a2f994eaced8efdf8de24b58f2680")); + Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -449,18 +449,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("bbf16e1873e525ee5975021cfb8988cf")); + Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", ""); + testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", ""); + testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 58d3677c7..f29ac8cad 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn 
NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRegenotype--" + testFile, spec); @@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("549321a2543608f214ab4893ab478be6") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index c1b28314c..37614f15f 100755 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct }); + tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); } return tests.toArray(new Object[][]{}); From 5296de8251ddb1cd9036bf9b7e57fc362a78f92b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 08:20:39 -0400 Subject: [PATCH 45/54] Fix UnifiedArgumentCollection constructor logic error -- The old way of overloading constructors and calling super didn't work (might have been a consequence of merge). 
This is the right way to do the copy constructor with the call to super() --- .../genotyper/UnifiedArgumentCollection.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index abf0b4420..e50f25fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,15 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; -<<<<<<< HEAD -import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.utils.pairhmm.PairHMM; -======= ->>>>>>> 19181ee... Moving pnrm to UnifiedArgumentCollection so it's available with the HaplotypeCaller import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) @@ -182,14 +177,30 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. 
Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; - public UnifiedArgumentCollection() { } - - public UnifiedArgumentCollection(final StandardCallerArgumentCollection SCAC) { - super(SCAC); + /** + * Create a new UAC with defaults for all UAC arguments + */ + public UnifiedArgumentCollection() { + super(); } - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + /** + * Create a new UAC based only on the information in our super-class scac and defaults for all UAC arguments + * @param scac + */ + public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { + super(scac); + } + + /** + * Create a new UAC with all parameters having the values in uac + * + * @param uac + */ public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! + super(uac); + this.GLmodel = uac.GLmodel; this.AFmodel = uac.AFmodel; this.PCR_error = uac.PCR_error; @@ -215,6 +226,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; + // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } From eb6c9a1a79b8979353d85723d26138be2d52ff6a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 09:57:31 -0400 Subject: [PATCH 46/54] Disable EfficiencyMonitoringThreadFactoryUnitTest -- This is no longer a core GATK activity, and the tests need to run for so long (2 min each) that it's just too painful to run them. Should be re-enabled if we come to care about this capability again, or if we can run these tests all in parallel in the future. 
--- .../threading/EfficiencyMonitoringThreadFactoryUnitTest.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 7381bebc4..c072c808d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) + // NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads + // aren't a core GATK feature any longer. Should be reabled if we come to care about this capability again + // in the future, or we can run these in parallel + @Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); From 9f2851d769e3f14b19f4e92c07c25d4b4689e250 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 21 Oct 2012 20:23:11 -0400 Subject: [PATCH 47/54] Updating UnifiedGenotyperGeneralPloidyIntegrationTest following rebasing -- Created a JIRA ticket https://jira.broadinstitute.org/browse/GSA-623 for Guillermo to look at the differences as the multi-allelic nature of many sites seems to change with the new more protected infrastructure. 
This may be due to implementation issues in the pooled caller, problems with my interface, or could be a genuine improvement. --- .../UnifiedGenotyperGeneralPloidyIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index b447a1113..ae3befe66 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -70,12 +70,12 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","06a512271631c5b511314a2618de82d7"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","36a383adfdbf1f59656138b538a9920d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8"); } @Test(enabled = true) From ccae6a5b9264350050695de05dad098884ea99f9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 22 Oct 2012 11:48:34 -0400 Subject: [PATCH 48/54] Fixed the RR bug I (knowingly) introduced last week: turns out we can't trust a context size's worth of data from the previous marking. I think Mauricio warned me about this but I forgot. 
--- .../gatk/walkers/compression/reducereads/SlidingWindow.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 63524ae82..32abe8ef6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -276,7 +276,10 @@ public class SlidingWindow { final int windowHeaderStartLocation = getStartLocation(windowHeader); final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; - final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion); + + // copy over as many bits as we can from the previous calculation. Note that we can't trust the + // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. + final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize); // update the iterator to the correct position From 97dc3664c9ca9bd42d98e1a236219bd4e849fa26 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 22 Oct 2012 12:04:23 -0400 Subject: [PATCH 49/54] Fixed yet another NPE related to the ArgumentTypeDescriptor vs. ArgumentMatchValue. Added integration test based on GSA-621. 
--- .../gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java | 2 +- .../genotyper/UnifiedGenotyperIntegrationTest.java | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java index 43350ccc1..f521c959d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { ? new VariantContextWriterStub(engine, writerFile, argumentSources) : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - stub.setCompressed(isCompressed(writerFileName.asString())); + stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches)); stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches)); stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 044d70fc2..3a2d6f697 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,7 @@ import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; // ********************************************************************************** 
// @@ -18,6 +19,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -175,6 +177,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test using comp track", spec); } + @Test + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + @Test public void testOutputParameterSitesOnly() { testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311"); From 90f59803fd8c02bf7c2c38f3e909b72ab2c35cf8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 22 Oct 2012 09:58:43 -0400 Subject: [PATCH 50/54] MaxAltAlleles now defaults to 6, no more MaxAltAllelesForIndels -- Updated StandardCallerArgumentCollection to remove MaxAltAllelesForIndels. 
Previous argument is deprecated with meaningful doc message for people to use maxAltAlleles -- All constructors, factory methods, and test builders and their users updated to provide just a single argument -- Updating MD5s for integration tests that change due to genotyping more alleles -- Adding more alleles to genotyping results in slight changes in the QUAL value for multi-allelic loci where one or more alleles aren't polymorphic. That's simply due to the way that alternative hypotheses contribute as reference evidence against each true allele. The effect can be large (new qual = old qual / 2 in one case here). -- If we want more precision in our estimates we could decide (Eric, should we discuss?) to actually separately do a discovery phase in the genotyping, eliminate all variants not considered polymorphic, and then do a final round of calling to get the exact QUAL value for only those that are segregating. This would have the value of having the QUAL stay constant as more alleles are genotyped, at the cost of some code complexity increase and runtime. 
Might be worth it through --- .../genotyper/afcalc/AFCalcTestBuilder.java | 2 +- .../afcalc/GeneralPloidyExactAFCalc.java | 14 ++++----- .../genotyper/afcalc/AFCalcUnitTest.java | 4 +-- ...neralPloidyAFCalculationModelUnitTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 2 +- .../StandardCallerArgumentCollection.java | 13 ++++++--- .../gatk/walkers/genotyper/afcalc/AFCalc.java | 10 ++----- .../genotyper/afcalc/AFCalcFactory.java | 29 ++++++++----------- .../genotyper/afcalc/DiploidExactAFCalc.java | 14 ++++----- .../walkers/genotyper/afcalc/ExactAFCalc.java | 4 +-- .../IndependentAllelesDiploidExactAFCalc.java | 17 ++++++----- .../afcalc/OriginalDiploidExactAFCalc.java | 4 +-- .../afcalc/ReferenceDiploidExactAFCalc.java | 4 +-- .../GLBasedSampleSelector.java | 2 +- .../UnifiedGenotyperIntegrationTest.java | 4 +-- 15 files changed, 59 insertions(+), 66 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java index cfb67164d..6f3740ab3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java @@ -54,7 +54,7 @@ public class AFCalcTestBuilder { } public AFCalc makeModel() { - return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2); + return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2); } public double[] makePriors() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index 2b247430c..b248c8759 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java 
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -40,22 +40,20 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); this.ploidy = ploidy; } @Override protected VariantContext reduceScope(VariantContext vc) { - final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > maxAltAlleles) { - logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - final List alleles = new ArrayList(maxAltAlleles + 1); + final List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy)); + 
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); VariantContextBuilder builder = new VariantContextBuilder(vc); builder.alleles(alleles); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java index ef4318a40..2d346e548 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java @@ -125,7 +125,7 @@ public class AFCalcUnitTest extends BaseTest { final List triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2); for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) { - List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2); final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors @@ -215,7 +215,7 @@ public class AFCalcUnitTest extends BaseTest { samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative)); final int nSamples = samples.size(); - List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2, 2); + List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2); final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index 1b3a4c0c0..3df2f7883 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -140,7 +140,7 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, 1 + cfg.numAltAlleles, cfg.ploidy); + final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy); calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 870967f09..1d4fad19e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", 
"53caa950535749f99d3c5b9bb61c7b60"); } private void HCTestComplexVariants(String bam, String args, String md5) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 84dfa694b..aa3a1e6ac 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -55,20 +55,25 @@ public class StandardCallerArgumentCollection { * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6. */ @Advanced @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 3; + public int MAX_ALTERNATE_ALLELES = 6; /** * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * that you not play around with this parameter. + * + * This argument has been retired in GATK 2.2. 
Please specify just maxAltAlleles from now on */ - @Advanced - @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false) - public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2; + @Deprecated + @Hidden + @Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false) + public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1; /** * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java index e3abdeb24..f783267bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -45,7 +45,6 @@ public abstract class AFCalc implements Cloneable { protected final int nSamples; protected final int maxAlternateAllelesToGenotype; - protected final int maxAlternateAllelesForIndels; protected Logger logger = defaultLogger; @@ -60,19 +59,16 @@ public abstract class AFCalc implements Cloneable { * * @param nSamples number of samples, must be > 0 * @param maxAltAlleles maxAltAlleles for SNPs - * @param maxAltAllelesForIndels for indels * @param ploidy the ploidy, must be > 0 */ - protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( 
maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy); this.nSamples = nSamples; this.maxAlternateAllelesToGenotype = maxAltAlleles; - this.maxAlternateAllelesForIndels = maxAltAllelesForIndels; - this.stateTracker = new StateTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels)); + this.stateTracker = new StateTracker(maxAltAlleles); } /** @@ -191,7 +187,7 @@ public abstract class AFCalc implements Cloneable { // --------------------------------------------------------------------------- public int getMaxAltAlleles() { - return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels); + return maxAlternateAllelesToGenotype; } protected StateTracker getStateTracker() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java index 80de555ca..efb16101e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -91,7 +91,7 @@ public class AFCalcFactory { public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger) { - final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS); + final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES; if ( ! 
UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) { logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option"); final List supportingCalculations = new LinkedList(); @@ -109,7 +109,7 @@ public class AFCalcFactory { logger.info("Selecting model " + UAC.AFmodel); } - final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy); + final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy); if ( logger != null ) calc.setLogger(logger); if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog); @@ -126,7 +126,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final int nSamples) { - return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2); + return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2); } /** @@ -139,7 +139,7 @@ public class AFCalcFactory { * @return an initialized AFCalc */ public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) { - return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2); + return createAFCalc(calc, nSamples, maxAltAlleles, 2); } /** @@ -147,14 +147,12 @@ public class AFCalcFactory { * * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. 
Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy); } /** @@ -181,20 +179,17 @@ public class AFCalcFactory { * @param calc the calculation to use * @param nSamples the number of samples we'll be using * @param maxAltAlleles the max. alt alleles to consider for SNPs - * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs * @param ploidy the sample ploidy. Must be consistent with the calc * * @return an initialized AFCalc */ - public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) { if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null"); if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); - if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels); if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy); - final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels); - if ( ! calc.usableForParams(ploidy, maxAlt) ) + if ( ! 
calc.usableForParams(ploidy, maxAltAlleles) ) throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy); final Class afClass = getClassByName(calc.className); @@ -202,19 +197,19 @@ public class AFCalcFactory { throw new IllegalArgumentException("Unexpected AFCalc " + calc); try { - Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy}; - Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class); + Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy}; + Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class); return (AFCalc)c.newInstance(args); } catch (Exception e) { throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e); } } - protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) { + protected static List createAFCalcs(final List calcs, final int nSamples, final int maxAltAlleles, final int ploidy) { final List AFCalcs = new LinkedList(); for ( final Calculation calc : calcs ) - AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy)); + AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy)); return AFCalcs; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 6b345dcf5..4895c84d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -31,8 +31,8 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; public abstract class DiploidExactAFCalc extends ExactAFCalc { - public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, 
final int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); } @@ -75,16 +75,14 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { @Override protected VariantContext reduceScope(final VariantContext vc) { - final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype; - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { - logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); + List alleles = new ArrayList(getMaxAltAlleles() + 1); alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); builder.alleles(alleles); 
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); return builder.make(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java index df0793352..ab230d398 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -39,8 +39,8 @@ import java.util.ArrayList; abstract class ExactAFCalc extends AFCalc { protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 804b560b4..857b7e59e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -89,7 +89,7 @@ import java.util.*; /** * The min. confidence of an allele to be included in the joint posterior. 
*/ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20); + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); @@ -111,9 +111,9 @@ import java.util.*; */ final AFCalc biAlleleExactModel; - protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); - biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy); + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); } /** @@ -336,12 +336,13 @@ import java.util.*; // MLE of altI allele is simply the MLE of this allele in altAlleles alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); - log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); - log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); - // the AF > 0 case requires us to store the normalized likelihood for later summation - if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); // bind pNonRef for allele to 
the posterior value of the AF > 0 with the new adjusted prior diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java index ac4634666..fc26111e0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -13,8 +13,8 @@ import java.util.Map; * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference */ class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { - protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java index b4e7b2ab1..97e5fed3b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -1,7 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { - protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy); + protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, 
ploidy); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 48a2d2700..d71d0c9c8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -55,7 +55,7 @@ public class GLBasedSampleSelector extends SampleSelector { // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { flatPriors = new double[1+2*samples.size()]; - AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2); + AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2); } final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors); // do we want to let this qual go up or down? diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3a2d6f697..a1701e3e5 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("73c9b926c5e971a113de347a64fdcf20")); + Arrays.asList("543f68e42034bf44cfb24da8c9204320")); 
executeTest("test Multiple SNP alleles", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("7f8d13690cb7d4173787afa00c496f12")); + Arrays.asList("5ce03dd9ca2d9324c1d4a9d64389beb5")); executeTest("test reverse trim", spec); } From 008df545754ce9dee93a0f0de29056385bea73b2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 22 Oct 2012 14:21:52 -0400 Subject: [PATCH 51/54] Bug fix in GATKSAMRecord.getSoftEnd() for reads that are entirely clipped. --- .../sting/utils/sam/GATKSAMRecord.java | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 1feb76517..9fdb48b34 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -60,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; - private int softStart = -1; - private int softEnd = -1; + private final static int UNINITIALIZED = -1; + private int softStart = UNINITIALIZED; + private int softEnd = UNINITIALIZED; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -386,7 +387,7 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - if (softStart < 0) { + if ( softStart == UNINITIALIZED 
) { softStart = getAlignmentStart(); for (final CigarElement cig : getCigar().getCigarElements()) { final CigarOperator op = cig.getOperator(); @@ -408,17 +409,23 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - if ( softEnd < 0 ) { + if ( softEnd == UNINITIALIZED ) { + boolean foundAlignedBase = false; softEnd = getAlignmentEnd(); final List cigs = getCigar().getCigarElements(); - for (int i=cigs.size() - 1; i>=0; --i) { + for (int i = cigs.size() - 1; i >= 0; --i) { final CigarElement cig = cigs.get(i); final CigarOperator op = cig.getOperator(); - if (op == CigarOperator.SOFT_CLIP) + if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read softEnd += cig.getLength(); - else if (op != CigarOperator.HARD_CLIP) + else if (op != CigarOperator.HARD_CLIP) { + foundAlignedBase = true; break; + } + } + if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = getAlignmentEnd(); } } From fd59e7d5f65784182918a9f86f888cb3515bf1af Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 22 Oct 2012 16:27:31 -0400 Subject: [PATCH 52/54] Better error message when generic types are erased from scala collections. 
--- .../broadinstitute/sting/utils/exceptions/UserException.java | 3 +++ .../sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala | 3 +++ 2 files changed, 6 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 6b97f8f9f..c1f408bc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -352,6 +352,9 @@ public class UserException extends ReviewedStingException { } public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: " + message)); + } public CannotExecuteQScript(String message, Exception e) { super(String.format("Unable to execute QScript: " + message), e); } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 0d8edc25d..54e89ec58 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -28,6 +28,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.queue.QException import java.lang.Class import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine} +import org.broadinstitute.sting.utils.exceptions.UserException import java.lang.reflect.Type /** @@ -75,6 +76,8 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { val componentType = 
ReflectionUtils.getCollectionType(source.field) + if (componentType == classOf[java.lang.Object]) + throw new UserException.CannotExecuteQScript("Please also include a @ClassType(classOf[]) annotation on field: " + source.field + ". Example: @ClassType(classOf[Double]). The scala generic type for the field was subjected to java/scala type erasure and is not available via reflection.") val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) if (classOf[Seq[_]].isAssignableFrom(classType)) { From 15b28e61cd6f73c9ddbab1e06e8b0cb08e62572e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 22 Oct 2012 16:54:38 -0400 Subject: [PATCH 53/54] Retiring TraverseReads and TraverseLoci after testing confirms nano scheduler version in single threaded version is fine -- There's been no report of problems with the nano scheduled version of TraverseLoci and TraverseReads, so I'm removing the old versions since they are no longer needed -- Removing unnecessary intermediate base classes -- GSA-515 / Nanoscheduler GSA-549 / https://jira.broadinstitute.org/browse/GSA-549 --- .../sting/gatk/executive/MicroScheduler.java | 12 +- .../gatk/traversals/TraverseLociBase.java | 103 ---------------- .../gatk/traversals/TraverseLociLinear.java | 47 -------- .../gatk/traversals/TraverseLociNano.java | 85 +++++++++++++- .../gatk/traversals/TraverseReadPairs.java | 2 +- .../sting/gatk/traversals/TraverseReads.java | 111 ------------------ .../traversals/TraverseReadsUnitTest.java | 11 +- 7 files changed, 89 insertions(+), 282 deletions(-) delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java 
b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 223e11680..df4ed9ef8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -74,8 +74,6 @@ import java.util.*; * */ public abstract class MicroScheduler implements MicroSchedulerMBean { - // TODO -- remove me and retire non nano scheduled versions of traversals - private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -238,15 +236,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { @Ensures("result != null") private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { if (walker instanceof ReadWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseReads(); + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof LocusWalker) { - if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ) - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - else - return new TraverseLociLinear(); + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); } else if (walker instanceof DuplicateWalker) { return new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java deleted file mode 100755 index 30e78ef5c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java +++ /dev/null @@ -1,103 +0,0 @@ 
-package org.broadinstitute.sting.gatk.traversals; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. - */ -public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - - @Override - public final String getTraversalUnits() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - protected abstract TraverseResults traverse( final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum); - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( 
WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - updateCumulativeMetrics(dataProvider.getShard()); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. 
- */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java deleted file mode 100755 index 22381092f..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.GenomeLoc; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. 
- */ -public class TraverseLociLinear extends TraverseLociBase { - - @Override - protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { - // We keep processing while the next reference location is within the interval - boolean done = false; - int numIterations = 0; - - while( locusView.hasNext() && ! done ) { - numIterations++; - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - final M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(locus.getLocation()); - } - - return new TraverseResults(numIterations, sum); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 469625c30..84715e5fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -1,24 +1,26 @@ package org.broadinstitute.sting.gatk.traversals; +import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import 
org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.LocusView; -import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import java.util.Iterator; /** * A simple solution to iterating over all reference positions over a series of genomic locations. 
*/ -public class TraverseLociNano extends TraverseLociBase { +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { /** our log, which we want to capture anything from this class */ private static final boolean DEBUG = false; @@ -30,6 +32,81 @@ public class TraverseLociNano extends TraverseLociBase { } @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == 
DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! 
GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + protected TraverseResults traverse(final LocusWalker walker, final LocusView locusView, final LocusReferenceView referenceView, diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index aef3cf7d0..8273e1328 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -42,7 +42,7 @@ public class TraverseReadPairs extends TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); + logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider)); if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java deleted file mode 100755 index d41d17bde..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and 
to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadView; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * @author aaron - * @version 1.0 - * @date Apr 24, 2009 - *

- * Class TraverseReads - *

- * This class handles traversing by reads in the new shardable style - */ -public class TraverseReads extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraverseReads.class); - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - - // get the reference ordered data - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - - boolean done = walker.isDone(); - // while we still have more reads - for (final SAMRecord read : reads) { - if ( done ) break; - - // ReferenceContext -- the reference bases covered by the read - final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference() - ? reference.getReferenceContext(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? 
rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); - if (keepMeP) { - M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null - sum = walker.reduce(x, sum); - } - - final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - - updateCumulativeMetrics(dataProvider.getShard()); - printProgress(locus); - - done = walker.isDone(); - } - return sum; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 46f77c283..bf1fc9e65 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -62,9 +61,9 @@ public 
class TraverseReadsUnitTest extends BaseTest { private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); private List bamList; - private Walker countReadWalker; + private ReadWalker countReadWalker; private File output; - private TraverseReads traversalEngine = null; + private TraverseReadsNano traversalEngine = null; private IndexedFastaSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; @@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest { bamList.add(bam); countReadWalker = new CountReads(); - traversalEngine = new TraverseReads(); + traversalEngine = new TraverseReadsNano(1); traversalEngine.initialize(engine); } @@ -125,7 +124,7 @@ public class TraverseReadsUnitTest extends BaseTest { fail("Shard == null"); } - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); dataProvider.close(); } From f838815343d971d3a9ff2eda0df931aa69654c52 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 23 Oct 2012 06:47:53 -0400 Subject: [PATCH 54/54] Updating MD5s for confidence ref site estimation in IndependentAllelesDiploidExactAFCalc -- Included logic to only add priors for alleles with sufficient evidence to be called polymorphic. 
If no alleles are poly, make sure to add priors of the first allele --- .../afcalc/IndependentAllelesDiploidExactAFCalc.java | 8 ++++++++ .../genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index 857b7e59e..d0b801a20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -329,6 +329,7 @@ import java.util.*; double log10PosteriorOfACEq0Sum = 0.0; double log10PosteriorOfACGt0Sum = 0.0; + boolean anyPoly = false; for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); final int altI = vc.getAlleles().indexOf(altAllele) - 1; @@ -338,6 +339,7 @@ import java.util.*; // the AF > 0 case requires us to store the normalized likelihood for later summation if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); @@ -352,6 +354,12 @@ import java.util.*; nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! 
anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). We want to estimate confidently // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index a1701e3e5..dee54b9f8 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -192,12 +192,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8800c58715c2bb434b69e1873cb77de6"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "639df9f4c029792ac2e46069efc82b20"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "41c046d38ea328421df924e37e017645"); } private void testOutputParameters(final String args, final String md5) {