Merge GATK repo into CMI-GATK
Bringing in the following relevant changes:
* Fixes the indel realigner N-Way out null pointer exception DEV-10
* Optimizations to ReduceReads that bring the run time down to one third.

Conflicts:
protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java

DEV-10 #resolve #time 2m
This commit is contained in:
commit
c210b7cde4
|
|
@ -1,8 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
|
||||
*
|
||||
|
|
@ -10,35 +7,31 @@ import java.util.Map;
|
|||
* @since 6/15/12
|
||||
*/
|
||||
public class BaseAndQualsCounts extends BaseCounts {
|
||||
private final Map<BaseIndex, Long> sumInsertionQuals;
|
||||
private final Map<BaseIndex, Long> sumDeletionQuals;
|
||||
private final long[] sumInsertionQuals;
|
||||
private final long[] sumDeletionQuals;
|
||||
|
||||
public BaseAndQualsCounts() {
|
||||
super();
|
||||
this.sumInsertionQuals = new HashMap<BaseIndex, Long>();
|
||||
this.sumDeletionQuals = new HashMap<BaseIndex, Long>();
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
sumInsertionQuals.put(i, 0L);
|
||||
sumDeletionQuals.put(i, 0L);
|
||||
this.sumInsertionQuals = new long[BaseIndex.values().length];
|
||||
this.sumDeletionQuals = new long[BaseIndex.values().length];
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
sumInsertionQuals[i.index] = 0L;
|
||||
sumDeletionQuals[i.index] = 0L;
|
||||
}
|
||||
}
|
||||
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
|
||||
}
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.incr(i, baseQual);
|
||||
sumInsertionQuals[i.index] += insQual;
|
||||
sumDeletionQuals[i.index] += delQual;
|
||||
}
|
||||
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
|
||||
}
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.decr(i, baseQual);
|
||||
sumInsertionQuals[i.index] -= insQual;
|
||||
sumDeletionQuals[i.index] -= delQual;
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
|
|
@ -49,7 +42,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
return getGenericAverageQualOfBase(base, sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) {
|
||||
return (byte) (sumQuals[base.index] / countOfBase(base));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object to keep track of the number of occurrences of each base and its quality.
|
||||
|
|
@ -18,79 +16,73 @@ import java.util.Map;
|
|||
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
|
||||
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
|
||||
|
||||
private final Map<BaseIndex, Integer> counts; // keeps track of the base counts
|
||||
private final Map<BaseIndex, Long> sumQuals; // keeps track of the quals of each base
|
||||
private final int[] counts; // keeps track of the base counts
|
||||
private final long[] sumQuals; // keeps track of the quals of each base
|
||||
private int totalCount = 0; // keeps track of total count since this is requested so often
|
||||
|
||||
public BaseCounts() {
|
||||
counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class);
|
||||
sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class);
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
counts.put(i, 0);
|
||||
sumQuals.put(i, 0L);
|
||||
counts = new int[BaseIndex.values().length];
|
||||
sumQuals = new long[BaseIndex.values().length];
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
counts[i.index] = 0;
|
||||
sumQuals[i.index] = 0L;
|
||||
}
|
||||
}
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
BaseCounts baseCounts = new BaseCounts();
|
||||
baseCounts.counts.put(BaseIndex.A, countsACGT[0]);
|
||||
baseCounts.counts.put(BaseIndex.C, countsACGT[1]);
|
||||
baseCounts.counts.put(BaseIndex.G, countsACGT[2]);
|
||||
baseCounts.counts.put(BaseIndex.T, countsACGT[3]);
|
||||
baseCounts.counts[BaseIndex.A.index] = countsACGT[0];
|
||||
baseCounts.counts[BaseIndex.C.index] = countsACGT[1];
|
||||
baseCounts.counts[BaseIndex.G.index] = countsACGT[2];
|
||||
baseCounts.counts[BaseIndex.T.index] = countsACGT[3];
|
||||
baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3];
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
public void add(final BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
final int otherCount = other.counts[i.index];
|
||||
counts[i.index] += otherCount;
|
||||
totalCount += otherCount;
|
||||
}
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
public void sub(final BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
final int otherCount = other.counts[i.index];
|
||||
counts[i.index] -= otherCount;
|
||||
totalCount -= otherCount;
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final byte base) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
counts[i.index]++;
|
||||
totalCount++;
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final BaseIndex base, final byte qual) {
|
||||
counts[base.index]++;
|
||||
totalCount++;
|
||||
sumQuals[base.index] += qual;
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
public void decr(final byte base) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
}
|
||||
counts[i.index]--;
|
||||
totalCount--;
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(final byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final BaseIndex base, final byte qual) {
|
||||
counts[base.index]--;
|
||||
totalCount--;
|
||||
sumQuals[base.index] -= qual;
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -100,27 +92,32 @@ import java.util.Map;
|
|||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
return sumQuals[base.index];
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(final byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
return (byte) (getSumQuals(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(final BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
return (byte) (getSumQuals(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final byte base) {
|
||||
return countOfBase(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
return counts[base.index];
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
return sumQuals[base.index];
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -131,44 +128,36 @@ import java.util.Map;
|
|||
|
||||
@Ensures("result >= 0")
|
||||
public int totalCount() {
|
||||
int sum = 0;
|
||||
for (int c : counts.values())
|
||||
sum += c;
|
||||
|
||||
return sum;
|
||||
return totalCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base, it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param base
|
||||
* @param base base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(final byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
return baseCountProportion(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base, it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param baseIndex
|
||||
* @param baseIndex base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(baseIndex) / totalCount();
|
||||
return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount;
|
||||
}
|
||||
|
||||
|
||||
@Ensures("result != null")
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
|
||||
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
b.append(i.toString()).append("=").append(counts[i.index]).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
|
@ -180,9 +169,9 @@ import java.util.Map;
|
|||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (counts[i.index] > counts[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
|
@ -190,17 +179,17 @@ import java.util.Map;
|
|||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (i.isNucleotide() && counts[i.index] > counts[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
|
||||
final int targetCount = counts.get(targetIndex);
|
||||
final int testCount = counts.get(testIndex);
|
||||
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
|
||||
final int targetCount = counts[targetIndex.index];
|
||||
final int testCount = counts[testIndex.index];
|
||||
return ( targetCount > testCount || (targetCount == testCount && sumQuals[targetIndex.index] > sumQuals[testIndex.index]) );
|
||||
}
|
||||
|
||||
public byte baseWithMostProbability() {
|
||||
|
|
@ -210,48 +199,42 @@ import java.util.Map;
|
|||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (sumQuals[i.index] > sumQuals[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
|
||||
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet())
|
||||
if (entry.getKey().isNucleotide())
|
||||
sum += entry.getValue();
|
||||
return sum;
|
||||
return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the proportional count of a base compared to all other bases except indels (I and D)
|
||||
*
|
||||
* @param index
|
||||
* @param base base
|
||||
* @return the proportion of this base over all other bases except indels
|
||||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Requires("base.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex index) {
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex base) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total;
|
||||
}
|
||||
|
||||
public Object[] countsArray() {
|
||||
return counts.values().toArray();
|
||||
public int[] countsArray() {
|
||||
return counts.clone();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* Simple byte / base index conversions
|
||||
*
|
||||
|
|
@ -56,7 +58,7 @@ public enum BaseIndex {
|
|||
case 'N':
|
||||
case 'n':
|
||||
return N;
|
||||
default: return null;
|
||||
default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +70,7 @@ public enum BaseIndex {
|
|||
* @return whether or not it is a nucleotide, given the definition above
|
||||
*/
|
||||
public boolean isNucleotide() {
|
||||
return this == A || this == C || this == G || this == T || this == N;
|
||||
return !isIndel();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -157,11 +157,9 @@ public class HeaderElement {
|
|||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromInsertions(double minIndelProportion) {
|
||||
int numberOfBases = consensusBaseCounts.totalCount();
|
||||
if (numberOfBases == 0 && insertionsToTheRight > 0)
|
||||
return true; // we only have insertions
|
||||
else if (numberOfBases == 0)
|
||||
return false; // we don't have anything
|
||||
final int numberOfBases = consensusBaseCounts.totalCount();
|
||||
if (numberOfBases == 0)
|
||||
return (insertionsToTheRight > 0); // do we only have insertions?
|
||||
|
||||
// if we have bases and insertions, check the ratio
|
||||
return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
|
||||
|
|
@ -215,11 +213,11 @@ public class HeaderElement {
|
|||
if (totalCount == 0)
|
||||
return 0;
|
||||
|
||||
Object[] countsArray = consensusBaseCounts.countsArray();
|
||||
int[] countsArray = consensusBaseCounts.countsArray();
|
||||
Arrays.sort(countsArray);
|
||||
for (int i = countsArray.length-1; i>=0; i--) {
|
||||
nHaplotypes++;
|
||||
runningCount += (Integer) countsArray[i];
|
||||
runningCount += countsArray[i];
|
||||
if (runningCount/totalCount > minVariantProportion)
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import java.util.*;
|
|||
public class SlidingWindow {
|
||||
|
||||
// Sliding Window data
|
||||
final private LinkedList<GATKSAMRecord> readsInWindow;
|
||||
final private TreeSet<GATKSAMRecord> readsInWindow;
|
||||
final private LinkedList<HeaderElement> windowHeader;
|
||||
protected int contextSize; // the largest context size (between mismatches and indels)
|
||||
protected String contig;
|
||||
|
|
@ -97,7 +97,13 @@ public class SlidingWindow {
|
|||
this.MIN_MAPPING_QUALITY = minMappingQuality;
|
||||
|
||||
this.windowHeader = new LinkedList<HeaderElement>();
|
||||
this.readsInWindow = new LinkedList<GATKSAMRecord>();
|
||||
this.readsInWindow = new TreeSet<GATKSAMRecord>(new Comparator<GATKSAMRecord>() {
|
||||
@Override
|
||||
public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
|
||||
final int difference = read1.getSoftEnd() - read2.getSoftEnd();
|
||||
return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
this.contig = contig;
|
||||
this.contigIndex = contigIndex;
|
||||
|
|
@ -195,55 +201,105 @@ public class SlidingWindow {
|
|||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||
*/
|
||||
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
|
||||
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) {
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader);
|
||||
boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex);
|
||||
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
|
||||
markSites(incomingReadUnclippedStart);
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, variantSite);
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet());
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
final int windowHeaderStartLoc = getStartLocation(windowHeader);
|
||||
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < windowHeaderStartLoc) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
|
||||
readsInWindow.pollFirst();
|
||||
}
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
|
||||
private final class MarkedSites {
|
||||
|
||||
private boolean[] siteIsVariant = new boolean[0];
|
||||
private int startLocation = 0;
|
||||
|
||||
public MarkedSites() {}
|
||||
|
||||
public boolean[] getVariantSiteBitSet() { return siteIsVariant; }
|
||||
|
||||
/**
|
||||
* Updates the variant site bitset given the new start location and size of the region to mark.
|
||||
*
|
||||
* @param newStartLocation the new start location of the bitset
|
||||
* @param sizeOfRegion the new size of the region to be represented
|
||||
*
|
||||
* @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder.
|
||||
*/
|
||||
public int updateRegion(final int newStartLocation, final int sizeOfRegion) {
|
||||
int lastPositionMarked = sizeOfRegion;
|
||||
|
||||
// if this is the first time we set the array and we can't reuse anything, just create a new array from scratch
|
||||
if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) {
|
||||
siteIsVariant = new boolean[sizeOfRegion];
|
||||
lastPositionMarked = 0;
|
||||
}
|
||||
// if the dimensions change, copy what we can and continue
|
||||
else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) {
|
||||
final boolean[] tempArray = new boolean[sizeOfRegion];
|
||||
final int differenceInStartPositions = newStartLocation - this.startLocation;
|
||||
lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion);
|
||||
System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked);
|
||||
siteIsVariant = null; // explicitly allow garbage collection
|
||||
siteIsVariant = tempArray;
|
||||
}
|
||||
|
||||
this.startLocation = newStartLocation;
|
||||
|
||||
return lastPositionMarked + newStartLocation;
|
||||
}
|
||||
}
|
||||
|
||||
private final MarkedSites markedSites = new MarkedSites();
|
||||
|
||||
/**
|
||||
* returns an array marked with variant and non-variant regions (it uses
|
||||
* markVariantRegions to make the marks)
|
||||
*
|
||||
* @param stop check the window from start to stop (not-inclusive)
|
||||
* @return a boolean array with 'true' marking variant regions and false marking consensus sites
|
||||
*/
|
||||
protected boolean[] markSites(int stop) {
|
||||
protected void markSites(final int stop) {
|
||||
|
||||
boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1];
|
||||
final int windowHeaderStartLocation = getStartLocation(windowHeader);
|
||||
final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1;
|
||||
|
||||
// copy over as many bits as we can from the previous calculation. Note that we can't trust the
|
||||
// last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there.
|
||||
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
|
||||
final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
|
||||
|
||||
// update the iterator to the correct position
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
|
||||
for (int i = getStartLocation(windowHeader); i < stop; i++) {
|
||||
for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
|
||||
if (headerElementIterator.hasNext())
|
||||
headerElementIterator.next();
|
||||
}
|
||||
|
||||
// process a contextSize worth of region from scratch in case there's a variant there
|
||||
for (int i = locationToProcess; i < stop; i++) {
|
||||
if (headerElementIterator.hasNext()) {
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
|
||||
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||
markVariantRegion(markedSites, i - getStartLocation(windowHeader));
|
||||
markVariantRegion(markedSites, i - windowHeaderStartLocation);
|
||||
|
||||
} else
|
||||
break;
|
||||
}
|
||||
return markedSites;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -252,11 +308,11 @@ public class SlidingWindow {
|
|||
* @param markedSites the boolean array to bear the marks
|
||||
* @param variantSiteLocation the location where a variant site was found
|
||||
*/
|
||||
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
|
||||
protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
|
||||
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
|
||||
for (int i = from; i < to; i++)
|
||||
markedSites[i] = true;
|
||||
markedSites.getVariantSiteBitSet()[i] = true;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -630,8 +686,8 @@ public class SlidingWindow {
|
|||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1);
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), variantSite);
|
||||
markSites(getStopLocation(windowHeader) + 1);
|
||||
CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
|
|
@ -640,6 +696,7 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ public class ErrorModel {
|
|||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
if (refSampleVC.isIndel()) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
|
|||
|
||||
// find the alternate allele(s) that we should be using
|
||||
final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
|
||||
if (alleles == null || alleles.isEmpty())
|
||||
if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference()))
|
||||
return null;
|
||||
// start making the VariantContext
|
||||
final GenomeLoc loc = ref.getLocus();
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
|
|||
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ public class AFCalcTestBuilder {
|
|||
}
|
||||
|
||||
public AFCalc makeModel() {
|
||||
return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), getNumAltAlleles(), 2);
|
||||
return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2);
|
||||
}
|
||||
|
||||
public double[] makePriors() {
|
||||
|
|
|
|||
|
|
@ -26,7 +26,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -41,22 +40,20 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(VariantContext vc) {
|
||||
final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > maxAltAlleles) {
|
||||
logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) {
|
||||
logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
final List<Allele> alleles = new ArrayList<Allele>(maxAltAlleles + 1);
|
||||
final List<Allele> alleles = new ArrayList<Allele>(getMaxAltAlleles() + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy));
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy));
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.alleles(alleles);
|
||||
|
|
@ -69,8 +66,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
|
||||
@Override
|
||||
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
|
||||
return resultFromTracker(vc, log10AlleleFrequencyPriors);
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors);
|
||||
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -128,6 +125,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
* @return list of numAllelesToChoose most likely alleles
|
||||
*/
|
||||
|
||||
private static final int PL_INDEX_OF_HOM_REF = 0;
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
|
|
@ -135,7 +133,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), false);
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
|
|
@ -143,7 +141,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
|
||||
for (int k=1; k < acCount.length;k++) {
|
||||
if (acCount[k] > 0)
|
||||
likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
|
||||
likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -170,13 +168,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
* @param numAlleles Number of alternate alleles
|
||||
* @param ploidyPerPool Number of samples per pool
|
||||
* @param log10AlleleFrequencyPriors Frequency priors
|
||||
* @param resultTracker object to fill with output values
|
||||
*/
|
||||
protected static void combineSinglePools(final GenotypesContext GLs,
|
||||
final int numAlleles,
|
||||
final int ploidyPerPool,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
protected void combineSinglePools(final GenotypesContext GLs,
|
||||
final int numAlleles,
|
||||
final int ploidyPerPool,
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
|
||||
|
||||
|
|
@ -195,24 +191,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
|
||||
if ( genotypeLikelihoods.size() <= 1 ) {
|
||||
// no meaningful GLs at all, just set the tracker to non poly values
|
||||
resultTracker.reset(); // just mimic-ing call below
|
||||
resultTracker.setLog10LikelihoodOfAFzero(0.0);
|
||||
getStateTracker().reset(); // just mimic-ing call below
|
||||
getStateTracker().setLog10LikelihoodOfAFzero(0.0);
|
||||
} else {
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, resultTracker);
|
||||
getStateTracker().reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p),
|
||||
combinedPloidy, ploidyPerPool, numAlleles, log10AlleleFrequencyPriors);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
|
||||
|
||||
public CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool,
|
||||
double[] newGL,
|
||||
int originalPloidy,
|
||||
int newGLPloidy,
|
||||
int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
|
||||
|
|
@ -229,16 +225,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
StateTracker stateTracker = new StateTracker();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
resultTracker.incNEvaluations();
|
||||
getStateTracker().incNEvaluations();
|
||||
// compute log10Likelihoods
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > stateTracker.getMaxLog10L())
|
||||
stateTracker.update(log10LofKs, ACset.getACcounts());
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(ACset.getACcounts());
|
||||
|
|
@ -259,39 +250,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
* @param log10AlleleFrequencyPriors Prior object
|
||||
* @param originalPloidy Total ploidy of original combined pool
|
||||
* @param newGLPloidy Ploidy of GL vector
|
||||
* @param resultTracker AFResult object
|
||||
* @param stateTracker max likelihood observed so far
|
||||
* @param ACqueue Queue of conformations to compute
|
||||
* @param indexesToACset AC indices of objects in queue
|
||||
* @return max log likelihood
|
||||
*/
|
||||
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
|
||||
final CombinedPoolLikelihoods newPool,
|
||||
final CombinedPoolLikelihoods originalPool,
|
||||
final double[] newGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int originalPloidy,
|
||||
final int newGLPloidy,
|
||||
final AFCalcResultTracker resultTracker,
|
||||
final StateTracker stateTracker,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
private double calculateACConformationAndUpdateQueue(final ExactACset set,
|
||||
final CombinedPoolLikelihoods newPool,
|
||||
final CombinedPoolLikelihoods originalPool,
|
||||
final double[] newGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int originalPloidy,
|
||||
final int newGLPloidy,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
|
||||
// compute likeihood in "set" of new set based on original likelihoods
|
||||
final int numAlleles = set.getACcounts().getCounts().length;
|
||||
final int newPloidy = set.getACsum();
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
|
||||
|
||||
|
||||
// add to new pool
|
||||
if (!Double.isInfinite(log10LofK))
|
||||
newPool.add(set);
|
||||
|
||||
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
|
||||
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
|
||||
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
|
||||
// TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
|
||||
if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
|
|
@ -322,67 +306,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Naive combiner of two multiallelic pools - number of alt alleles must be the same.
|
||||
* Math is generalization of biallelic combiner.
|
||||
*
|
||||
* For vector K representing an allele count conformation,
|
||||
* Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
|
||||
* where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
|
||||
* @param originalPool First log-likelihood pool GL vector
|
||||
* @param yy Second pool GL vector
|
||||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param numAlleles Number of alleles
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param resultTracker Af calculation result object
|
||||
*/
|
||||
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
/*
|
||||
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
|
||||
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
|
||||
|
||||
if (dim1 != originalPool.getLength() || dim2 != yy.length)
|
||||
throw new ReviewedStingException("BUG: Inconsistent vector length");
|
||||
|
||||
if (ploidy2 == 0)
|
||||
return;
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
|
||||
// and L2(K) = Pr(D|AC2=K) * choose(m2,K)
|
||||
GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
|
||||
final double[] x = originalPool.getLikelihoodsAsVector(true);
|
||||
while(firstIterator.hasNext()) {
|
||||
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
|
||||
firstIterator.next();
|
||||
}
|
||||
|
||||
GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
|
||||
final double[] y = yy.clone();
|
||||
while(secondIterator.hasNext()) {
|
||||
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
|
||||
secondIterator.next();
|
||||
}
|
||||
|
||||
// initialize output to -log10(choose(m1+m2,[k1 k2...])
|
||||
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
|
||||
final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
|
||||
|
||||
|
||||
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
|
||||
while(outputIterator.hasNext()) {
|
||||
final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
|
||||
double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
|
||||
|
||||
originalPool.add(likelihood, set, outputIterator.getLinearIndex());
|
||||
outputIterator.next();
|
||||
}
|
||||
*/
|
||||
}
|
||||
// /**
|
||||
// * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
|
||||
// * Math is generalization of biallelic combiner.
|
||||
// *
|
||||
// * For vector K representing an allele count conformation,
|
||||
// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
|
||||
// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
|
||||
// * @param originalPool First log-likelihood pool GL vector
|
||||
// * @param yy Second pool GL vector
|
||||
// * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
// * @param ploidy2 Ploidy of second pool
|
||||
// * @param numAlleles Number of alleles
|
||||
// * @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
// * @param resultTracker Af calculation result object
|
||||
// */
|
||||
// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
|
||||
// final double[] log10AlleleFrequencyPriors,
|
||||
// final AFCalcResultTracker resultTracker) {
|
||||
///*
|
||||
// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
|
||||
// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
|
||||
//
|
||||
// if (dim1 != originalPool.getLength() || dim2 != yy.length)
|
||||
// throw new ReviewedStingException("BUG: Inconsistent vector length");
|
||||
//
|
||||
// if (ploidy2 == 0)
|
||||
// return;
|
||||
//
|
||||
// final int newPloidy = ploidy1 + ploidy2;
|
||||
//
|
||||
// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
|
||||
// // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
|
||||
// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
|
||||
// final double[] x = originalPool.getLikelihoodsAsVector(true);
|
||||
// while(firstIterator.hasNext()) {
|
||||
// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
|
||||
// firstIterator.next();
|
||||
// }
|
||||
//
|
||||
// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
|
||||
// final double[] y = yy.clone();
|
||||
// while(secondIterator.hasNext()) {
|
||||
// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
|
||||
// secondIterator.next();
|
||||
// }
|
||||
//
|
||||
// // initialize output to -log10(choose(m1+m2,[k1 k2...])
|
||||
// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
|
||||
// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
|
||||
//
|
||||
//
|
||||
// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
|
||||
// while(outputIterator.hasNext()) {
|
||||
// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
|
||||
// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
|
||||
//
|
||||
// originalPool.add(likelihood, set, outputIterator.getLinearIndex());
|
||||
// outputIterator.next();
|
||||
// }
|
||||
//*/
|
||||
// }
|
||||
|
||||
/**
|
||||
* Compute likelihood of a particular AC conformation and update AFresult object
|
||||
|
|
@ -393,15 +377,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
* @param numAlleles Number of alleles (including ref)
|
||||
* @param ploidy1 Ploidy of original pool (combined)
|
||||
* @param ploidy2 Ploidy of new pool
|
||||
* @param resultTracker AFResult object
|
||||
* @return log-likehood of requested conformation
|
||||
*/
|
||||
private static double computeLofK(final ExactACset set,
|
||||
final CombinedPoolLikelihoods firstGLs,
|
||||
final double[] secondGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int numAlleles, final int ploidy1, final int ploidy2,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
private double computeLofK(final ExactACset set,
|
||||
final CombinedPoolLikelihoods firstGLs,
|
||||
final double[] secondGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int numAlleles, final int ploidy1, final int ploidy2) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
|
|
@ -419,8 +401,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
|
||||
set.getLog10Likelihoods()[0] = log10Lof0;
|
||||
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return log10Lof0;
|
||||
|
||||
} else {
|
||||
|
|
@ -463,14 +445,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
|
||||
// update the MLE if necessary
|
||||
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
|
||||
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
|
||||
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
|
||||
getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for (final int ACcount : altCounts ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
|
||||
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
|
||||
getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
|
@ -493,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
return (sum == ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines naively two biallelic pools (of arbitrary size).
|
||||
* For two pools of size m1 and m2, we can compute the combined likelihood as:
|
||||
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
|
||||
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
|
||||
* @param newPLVector Second GL vector
|
||||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param resultTracker Af calculation result object
|
||||
* @return Combined likelihood vector
|
||||
*/
|
||||
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
|
||||
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
final double[] combinedLikelihoods = new double[1+newPloidy];
|
||||
|
||||
/** Pre-fill result array and incorporate weights into input vectors
|
||||
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
|
||||
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
|
||||
* equation reduces to
|
||||
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
|
||||
* which is just plain convolution of L1 and L2 (with pre-existing vector)
|
||||
*/
|
||||
|
||||
// intialize result vector to -infinity
|
||||
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
|
||||
|
||||
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
|
||||
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
|
||||
x[k] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
final double[] y = newPLVector.clone();
|
||||
|
||||
|
||||
final double log10Lof0 = x[0]+y[0];
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
|
||||
double maxElement = log10Lof0;
|
||||
int maxElementIdx = 0;
|
||||
int[] alleleCounts = new int[1];
|
||||
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
|
||||
double[] acc = new double[k+1];
|
||||
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
|
||||
double innerMax = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int j=0; j <=k; j++) {
|
||||
double x1,y1;
|
||||
|
||||
|
||||
if (k-j>=0 && k-j < y.length)
|
||||
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (j < x.length)
|
||||
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (Double.isInfinite(x1) || Double.isInfinite(y1))
|
||||
continue;
|
||||
acc[j] = x1 + y1;
|
||||
if (acc[j] > innerMax)
|
||||
innerMax = acc[j];
|
||||
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
|
||||
break;
|
||||
}
|
||||
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
|
||||
maxElementIdx = k;
|
||||
double maxDiff = combinedLikelihoods[k] - maxElement;
|
||||
if (maxDiff > 0)
|
||||
maxElement = combinedLikelihoods[k];
|
||||
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
|
||||
break;
|
||||
}
|
||||
|
||||
alleleCounts[0] = k;
|
||||
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
|
||||
* including updating the PL's, and assign genotypes accordingly
|
||||
|
|
@ -674,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
|||
*
|
||||
* @return genotype
|
||||
*/
|
||||
private static void assignGenotype(final GenotypeBuilder gb,
|
||||
final double[] newLikelihoods,
|
||||
final List<Allele> allelesToUse,
|
||||
final int numChromosomes) {
|
||||
private void assignGenotype(final GenotypeBuilder gb,
|
||||
final double[] newLikelihoods,
|
||||
final List<Allele> allelesToUse,
|
||||
final int numChromosomes) {
|
||||
final int numNewAltAlleles = allelesToUse.size() - 1;
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
|||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
|
@ -234,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
samplesList.addAll( samples );
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
|
||||
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
|
|
@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
}
|
||||
|
||||
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
|
||||
}
|
||||
|
||||
|
|
@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
|
|
|||
|
|
@ -30,6 +30,9 @@ import com.google.java.contract.Requires;
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
|
@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine {
|
|||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
|
||||
pairHMM = new PairHMM( noBanded );
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
|
||||
|
||||
switch (hmmType) {
|
||||
case EXACT:
|
||||
pairHMM = new ExactPairHMM();
|
||||
break;
|
||||
case ORIGINAL:
|
||||
pairHMM = new OriginalPairHMM();
|
||||
break;
|
||||
case CACHING:
|
||||
pairHMM = new CachingPairHMM();
|
||||
break;
|
||||
case LOGLESS_CACHING:
|
||||
pairHMM = new LoglessCachingPairHMM();
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
|
||||
}
|
||||
|
||||
this.constantGCP = constantGCP;
|
||||
DEBUG = debug;
|
||||
}
|
||||
|
|
@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine {
|
|||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray );
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
|
||||
}
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
|
|
@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine {
|
|||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP,
|
||||
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
|
||||
readCounts[jjj][iii] = readCount;
|
||||
}
|
||||
}
|
||||
|
|
@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine {
|
|||
return iii;
|
||||
}
|
||||
}
|
||||
return b1.length;
|
||||
return Math.min(b1.length, b2.length);
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
|
|
@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine {
|
|||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
// set up the default 1-to-1 haplotype mapping object
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
|
|
@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine {
|
|||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isReference() ) { return haplotypes.indexOf(h); }
|
||||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,181 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class CachingPairHMM extends OriginalPairHMM {
|
||||
|
||||
double[][] constantMatrix = null; // The cache in the CachingPairHMM
|
||||
double[][] distanceMatrix = null; // The cache in the CachingPairHMM
|
||||
|
||||
protected static final double [] firstRowConstantMatrix = {
|
||||
QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
|
||||
QualityUtils.qualToProbLog10(DEFAULT_GCP),
|
||||
QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
|
||||
QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
|
||||
0.0,
|
||||
0.0
|
||||
};
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
constantMatrix = new double[X_METRIC_LENGTH][6];
|
||||
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
// fill in the first row
|
||||
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
if( recacheReadValues ) {
|
||||
initializeConstants( insertionGOP, deletionGOP, overallGCP );
|
||||
}
|
||||
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
for (int i = 2; i < X_METRIC_LENGTH; i++) {
|
||||
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
|
||||
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializeDistanceMatrix( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int startIndex ) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
public void initializeConstants( final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP ) {
|
||||
|
||||
final int l = insertionGOP.length;
|
||||
constantMatrix[1] = firstRowConstantMatrix;
|
||||
for (int i = 0; i < l; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
|
||||
constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
|
||||
constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
|
||||
constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
|
||||
constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
|
||||
}
|
||||
constantMatrix[l+1][4] = 0.0;
|
||||
constantMatrix[l+1][5] = 0.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a cell in the HMM matrix
|
||||
*
|
||||
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
|
||||
* initial conditions
|
||||
|
||||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param constants an array with the six constants relevant to this location
|
||||
* @param matchMetricArray the matches likelihood matrix
|
||||
* @param XMetricArray the insertions likelihood matrix
|
||||
* @param YMetricArray the deletions likelihood matrix
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
matchMetricArray[indI][indJ] = prior +
|
||||
MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
|
||||
XMetricArray[indI - 1][indJ - 1] + constants[1],
|
||||
YMetricArray[indI - 1][indJ - 1] + constants[1] );
|
||||
XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
|
||||
XMetricArray[indI - 1][indJ] + constants[3]);
|
||||
YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
|
||||
YMetricArray[indI][indJ - 1] + constants[5]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,187 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class LoglessCachingPairHMM extends CachingPairHMM {
|
||||
|
||||
protected static final double SCALE_FACTOR_LOG10 = 300.0;
|
||||
|
||||
protected static final double [] firstRowConstantMatrix = {
|
||||
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
|
||||
QualityUtils.qualToProb(DEFAULT_GCP),
|
||||
QualityUtils.qualToErrorProb(DEFAULT_GOP),
|
||||
QualityUtils.qualToErrorProb(DEFAULT_GCP),
|
||||
1.0,
|
||||
1.0
|
||||
};
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], 0.0);
|
||||
Arrays.fill(XMetricArray[iii], 0.0);
|
||||
Arrays.fill(YMetricArray[iii], 0.0);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0);
|
||||
|
||||
constantMatrix = new double[X_METRIC_LENGTH][6];
|
||||
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
// fill in the first row
|
||||
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
if( recacheReadValues ) {
|
||||
initializeConstants( insertionGOP, deletionGOP, overallGCP );
|
||||
}
|
||||
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
for (int i = 2; i < X_METRIC_LENGTH; i++) {
|
||||
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
|
||||
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializeDistanceMatrix( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final int startIndex ) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
public void initializeConstants( final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP ) {
|
||||
|
||||
final int l = insertionGOP.length;
|
||||
constantMatrix[1] = firstRowConstantMatrix;
|
||||
for (int i = 0; i < l; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
|
||||
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
|
||||
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
}
|
||||
constantMatrix[l+1][4] = 1.0;
|
||||
constantMatrix[l+1][5] = 1.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a cell in the HMM matrix
|
||||
*
|
||||
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
|
||||
* initial conditions
|
||||
|
||||
* @param indI row index in the matrices to update
|
||||
* @param indJ column index in the matrices to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param constants an array with the six constants relevant to this location
|
||||
* @param matchMetricArray the matches likelihood matrix
|
||||
* @param XMetricArray the insertions likelihood matrix
|
||||
* @param YMetricArray the deletions likelihood matrix
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
|
||||
XMetricArray[indI - 1][indJ - 1] * constants[1] +
|
||||
YMetricArray[indI - 1][indJ - 1] * constants[1] );
|
||||
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
|
||||
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
|
||||
}
|
||||
}
|
||||
|
|
@ -60,27 +60,27 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","7d6f319b9edcb1ff8c290fef150a2df8");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","dd02890123e07e7412a49475cb6280f1");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
|
|
|
|||
|
|
@ -56,6 +56,11 @@ public class AFCalcResultUnitTest extends BaseTest {
|
|||
tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})});
|
||||
}
|
||||
|
||||
// test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost
|
||||
for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) {
|
||||
tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
|
||||
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
|
|
@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
new GetGLsTest(model, 1, genotypes, priors, priorName);
|
||||
|
||||
// tri-allelic
|
||||
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
|
||||
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
|
|
@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@DataProvider(name = "badGLs")
|
||||
public Object[][] createBadGLs() {
|
||||
final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
|
||||
final int nSamples = genotypes.size();
|
||||
// @DataProvider(name = "badGLs")
|
||||
// public Object[][] createBadGLs() {
|
||||
// final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
|
||||
// final int nSamples = genotypes.size();
|
||||
//
|
||||
// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
|
||||
//
|
||||
// final int nPriorValues = 2*nSamples+1;
|
||||
// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
// for ( AFCalc model : Arrays.asList(indCalc) ) {
|
||||
// final String priorName = "flat";
|
||||
// new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
// }
|
||||
//
|
||||
// return GetGLsTest.getTests(GetGLsTest.class);
|
||||
// }
|
||||
|
||||
final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
for ( AFCalc model : Arrays.asList(indCalc) ) {
|
||||
final String priorName = "flat";
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
//
|
||||
// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
|
||||
// public void testBadGLs(GetGLsTest cfg) {
|
||||
// testResultSimple(cfg);
|
||||
// }
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
|
||||
public void testBiallelicGLs(GetGLsTest cfg) {
|
||||
|
|
@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "badGLs")
|
||||
public void testBadGLs(GetGLsTest cfg) {
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
private static class NonInformativeData {
|
||||
final Genotype nonInformative;
|
||||
final List<Genotype> called;
|
||||
|
|
@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
|
||||
|
||||
final int nSamples = samples.size();
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2);
|
||||
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors
|
||||
|
||||
for ( AFCalc model : calcs ) {
|
||||
if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc )
|
||||
continue;
|
||||
|
||||
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
|
||||
|
||||
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
|
||||
|
|
@ -428,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
"Actual pNonRef not within tolerance " + tolerance + " of expected");
|
||||
}
|
||||
|
||||
@DataProvider(name = "PNonRefBiallelicSystematic")
|
||||
public Object[][] makePNonRefBiallelicSystematic() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Integer> bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000);
|
||||
final List<List<Integer>> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true));
|
||||
|
||||
for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) {
|
||||
|
||||
if ( false ) { // for testing only
|
||||
tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))});
|
||||
} else {
|
||||
if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK
|
||||
|
||||
// test all combinations of PLs for 1 sample
|
||||
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) {
|
||||
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
|
||||
}
|
||||
|
||||
|
||||
final List<List<Integer>> smallDiploidPLs = new LinkedList<List<Integer>>();
|
||||
for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) {
|
||||
for ( int i = 0; i < 2; i++ ) {
|
||||
List<Integer> pls = new ArrayList<Integer>(Collections.nCopies(3, nonRefPL));
|
||||
pls.set(i, 0);
|
||||
smallDiploidPLs.add(pls);
|
||||
}
|
||||
}
|
||||
|
||||
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) {
|
||||
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
final List<List<Integer>> removeBadPLs(List<List<Integer>> listOfPLs) {
|
||||
List<List<Integer>> clean = new LinkedList<List<Integer>>();
|
||||
|
||||
for ( final List<Integer> PLs : listOfPLs ) {
|
||||
int x = PLs.get(0);
|
||||
boolean bad = false;
|
||||
for ( int pl1 : PLs )
|
||||
if ( pl1 > x )
|
||||
bad = true;
|
||||
else
|
||||
x = pl1;
|
||||
if ( ! bad ) clean.add(PLs);
|
||||
}
|
||||
|
||||
return clean;
|
||||
}
|
||||
|
||||
private List<Genotype> toGenotypes(final List<List<Integer>> PLsPerSample) {
|
||||
final List<Allele> nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(PLsPerSample.size());
|
||||
|
||||
for ( final List<Integer> PLs : PLsPerSample ) {
|
||||
final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3]));
|
||||
final int min = MathUtils.arrayMin(pls);
|
||||
for ( int i = 0; i < pls.length; i++ ) pls[i] -= min;
|
||||
genotypes.add(makePL(nocall, pls));
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic")
|
||||
private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List<Genotype> genotypes) {
|
||||
//logger.warn("Running " + modelType + " with " + genotypes);
|
||||
final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human);
|
||||
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C));
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
|
||||
final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
|
||||
|
||||
final double tolerance = 1e-3;
|
||||
Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance,
|
||||
"Actual pNonRef not within tolerance " + tolerance + " of expected");
|
||||
Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(),
|
||||
"Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE()));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test priors
|
||||
|
|
@ -495,7 +578,7 @@ public class AFCalcUnitTest extends BaseTest {
|
|||
|
||||
// list of all high-quality models in the system
|
||||
final List<AFCalcFactory.Calculation> models = Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT,
|
||||
AFCalcFactory.Calculation.getDefaultModel(),
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT);
|
||||
|
||||
|
|
|
|||
|
|
@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
|
|||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
|
||||
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
|
||||
final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy);
|
||||
calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele];
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "8c52c0955099cca3215a0d78fd455894");
|
||||
HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "61c1a0fb62d909229af6b5a91dad8b35");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
|
|
@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269");
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
|
|
@ -64,13 +64,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092");
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7"));
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -23,24 +23,26 @@
|
|||
*/
|
||||
|
||||
// our package
|
||||
package org.broadinstitute.sting.utils;
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PairHMMUnitTest extends BaseTest {
|
||||
final static boolean EXTENSIVE_TESTING = true;
|
||||
PairHMM hmm = new PairHMM( false ); // reference implementation
|
||||
PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding
|
||||
PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
|
||||
PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
|
||||
PairHMM cachingHMM = new CachingPairHMM();
|
||||
PairHMM loglessHMM = new LoglessCachingPairHMM();
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
|
||||
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
|
||||
|
||||
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
|
||||
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
|
||||
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
|
||||
}
|
||||
|
||||
|
|
@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
public double expectedLogL() {
|
||||
return expectedQual / -10.0;
|
||||
return (expectedQual / -10.0) + 0.03 ;
|
||||
}
|
||||
|
||||
public double tolerance() {
|
||||
return 0.1; // TODO FIXME arbitrary
|
||||
public double toleranceFromTheoretical() {
|
||||
return 0.2;
|
||||
}
|
||||
|
||||
public double calcLogL() {
|
||||
public double toleranceFromReference() {
|
||||
return 1E-4;
|
||||
}
|
||||
|
||||
double logL = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
public double toleranceFromExact() {
|
||||
return 1E-9;
|
||||
}
|
||||
|
||||
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
|
||||
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
|
||||
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true),
|
||||
qualAsBytes(gcp, false));
|
||||
|
||||
return logL;
|
||||
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
|
||||
qualAsBytes(gcp, false, anchorIndel), 0, true);
|
||||
}
|
||||
|
||||
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
|
||||
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean doGOP) {
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
|
||||
final byte phredQuals[] = new byte[readBasesWithContext.length];
|
||||
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
|
||||
Arrays.fill(phredQuals, (byte)100);
|
||||
|
||||
// update just the bases corresponding to the provided micro read with the quality scores
|
||||
if( doGOP ) {
|
||||
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
|
||||
} else {
|
||||
for ( int i = 0; i < read.length(); i++)
|
||||
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
|
||||
}
|
||||
if( anchorIndel ) {
|
||||
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
|
||||
Arrays.fill(phredQuals, (byte)100);
|
||||
|
||||
return phredQuals;
|
||||
}
|
||||
}
|
||||
|
||||
final Random random = new Random(87865573);
|
||||
private class BandedLikelihoodTestProvider extends TestDataProvider {
|
||||
final String ref, read;
|
||||
final byte[] refBasesWithContext, readBasesWithContext;
|
||||
final int baseQual, insQual, delQual, gcp;
|
||||
final int expectedQual;
|
||||
final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
|
||||
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
|
||||
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
|
||||
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
|
||||
final byte[] baseQuals, insQuals, delQuals, gcps;
|
||||
|
||||
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
|
||||
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
|
||||
}
|
||||
|
||||
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
|
||||
super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
|
||||
this.baseQual = baseQual;
|
||||
this.delQual = delQual;
|
||||
this.insQual = insQual;
|
||||
this.gcp = gcp;
|
||||
this.read = read;
|
||||
this.ref = ref;
|
||||
this.expectedQual = expectedQual;
|
||||
|
||||
refBasesWithContext = asBytes(ref, left, right);
|
||||
readBasesWithContext = asBytes(read, false, false);
|
||||
baseQuals = qualAsBytes(baseQual);
|
||||
insQuals = qualAsBytes(insQual);
|
||||
delQuals = qualAsBytes(delQual);
|
||||
gcps = qualAsBytes(gcp, false);
|
||||
}
|
||||
|
||||
public double expectedLogL() {
|
||||
double logL = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
baseQuals, insQuals, delQuals, gcps);
|
||||
|
||||
return logL;
|
||||
}
|
||||
|
||||
public double tolerance() {
|
||||
return 0.2; // TODO FIXME arbitrary
|
||||
}
|
||||
|
||||
public double calcLogL() {
|
||||
|
||||
double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
|
||||
refBasesWithContext, readBasesWithContext,
|
||||
baseQuals, insQuals, delQuals, gcps);
|
||||
|
||||
return logL;
|
||||
}
|
||||
|
||||
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
|
||||
return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual) {
|
||||
return qualAsBytes(phredQual, true);
|
||||
}
|
||||
|
||||
private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
|
||||
final byte phredQuals[] = new byte[readBasesWithContext.length];
|
||||
Arrays.fill(phredQuals, (byte)phredQual);
|
||||
if(addRandom) {
|
||||
for( int iii = 0; iii < phredQuals.length; iii++) {
|
||||
phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
|
||||
// update just the bases corresponding to the provided micro read with the quality scores
|
||||
if( doGOP ) {
|
||||
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
|
||||
} else {
|
||||
for ( int i = 0; i < read.length(); i++)
|
||||
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
|
||||
}
|
||||
} else {
|
||||
Arrays.fill(phredQuals, (byte)phredQual);
|
||||
}
|
||||
|
||||
return phredQuals;
|
||||
}
|
||||
}
|
||||
|
|
@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
// test all combinations
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
|
||||
|
||||
for ( final int baseQual : baseQuals ) {
|
||||
for ( final int indelQual : indelQuals ) {
|
||||
|
|
@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
|
||||
for ( boolean insertionP : Arrays.asList(true, false)) {
|
||||
final String small = Utils.dupString((char)base, 1);
|
||||
final String big = Utils.dupString((char)base, size);
|
||||
final String big = Utils.dupString((char) base, size);
|
||||
|
||||
final String ref = insertionP ? small : big;
|
||||
final String read = insertionP ? big : small;
|
||||
|
|
@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double calculatedLogL = cfg.calcLogL();
|
||||
double expectedLogL = cfg.expectedLogL();
|
||||
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
|
||||
}
|
||||
|
||||
@DataProvider(name = "BandedLikelihoodTestProvider")
|
||||
public Object[][] makeBandedLikelihoodTests() {
|
||||
final Random random = new Random(87860573);
|
||||
@DataProvider(name = "OptimizedLikelihoodTestProvider")
|
||||
public Object[][] makeOptimizedLikelihoodTests() {
|
||||
// context on either side is ACGTTGCA REF ACGTTGCA
|
||||
// test all combinations
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
|
||||
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
|
||||
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
|
||||
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
|
||||
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
|
||||
|
||||
for ( final int baseQual : baseQuals ) {
|
||||
for ( final int indelQual : indelQuals ) {
|
||||
for ( final int gcp : gcps ) {
|
||||
|
||||
// test substitutions
|
||||
for ( final byte refBase : BaseUtils.BASES ) {
|
||||
for ( final byte readBase : BaseUtils.BASES ) {
|
||||
final String ref = new String(new byte[]{refBase});
|
||||
final String read = new String(new byte[]{readBase});
|
||||
final int expected = refBase == readBase ? 0 : baseQual;
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
|
||||
}
|
||||
}
|
||||
|
||||
// test insertions and deletions
|
||||
for ( final int size : sizes ) {
|
||||
for ( final byte base : BaseUtils.BASES ) {
|
||||
final int expected = indelQual + (size - 2) * gcp;
|
||||
|
||||
for ( boolean insertionP : Arrays.asList(true, false)) {
|
||||
final String small = Utils.dupString((char)base, 1);
|
||||
final String big = Utils.dupString((char)base, size);
|
||||
|
||||
final String ref = insertionP ? small : big;
|
||||
final String read = insertionP ? big : small;
|
||||
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
|
||||
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
|
||||
for ( final int refSize : sizes ) {
|
||||
for ( final int readSize : sizes ) {
|
||||
String ref = "";
|
||||
String read = "";
|
||||
for( int iii = 0; iii < refSize; iii++) {
|
||||
ref += (char) BaseUtils.BASES[random.nextInt(4)];
|
||||
}
|
||||
for( int iii = 0; iii < readSize; iii++) {
|
||||
read += (char) BaseUtils.BASES[random.nextInt(4)];
|
||||
}
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
|
||||
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class);
|
||||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true)
|
||||
public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) {
|
||||
double calculatedLogL = cfg.calcLogL();
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double exactLogL = cfg.calcLogL( exactHMM, true );
|
||||
double calculatedLogL = cfg.calcLogL( originalHMM, true );
|
||||
double optimizedLogL = cfg.calcLogL( cachingHMM, true );
|
||||
double loglessLogL = cfg.calcLogL( loglessHMM, true );
|
||||
double expectedLogL = cfg.expectedLogL();
|
||||
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
|
||||
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
|
||||
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
|
||||
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
|
||||
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
|
||||
}
|
||||
|
||||
@Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
|
||||
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
|
||||
double exactLogL = cfg.calcLogL( exactHMM, false );
|
||||
double calculatedLogL = cfg.calcLogL( originalHMM, false );
|
||||
double optimizedLogL = cfg.calcLogL( cachingHMM, false );
|
||||
double loglessLogL = cfg.calcLogL( loglessHMM, false );
|
||||
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
|
||||
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
|
||||
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
originalHMM.initialize(mread.length, haplotype1.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp);
|
||||
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
|
|
@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
|
|||
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
|
||||
// change single base at position k to C. If it's a C, change to T
|
||||
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
|
||||
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
|
||||
originalHMM.initialize(mread.length, haplotype1.length);
|
||||
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
|
||||
haplotype1, mread,
|
||||
quals, gop, gop,
|
||||
gcp);
|
||||
|
||||
gcp, 0, false);
|
||||
|
||||
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ public @interface Argument {
|
|||
* --help argument is specified.
|
||||
* @return Doc string associated with this command-line argument.
|
||||
*/
|
||||
String doc();
|
||||
String doc() default "Undocumented option";
|
||||
|
||||
/**
|
||||
* Is this argument required. If true, the command-line argument system will
|
||||
|
|
|
|||
|
|
@ -532,7 +532,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
Object[] vals = type.getEnumConstants();
|
||||
Object defaultEnumeration = null; // as we look at options, record the default option if it exists
|
||||
for (Object val : vals) {
|
||||
if (String.valueOf(val).equalsIgnoreCase(value.asString())) return val;
|
||||
if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val;
|
||||
try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
|
||||
catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); }
|
||||
}
|
||||
|
|
@ -546,10 +546,10 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
else
|
||||
throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
|
||||
} else if (type.equals(File.class)) {
|
||||
result = value.asFile();
|
||||
result = value == null ? null : value.asFile();
|
||||
} else {
|
||||
Constructor ctor = type.getConstructor(String.class);
|
||||
result = ctor.newInstance(value.asString());
|
||||
result = ctor.newInstance(value == null ? null : value.asString());
|
||||
}
|
||||
} catch (UserException e) {
|
||||
throw e;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments;
|
|||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -54,22 +55,60 @@ public class StandardCallerArgumentCollection {
|
|||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*
|
||||
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
public int MAX_ALTERNATE_ALLELES = 6;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*
|
||||
* This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
|
||||
@Deprecated
|
||||
@Hidden
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1;
|
||||
|
||||
/**
|
||||
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
|
||||
* will try to remove (N * contamination fraction) bases for each alternate allele.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
|
||||
public double CONTAMINATION_PERCENTAGE = 0.0;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
|
||||
public StandardCallerArgumentCollection() { }
|
||||
|
||||
/**
 * Copy constructor: takes over every argument value held by another collection.
 *
 * Developers must remember to add any newly added arguments to the list here
 * as well otherwise they won't get changed from their default value!
 *
 * @param SCAC the collection whose argument values are copied
 */
public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
    // model selection and modes
    this.AFmodel = SCAC.AFmodel;
    this.GenotypingMode = SCAC.GenotypingMode;
    this.OutputMode = SCAC.OutputMode;

    // inputs and logging
    this.alleles = SCAC.alleles;
    this.exactCallsLog = SCAC.exactCallsLog;

    // numeric tuning parameters
    this.heterozygosity = SCAC.heterozygosity;
    this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE;
    this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
    this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;

    // limits on the number of alternate alleles
    this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
    this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
}
|
||||
|
||||
/**
|
||||
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,8 +74,6 @@ import java.util.*;
|
|||
*
|
||||
*/
|
||||
public abstract class MicroScheduler implements MicroSchedulerMBean {
|
||||
// TODO -- remove me and retire non nano scheduled versions of traversals
|
||||
private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true;
|
||||
protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
|
||||
|
||||
/**
|
||||
|
|
@ -157,18 +155,22 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
|
||||
if ( ! (walker instanceof TreeReducible) ) {
|
||||
throw badNT("nt", engine, walker);
|
||||
} else {
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
|
||||
}
|
||||
}
|
||||
|
||||
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) {
|
||||
throw badNT("nct", engine, walker);
|
||||
}
|
||||
|
||||
if ( threadAllocation.getNumDataThreads() > 1 ) {
|
||||
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
|
||||
} else {
|
||||
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) )
|
||||
throw badNT("nct", engine, walker);
|
||||
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
|
||||
}
|
||||
}
|
||||
|
||||
private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
|
||||
throw new UserException.BadArgumentValue("nt",
|
||||
throw new UserException.BadArgumentValue(parallelArg,
|
||||
String.format("The analysis %s currently does not support parallel execution with %s. " +
|
||||
"Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
|
||||
}
|
||||
|
|
@ -234,15 +236,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
|
|||
@Ensures("result != null")
|
||||
private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) {
|
||||
if (walker instanceof ReadWalker) {
|
||||
if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 )
|
||||
return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread());
|
||||
else
|
||||
return new TraverseReads();
|
||||
return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread());
|
||||
} else if (walker instanceof LocusWalker) {
|
||||
if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 )
|
||||
return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread());
|
||||
else
|
||||
return new TraverseLociLinear();
|
||||
return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread());
|
||||
} else if (walker instanceof DuplicateWalker) {
|
||||
return new TraverseDuplicates();
|
||||
} else if (walker instanceof ReadPairWalker) {
|
||||
|
|
|
|||
|
|
@ -123,13 +123,13 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
|
||||
// This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
|
||||
// therefore, the user must have failed to specify a type default
|
||||
if(writerFileName.asFile() == null && generateMD5)
|
||||
if(writerFileName != null && writerFileName.asFile() == null && generateMD5)
|
||||
throw new ArgumentException("MD5 generation specified, but no output file specified. If md5 generation is desired, please specify a BAM output file and an md5 file will be written alongside.");
|
||||
|
||||
// Create the stub and set parameters.
|
||||
SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream);
|
||||
|
||||
if ( writerFileName.asFile() != null ) {
|
||||
if (writerFileName != null && writerFileName.asFile() != null ) {
|
||||
stub = new SAMFileWriterStub(engine, writerFileName.asFile());
|
||||
|
||||
if ( compressionLevel != null )
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
? new VariantContextWriterStub(engine, writerFile, argumentSources)
|
||||
: new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
|
||||
|
||||
stub.setCompressed(isCompressed(writerFileName.asString()));
|
||||
stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString()));
|
||||
stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches));
|
||||
stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches));
|
||||
stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches));
|
||||
|
|
|
|||
|
|
@ -104,10 +104,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
prevLoc = location;
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
|
|
|
|||
|
|
@ -1,103 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public abstract class TraverseLociBase<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
@Override
|
||||
public final String getTraversalUnits() {
|
||||
return "sites";
|
||||
}
|
||||
|
||||
protected static class TraverseResults<T> {
|
||||
final int numIterations;
|
||||
final T reduceResult;
|
||||
|
||||
public TraverseResults(int numIterations, T reduceResult) {
|
||||
this.numIterations = numIterations;
|
||||
this.reduceResult = reduceResult;
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract TraverseResults<T> traverse( final LocusWalker<M,T> walker,
|
||||
final LocusView locusView,
|
||||
final LocusReferenceView referenceView,
|
||||
final ReferenceOrderedView referenceOrderedDataView,
|
||||
final T sum);
|
||||
|
||||
@Override
|
||||
public T traverse( LocusWalker<M,T> walker,
|
||||
LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
|
||||
sum = result.reduceResult;
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
}
|
||||
|
||||
// We have a final map call to execute here to clean up the skipped based from the
|
||||
// last position in the ROD to that in the interval
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
|
||||
// only do this if the walker isn't done!
|
||||
final RodLocusView rodLocusView = (RodLocusView)locusView;
|
||||
final long nSkipped = rodLocusView.getLastSkippedBases();
|
||||
if ( nSkipped > 0 ) {
|
||||
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
|
||||
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
|
||||
final M x = walker.map(null, null, ac);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
|
||||
* of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype
|
||||
* that comes along.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data which which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLociLinear<M,T> extends TraverseLociBase<M,T> {
|
||||
|
||||
@Override
|
||||
protected TraverseResults<T> traverse(LocusWalker<M, T> walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) {
|
||||
// We keep processing while the next reference location is within the interval
|
||||
boolean done = false;
|
||||
int numIterations = 0;
|
||||
|
||||
while( locusView.hasNext() && ! done ) {
|
||||
numIterations++;
|
||||
final AlignmentContext locus = locusView.next();
|
||||
final GenomeLoc location = locus.getLocation();
|
||||
|
||||
// create reference context. Note that if we have a pileup of "extended events", the context will
|
||||
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
|
||||
final ReferenceContext refContext = referenceView.getReferenceContext(location);
|
||||
|
||||
// Iterate forward to get all reference ordered data covering this location
|
||||
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
|
||||
|
||||
final boolean keepMeP = walker.filter(tracker, refContext, locus);
|
||||
if (keepMeP) {
|
||||
final M x = walker.map(tracker, refContext, locus);
|
||||
sum = walker.reduce(x, sum);
|
||||
done = walker.isDone();
|
||||
}
|
||||
|
||||
printProgress(locus.getLocation());
|
||||
}
|
||||
|
||||
return new TraverseResults<T>(numIterations, sum);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,24 +1,26 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
|
||||
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* A simple solution to iterating over all reference positions over a series of genomic locations.
|
||||
*/
|
||||
public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
|
||||
public class TraverseLociNano<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
|
|
@ -30,6 +32,81 @@ public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final String getTraversalUnits() {
|
||||
return "sites";
|
||||
}
|
||||
|
||||
protected static class TraverseResults<T> {
|
||||
final int numIterations;
|
||||
final T reduceResult;
|
||||
|
||||
public TraverseResults(int numIterations, T reduceResult) {
|
||||
this.numIterations = numIterations;
|
||||
this.reduceResult = reduceResult;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T traverse( LocusWalker<M,T> walker,
|
||||
LocusShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
|
||||
|
||||
final LocusView locusView = getLocusView( walker, dataProvider );
|
||||
|
||||
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
|
||||
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
|
||||
ReferenceOrderedView referenceOrderedDataView = null;
|
||||
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
|
||||
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
|
||||
else
|
||||
referenceOrderedDataView = (RodLocusView)locusView;
|
||||
|
||||
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
|
||||
|
||||
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
|
||||
sum = result.reduceResult;
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
}
|
||||
|
||||
// We have a final map call to execute here to clean up the skipped based from the
|
||||
// last position in the ROD to that in the interval
|
||||
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
|
||||
// only do this if the walker isn't done!
|
||||
final RodLocusView rodLocusView = (RodLocusView)locusView;
|
||||
final long nSkipped = rodLocusView.getLastSkippedBases();
|
||||
if ( nSkipped > 0 ) {
|
||||
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
|
||||
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
|
||||
final M x = walker.map(null, null, ac);
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
|
||||
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
|
||||
* that comes along.
|
||||
* @param walker walker to interrogate.
|
||||
* @param dataProvider Data which which to drive the locus view.
|
||||
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
|
||||
*/
|
||||
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
|
||||
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
|
||||
if( dataSource == DataSource.READS )
|
||||
return new CoveredLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
|
||||
return new AllLocusView(dataProvider);
|
||||
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
|
||||
return new RodLocusView(dataProvider);
|
||||
else
|
||||
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
|
||||
}
|
||||
|
||||
protected TraverseResults<T> traverse(final LocusWalker<M, T> walker,
|
||||
final LocusView locusView,
|
||||
final LocusReferenceView referenceView,
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
|
|||
public T traverse(ReadPairWalker<M, T> walker,
|
||||
ReadShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider));
|
||||
logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider));
|
||||
|
||||
if( !dataProvider.hasReads() )
|
||||
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
|
||||
|
|
|
|||
|
|
@ -1,111 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* @version 1.0
|
||||
* @date Apr 24, 2009
|
||||
* <p/>
|
||||
* Class TraverseReads
|
||||
* <p/>
|
||||
* This class handles traversing by reads in the new shardable style
|
||||
*/
|
||||
public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static final Logger logger = Logger.getLogger(TraverseReads.class);
|
||||
|
||||
@Override
|
||||
public String getTraversalUnits() {
|
||||
return "reads";
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse by reads, given the data and the walker
|
||||
*
|
||||
* @param walker the walker to traverse with
|
||||
* @param dataProvider the provider of the reads data
|
||||
* @param sum the value of type T, specified by the walker, to feed to the walkers reduce function
|
||||
* @return the reduce variable of the read walker
|
||||
*/
|
||||
public T traverse(ReadWalker<M,T> walker,
|
||||
ReadShardDataProvider dataProvider,
|
||||
T sum) {
|
||||
|
||||
logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider));
|
||||
|
||||
if( !dataProvider.hasReads() )
|
||||
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
|
||||
|
||||
final ReadView reads = new ReadView(dataProvider);
|
||||
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
|
||||
|
||||
// get the reference ordered data
|
||||
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
|
||||
|
||||
boolean done = walker.isDone();
|
||||
// while we still have more reads
|
||||
for (final SAMRecord read : reads) {
|
||||
if ( done ) break;
|
||||
|
||||
// ReferenceContext -- the reference bases covered by the read
|
||||
final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference()
|
||||
? reference.getReferenceContext(read)
|
||||
: null;
|
||||
|
||||
// update the number of reads we've seen
|
||||
dataProvider.getShard().getReadMetrics().incrementNumIterations();
|
||||
|
||||
// if the read is mapped, create a metadata tracker
|
||||
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null;
|
||||
|
||||
final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read);
|
||||
if (keepMeP) {
|
||||
M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null
|
||||
sum = walker.reduce(x, sum);
|
||||
}
|
||||
|
||||
final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
|
||||
|
||||
updateCumulativeMetrics(dataProvider.getShard());
|
||||
printProgress(locus);
|
||||
|
||||
done = walker.isDone();
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,7 +48,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
private Map<String, Object> calculateIC(final VariantContext vc) {
|
||||
final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds);
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant())
|
||||
return null;
|
||||
|
||||
int idxAA = 0, idxAB = 1, idxBB = 2;
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
|
||||
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
|
||||
|
|
@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
int count = 0;
|
||||
for (PileupElement p : pileup) {
|
||||
if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase()))
|
||||
count++;
|
||||
count += p.getRepresentativeCount();
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
|||
|
|
@ -113,23 +113,20 @@ public class PerReadAlleleLikelihoodMap {
|
|||
return likelihoodReadMap.get(p.getRead());
|
||||
}
|
||||
|
||||
public static Allele getMostLikelyAllele(Map<Allele,Double> alleleMap) {
|
||||
double minLike = Double.POSITIVE_INFINITY, maxLike = Double.NEGATIVE_INFINITY;
|
||||
public static Allele getMostLikelyAllele( final Map<Allele,Double> alleleMap ) {
|
||||
double maxLike = Double.NEGATIVE_INFINITY;
|
||||
double prevMaxLike = Double.NEGATIVE_INFINITY;
|
||||
Allele mostLikelyAllele = Allele.NO_CALL;
|
||||
|
||||
for (Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
|
||||
for (final Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
|
||||
if (el.getValue() > maxLike) {
|
||||
prevMaxLike = maxLike;
|
||||
maxLike = el.getValue();
|
||||
mostLikelyAllele = el.getKey();
|
||||
} else if( el.getValue() > prevMaxLike ) {
|
||||
prevMaxLike = el.getValue();
|
||||
}
|
||||
|
||||
if (el.getValue() < minLike)
|
||||
minLike = el.getValue();
|
||||
|
||||
}
|
||||
if (maxLike-minLike > INDEL_LIKELIHOOD_THRESH)
|
||||
return mostLikelyAllele;
|
||||
else
|
||||
return Allele.NO_CALL;
|
||||
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
|
||||
private final boolean useAlleleFromVCF;
|
||||
|
||||
private final double[] likelihoodSums = new double[4];
|
||||
|
||||
private final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
|
||||
|
||||
/**
 * Creates the SNP likelihoods model.
 *
 * Records whether the arguments request GENOTYPE_GIVEN_ALLELES mode and gives every
 * slot of the per-allele stratification buffer its own empty list.
 *
 * @param UAC    the argument collection to read settings from
 * @param logger the logger handed to the superclass
 */
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
    super(UAC, logger);
    useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;

    // one reusable list per slot of the stratification buffer
    for ( int slot = 0; slot < alleleStratifiedElements.length; slot++ ) {
        alleleStratifiedElements[slot] = new ArrayList<PileupElement>();
    }
}
|
||||
|
||||
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
|
||||
|
|
@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
|
||||
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
|
||||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 )
|
||||
pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE);
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup( pileup );
|
||||
pileup = createBAQedPileup(pileup);
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error);
|
||||
|
|
@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
|
||||
// create the genotypes; no-call everyone for now
|
||||
final GenotypesContext genotypes = GenotypesContext.create();
|
||||
final List<Allele> noCall = new ArrayList<Allele>();
|
||||
noCall.add(Allele.NO_CALL);
|
||||
|
||||
for ( SampleGenotypeData sampleData : GLs ) {
|
||||
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
|
||||
|
|
@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
return allelesToUse;
|
||||
}
|
||||
|
||||
public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) {
|
||||
// special case removal of all reads
|
||||
if ( contaminationPercentage >= 1.0 )
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
for( final PileupElement pe : pileup ) {
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
|
||||
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
|
||||
int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage);
|
||||
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
|
||||
@Override
|
||||
public int compare(PileupElement element1, PileupElement element2) {
|
||||
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
|
||||
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
|
||||
}
|
||||
});
|
||||
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
|
||||
if ( alleleList.size() > numReadsToRemove )
|
||||
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove));
|
||||
}
|
||||
|
||||
// clean up pointers so memory can be garbage collected if needed
|
||||
for ( int i = 0; i < 4; i++ )
|
||||
alleleStratifiedElements[i].clear();
|
||||
|
||||
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
|
||||
}
|
||||
|
||||
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
|
||||
final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
|
||||
for( final PileupElement PE : pileup ) {
|
||||
|
|
@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
|
||||
}
|
||||
|
||||
private List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove) {
|
||||
final int pileupSize = elements.size();
|
||||
final BitSet itemsToRemove = new BitSet(pileupSize);
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
|
||||
itemsToRemove.set(selectedIndex);
|
||||
}
|
||||
|
||||
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
|
||||
for ( int i = 0; i < pileupSize; i++ ) {
|
||||
if ( !itemsToRemove.get(i) )
|
||||
elementsToKeep.add(elements.get(i));
|
||||
}
|
||||
|
||||
return elementsToKeep;
|
||||
}
|
||||
|
||||
private static class SampleGenotypeData {
|
||||
|
||||
public final String name;
|
||||
|
|
|
|||
|
|
@ -27,23 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
|
||||
|
||||
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
|
||||
|
||||
/**
|
||||
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT;
|
||||
|
||||
/**
|
||||
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
|
||||
* distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it
|
||||
|
|
@ -65,6 +57,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
|
||||
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
|
||||
|
||||
/**
|
||||
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
|
||||
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
|
||||
|
|
@ -112,10 +110,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
|
||||
public int INDEL_HAPLOTYPE_SIZE = 80;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
|
||||
public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
|
|
@ -183,63 +177,57 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
|
||||
/**
|
||||
* Create a new UAC with defaults for all UAC arguments
|
||||
*/
|
||||
public UnifiedArgumentCollection() {
|
||||
super();
|
||||
}
|
||||
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.AFmodel = AFmodel;
|
||||
uac.heterozygosity = heterozygosity;
|
||||
uac.PCR_error = PCR_error;
|
||||
uac.GenotypingMode = GenotypingMode;
|
||||
uac.OutputMode = OutputMode;
|
||||
uac.NO_SLOD = NO_SLOD;
|
||||
uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
|
||||
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
|
||||
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
|
||||
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE;
|
||||
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
|
||||
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
|
||||
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
|
||||
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
|
||||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
uac.referenceSampleName = referenceSampleName;
|
||||
uac.samplePloidy = samplePloidy;
|
||||
uac.maxQualityScore = minQualityScore;
|
||||
uac.phredScaledPrior = phredScaledPrior;
|
||||
uac.minPower = minPower;
|
||||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
uac.exactCallsLog = exactCallsLog;
|
||||
/**
|
||||
* Create a new UAC based only on the information in our super-class scac, with defaults for all UAC arguments
|
||||
* @param scac
|
||||
*/
|
||||
public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) {
|
||||
super(scac);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new UAC with all parameters having the values in uac
|
||||
*
|
||||
* @param uac
|
||||
*/
|
||||
public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) {
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
super(uac);
|
||||
|
||||
this.GLmodel = uac.GLmodel;
|
||||
this.AFmodel = uac.AFmodel;
|
||||
this.PCR_error = uac.PCR_error;
|
||||
this.NO_SLOD = uac.NO_SLOD;
|
||||
this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
|
||||
this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE;
|
||||
this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION;
|
||||
this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE;
|
||||
this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY;
|
||||
this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY;
|
||||
this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY;
|
||||
this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO;
|
||||
this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE;
|
||||
this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
this.referenceSampleRod = uac.referenceSampleRod;
|
||||
this.referenceSampleName = uac.referenceSampleName;
|
||||
this.samplePloidy = uac.samplePloidy;
|
||||
this.maxQualityScore = uac.minQualityScore;
|
||||
this.phredScaledPrior = uac.phredScaledPrior;
|
||||
this.minPower = uac.minPower;
|
||||
this.minReferenceDepth = uac.minReferenceDepth;
|
||||
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
|
||||
this.pairHMM = uac.pairHMM;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
|
||||
return uac;
|
||||
}
|
||||
|
||||
public UnifiedArgumentCollection() { }
|
||||
|
||||
public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) {
|
||||
super();
|
||||
this.alleles = SCAC.alleles;
|
||||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,30 +45,46 @@ public abstract class AFCalc implements Cloneable {
|
|||
|
||||
protected final int nSamples;
|
||||
protected final int maxAlternateAllelesToGenotype;
|
||||
protected final int maxAlternateAllelesForIndels;
|
||||
|
||||
protected Logger logger = defaultLogger;
|
||||
|
||||
private SimpleTimer callTimer = new SimpleTimer();
|
||||
private final AFCalcResultTracker resultTracker;
|
||||
private final StateTracker stateTracker;
|
||||
private ExactCallLogger exactCallLogger = null;
|
||||
|
||||
protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
/**
|
||||
* Create a new AFCalc object capable of calculating the prob. that alleles are
|
||||
* segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels
|
||||
* for indels for samples with ploidy
|
||||
*
|
||||
* @param nSamples number of samples, must be > 0
|
||||
* @param maxAltAlleles maxAltAlleles for SNPs
|
||||
* @param ploidy the ploidy, must be > 0
|
||||
*/
|
||||
protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
|
||||
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
|
||||
if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy);
|
||||
|
||||
this.nSamples = nSamples;
|
||||
this.maxAlternateAllelesToGenotype = maxAltAlleles;
|
||||
this.maxAlternateAllelesForIndels = maxAltAllelesForIndels;
|
||||
this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
|
||||
this.stateTracker = new StateTracker(maxAltAlleles);
|
||||
}
|
||||
|
||||
/**
|
||||
* Enable exact call logging to file
|
||||
*
|
||||
* @param exactCallsLog the destination file
|
||||
*/
|
||||
public void enableProcessLog(final File exactCallsLog) {
|
||||
exactCallLogger = new ExactCallLogger(exactCallsLog);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this logger instead of the default logger
|
||||
*
|
||||
* @param logger
|
||||
*/
|
||||
public void setLogger(Logger logger) {
|
||||
this.logger = logger;
|
||||
}
|
||||
|
|
@ -83,10 +99,10 @@ public abstract class AFCalc implements Cloneable {
|
|||
public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
|
||||
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
|
||||
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
|
||||
if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
|
||||
if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
|
||||
|
||||
// reset the result, so we can store our new result there
|
||||
resultTracker.reset();
|
||||
stateTracker.reset();
|
||||
|
||||
final VariantContext vcWorking = reduceScope(vc);
|
||||
|
||||
|
|
@ -100,10 +116,20 @@ public abstract class AFCalc implements Cloneable {
|
|||
return result;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
|
||||
resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
|
||||
return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors);
|
||||
/**
|
||||
* Convert the final state of the state tracker into our result as an AFCalcResult
|
||||
*
|
||||
* Assumes that stateTracker has been updated accordingly
|
||||
*
|
||||
* @param vcWorking the VariantContext we actually used as input to the calc model (after reduction)
|
||||
* @param log10AlleleFrequencyPriors the priors by AC vector
|
||||
* @return a AFCalcResult describing the result of this calculation
|
||||
*/
|
||||
@Requires("stateTracker.getnEvaluations() >= 0")
|
||||
@Ensures("result != null")
|
||||
protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
|
||||
stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
|
||||
return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
|
@ -134,11 +160,13 @@ public abstract class AFCalc implements Cloneable {
|
|||
* @param log10AlleleFrequencyPriors priors
|
||||
* @return a AFCalcResult object describing the results of this calculation
|
||||
*/
|
||||
// TODO -- add consistent requires among args
|
||||
@Requires({"vc != null", "log10AlleleFrequencyPriors != null"})
|
||||
protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors);
|
||||
|
||||
/**
|
||||
* Subset VC to the just allelesToUse, updating genotype likelihoods
|
||||
*
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
|
|
@ -159,11 +187,11 @@ public abstract class AFCalc implements Cloneable {
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
public int getMaxAltAlleles() {
|
||||
return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels);
|
||||
return maxAlternateAllelesToGenotype;
|
||||
}
|
||||
|
||||
public AFCalcResultTracker getResultTracker() {
|
||||
return resultTracker;
|
||||
protected StateTracker getStateTracker() {
|
||||
return stateTracker;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -24,15 +24,12 @@ public class AFCalcFactory {
|
|||
* the needs of the request (i.e., considering ploidy).
|
||||
*/
|
||||
public enum Calculation {
|
||||
/** The default implementation */
|
||||
EXACT(ReferenceDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** reference implementation of multi-allelic EXACT model */
|
||||
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** expt. implementation -- for testing only */
|
||||
EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */
|
||||
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
|
||||
|
||||
/** original biallelic exact model, for testing only */
|
||||
EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),
|
||||
|
||||
|
|
@ -60,6 +57,8 @@ public class AFCalcFactory {
|
|||
return (requiredPloidy == -1 || requiredPloidy == requestedPloidy)
|
||||
&& (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles);
|
||||
}
|
||||
|
||||
public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; }
|
||||
}
|
||||
|
||||
private static final Map<String, Class<? extends AFCalc>> afClasses;
|
||||
|
|
@ -92,7 +91,7 @@ public class AFCalcFactory {
|
|||
public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC,
|
||||
final int nSamples,
|
||||
final Logger logger) {
|
||||
final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS);
|
||||
final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES;
|
||||
if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) {
|
||||
logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");
|
||||
final List<Calculation> supportingCalculations = new LinkedList<Calculation>();
|
||||
|
|
@ -110,7 +109,7 @@ public class AFCalcFactory {
|
|||
logger.info("Selecting model " + UAC.AFmodel);
|
||||
}
|
||||
|
||||
final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy);
|
||||
final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy);
|
||||
|
||||
if ( logger != null ) calc.setLogger(logger);
|
||||
if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);
|
||||
|
|
@ -127,7 +126,7 @@ public class AFCalcFactory {
|
|||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final int nSamples) {
|
||||
return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2);
|
||||
return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -140,7 +139,7 @@ public class AFCalcFactory {
|
|||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) {
|
||||
return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2);
|
||||
return createAFCalc(calc, nSamples, maxAltAlleles, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -148,14 +147,12 @@ public class AFCalcFactory {
|
|||
*
|
||||
* @param nSamples the number of samples we'll be using
|
||||
* @param maxAltAlleles the max. alt alleles to consider for SNPs
|
||||
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
|
||||
* @param ploidy the sample ploidy. Must be consistent with the calc
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
|
||||
return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -182,20 +179,17 @@ public class AFCalcFactory {
|
|||
* @param calc the calculation to use
|
||||
* @param nSamples the number of samples we'll be using
|
||||
* @param maxAltAlleles the max. alt alleles to consider for SNPs
|
||||
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
|
||||
* @param ploidy the sample ploidy. Must be consistent with the calc
|
||||
*
|
||||
* @return an initialized AFCalc
|
||||
*/
|
||||
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null");
|
||||
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
|
||||
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
|
||||
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
|
||||
if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy);
|
||||
|
||||
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
|
||||
if ( ! calc.usableForParams(ploidy, maxAlt) )
|
||||
if ( ! calc.usableForParams(ploidy, maxAltAlleles) )
|
||||
throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy);
|
||||
|
||||
final Class<? extends AFCalc> afClass = getClassByName(calc.className);
|
||||
|
|
@ -203,19 +197,19 @@ public class AFCalcFactory {
|
|||
throw new IllegalArgumentException("Unexpected AFCalc " + calc);
|
||||
|
||||
try {
|
||||
Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy};
|
||||
Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class);
|
||||
Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy};
|
||||
Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class);
|
||||
return (AFCalc)c.newInstance(args);
|
||||
} catch (Exception e) {
|
||||
throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e);
|
||||
}
|
||||
}
|
||||
|
||||
protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
final List<AFCalc> AFCalcs = new LinkedList<AFCalc>();
|
||||
|
||||
for ( final Calculation calc : calcs )
|
||||
AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy));
|
||||
AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy));
|
||||
|
||||
return AFCalcs;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,8 +83,8 @@ public class AFCalcResult {
|
|||
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
|
||||
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
|
||||
if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
|
||||
if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
|
||||
if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
|
||||
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
|
||||
|
||||
this.alleleCountsOfMLE = alleleCountsOfMLE;
|
||||
this.nEvaluations = nEvaluations;
|
||||
|
|
@ -147,7 +147,7 @@ public class AFCalcResult {
|
|||
* Due to computational / implementation constraints this may be smaller than
|
||||
* the actual list of alleles requested
|
||||
*
|
||||
* @return a non-empty list of alleles used during genotyping
|
||||
* @return a non-empty list of alleles used during genotyping, the first of which is the reference allele
|
||||
*/
|
||||
@Ensures({"result != null", "! result.isEmpty()"})
|
||||
public List<Allele> getAllelesUsedInGenotyping() {
|
||||
|
|
@ -159,7 +159,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10PosteriorOfAFEq0() {
|
||||
return log10PosteriorsOfAC[AF0];
|
||||
}
|
||||
|
|
@ -169,7 +169,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10PosteriorOfAFGT0() {
|
||||
return log10PosteriorsOfAC[AF1p];
|
||||
}
|
||||
|
|
@ -179,7 +179,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10LikelihoodOfAFEq0() {
|
||||
return log10LikelihoodsOfAC[AF0];
|
||||
}
|
||||
|
|
@ -189,7 +189,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10LikelihoodOfAFGT0() {
|
||||
return log10LikelihoodsOfAC[AF1p];
|
||||
}
|
||||
|
|
@ -199,7 +199,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10PriorOfAFEq0() {
|
||||
return log10PriorsOfAC[AF0];
|
||||
}
|
||||
|
|
@ -209,7 +209,7 @@ public class AFCalcResult {
|
|||
*
|
||||
* @return
|
||||
*/
|
||||
@Ensures({"goodLog10Probability(result)"})
|
||||
@Ensures({"MathUtils.goodLog10Probability(result)"})
|
||||
public double getLog10PriorOfAFGT0() {
|
||||
return log10PriorsOfAC[AF1p];
|
||||
}
|
||||
|
|
@ -263,7 +263,7 @@ public class AFCalcResult {
|
|||
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
|
||||
* @return the log10 probability that allele is segregating at this site
|
||||
*/
|
||||
@Ensures("goodLog10Probability(result)")
|
||||
@Ensures("MathUtils.goodLog10Probability(result)")
|
||||
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
|
||||
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
|
||||
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
|
||||
|
|
@ -279,7 +279,7 @@ public class AFCalcResult {
|
|||
* @return freshly allocated log10 normalized posteriors vector
|
||||
*/
|
||||
@Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length")
|
||||
@Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
|
||||
@Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
|
||||
private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) {
|
||||
final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length];
|
||||
for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ )
|
||||
|
|
@ -287,29 +287,6 @@ public class AFCalcResult {
|
|||
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the log10 prob vector vector is well formed
|
||||
*
|
||||
* @param vector
|
||||
* @param expectedSize
|
||||
* @param shouldSumToOne
|
||||
*
|
||||
* @return true if vector is well-formed, false otherwise
|
||||
*/
|
||||
private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
|
||||
if ( vector.length != expectedSize ) return false;
|
||||
|
||||
for ( final double pr : vector ) {
|
||||
if ( ! goodLog10Probability(pr) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 )
|
||||
return false;
|
||||
|
||||
return true; // everything is good
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the offset into linear vectors indexed by alt allele for allele
|
||||
*
|
||||
|
|
@ -331,14 +308,4 @@ public class AFCalcResult {
|
|||
else
|
||||
return index - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that the result is a well-formed log10 probability
|
||||
*
|
||||
* @param result a supposedly well-formed log10 probability value
|
||||
* @return true if result is really well formed
|
||||
*/
|
||||
private static boolean goodLog10Probability(final double result) {
|
||||
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,256 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*
|
||||
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
|
||||
*/
|
||||
class AFCalcResultTracker {
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
|
||||
protected double log10MLE;
|
||||
protected double log10MAP;
|
||||
private final int[] alleleCountsOfMLE;
|
||||
private final int[] alleleCountsOfMAP;
|
||||
|
||||
// The posteriors seen, not including that of AF=0
|
||||
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
|
||||
private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE];
|
||||
private int currentLikelihoodsCacheIndex = 0;
|
||||
protected Double log10LikelihoodsMatrixSum = null;
|
||||
|
||||
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
|
||||
private double log10LikelihoodOfAFzero;
|
||||
private double log10PosteriorOfAFzero;
|
||||
private int[] AClimits;
|
||||
|
||||
int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
 * Create a results object capable of storing results for calls with up to maxAltAlleles
 *
 * Allocates the MLE and MAP allele count vectors with maxAltAlleles slots each and then
 * calls reset().
 *
 * @param maxAltAlleles an integer >= 1
 * @throws IllegalArgumentException if maxAltAlleles is less than 1
 */
public AFCalcResultTracker(final int maxAltAlleles) {
    // the bound enforced here is 1, so the message reports ">= 1" (it previously claimed ">= 0")
    if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);

    alleleCountsOfMLE = new int[maxAltAlleles];
    alleleCountsOfMAP = new int[maxAltAlleles];

    reset();
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*
|
||||
* @return a vector with allele counts, not all of which may be meaningful
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMLE() {
|
||||
return alleleCountsOfMLE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MAP
|
||||
*
|
||||
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
|
||||
*
|
||||
* @return a non-null vector of ints
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihoods summed across all AC values for AC > 0
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFNotZero() {
|
||||
if ( log10LikelihoodsMatrixSum == null ) {
|
||||
if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
|
||||
log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO;
|
||||
else
|
||||
log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
|
||||
}
|
||||
return log10LikelihoodsMatrixSum;
|
||||
}
|
||||
|
||||
public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
|
||||
return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public double getLog10PosteriorOfAFzero() {
|
||||
return log10PosteriorOfAFzero;
|
||||
}
|
||||
|
||||
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
|
||||
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
|
||||
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
|
||||
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
|
||||
|
||||
// TODO -- replace with more meaningful computation
|
||||
// TODO -- refactor this calculation into the ref calculation
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
|
||||
for ( int i = 0; i < subACOfMLE.length; i++ ) {
|
||||
final Allele allele = allelesUsedInGenotyping.get(i+1);
|
||||
final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
|
||||
log10pNonRefByAllele.put(allele, log10PNonRef);
|
||||
}
|
||||
|
||||
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Protected mutational methods only for use within the calculation models themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
|
||||
*
|
||||
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
|
||||
*/
|
||||
protected void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED;
|
||||
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
|
||||
alleleCountsOfMLE[i] = 0;
|
||||
alleleCountsOfMAP[i] = 0;
|
||||
}
|
||||
currentLikelihoodsCacheIndex = 0;
|
||||
log10LikelihoodsMatrixSum = null;
|
||||
allelesUsedInGenotyping = null;
|
||||
nEvaluations = 0;
|
||||
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tell this result we used one more evaluation cycle
|
||||
*/
|
||||
protected void incNEvaluations() {
|
||||
nEvaluations++;
|
||||
}
|
||||
|
||||
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToLikelihoodsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMLE[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
if ( log10LofK > log10MAP ) {
|
||||
log10MAP = log10LofK;
|
||||
for ( int i = 0; i < alleleCountsForK.length; i++ )
|
||||
alleleCountsOfMAP[i] = alleleCountsForK[i];
|
||||
}
|
||||
}
|
||||
|
||||
private void addToLikelihoodsCache(final double log10LofK) {
|
||||
// add to the cache
|
||||
log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK;
|
||||
|
||||
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
|
||||
if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
|
||||
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
|
||||
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
|
||||
log10LikelihoodsMatrixValues[0] = temporarySum;
|
||||
currentLikelihoodsCacheIndex = 1;
|
||||
}
|
||||
}
|
||||
|
||||
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMLE, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
|
||||
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
|
||||
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
|
||||
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
}
|
||||
|
||||
protected void setAClimits(int[] AClimits) {
|
||||
this.AClimits = AClimits;
|
||||
}
|
||||
}
|
||||
|
|
@ -31,15 +31,11 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
|||
import java.util.*;
|
||||
|
||||
public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
||||
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
|
||||
}
|
||||
|
||||
protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
|
||||
return new StateTracker();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
|
|
@ -60,43 +56,33 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker());
|
||||
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
getResultTracker().incNEvaluations(); // keep track of the number of evaluations
|
||||
getStateTracker().incNEvaluations(); // keep track of the number of evaluations
|
||||
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
|
||||
if ( stateTracker.withinMaxACs(set.getACcounts()) ) {
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker());
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
stateTracker.update(log10LofKs, set.getACcounts());
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.getACcounts());
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
// clean up memory
|
||||
indexesToACset.remove(set.getACcounts());
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
|
||||
}
|
||||
|
||||
return resultFromTracker(vc, log10AlleleFrequencyPriors);
|
||||
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected VariantContext reduceScope(final VariantContext vc) {
|
||||
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
|
||||
List<Allele> alleles = new ArrayList<Allele>(getMaxAltAlleles() + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles()));
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
|
||||
return builder.make();
|
||||
|
|
@ -153,23 +139,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
|
||||
private double calculateAlleleCountConformation(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final StateTracker stateTracker,
|
||||
final int numChr,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker);
|
||||
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors);
|
||||
|
||||
final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( stateTracker.abort(log10LofK, set.getACcounts()) ) {
|
||||
if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) {
|
||||
//if ( DEBUG )
|
||||
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
|
|
@ -188,7 +172,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
ACcountsClone[allele]++;
|
||||
// to get to this conformation, a sample would need to be AB (remember that ref=0)
|
||||
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
|
||||
updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
|
||||
|
|
@ -213,9 +197,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
|
||||
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
|
||||
for ( DependentSet dependent : differentAlleles )
|
||||
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
for ( DependentSet dependent : sameAlleles )
|
||||
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
|
||||
}
|
||||
|
||||
return log10LofK;
|
||||
|
|
@ -223,8 +207,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
|
||||
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
|
||||
// also pushes its value to the given callingSetIndex.
|
||||
private void updateACset(final StateTracker stateTracker,
|
||||
final int[] newSetCounts,
|
||||
private void updateACset(final int[] newSetCounts,
|
||||
final int numChr,
|
||||
final ExactACset dependentSet,
|
||||
final int PLsetIndex,
|
||||
|
|
@ -246,8 +229,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
|
||||
private void computeLofK(final ExactACset set,
|
||||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
final double[] log10AlleleFrequencyPriors) {
|
||||
|
||||
set.getLog10Likelihoods()[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
|
@ -258,8 +240,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
|
||||
|
||||
final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -281,14 +263,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
|
|||
double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
|
||||
|
||||
// update the MLE if necessary
|
||||
resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
|
||||
getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for ( final int ACcount : set.getACcounts().getCounts() ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
|
||||
|
||||
getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
|
||||
}
|
||||
|
||||
private void pushData(final ExactACset targetSet,
|
||||
|
|
|
|||
|
|
@ -39,8 +39,8 @@ import java.util.ArrayList;
|
|||
abstract class ExactAFCalc extends AFCalc {
|
||||
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
|
||||
|
||||
protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ import java.util.*;
|
|||
/**
|
||||
* The min. confidence of an allele to be included in the joint posterior.
|
||||
*/
|
||||
private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-20);
|
||||
private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10);
|
||||
|
||||
private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0};
|
||||
private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
|
||||
|
|
@ -100,7 +100,7 @@ import java.util.*;
|
|||
private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
|
||||
@Override
|
||||
public int compare(AFCalcResult o1, AFCalcResult o2) {
|
||||
return Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0());
|
||||
return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -111,9 +111,9 @@ import java.util.*;
|
|||
*/
|
||||
final AFCalc biAlleleExactModel;
|
||||
|
||||
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
|
||||
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -285,10 +285,14 @@ import java.util.*;
|
|||
// sort the results, so the most likely allele is first
|
||||
Collections.sort(sorted, compareAFCalcResultsByPNonRef);
|
||||
|
||||
double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0();
|
||||
final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0();
|
||||
|
||||
for ( int i = 0; i < sorted.size(); i++ ) {
|
||||
final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
|
||||
if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 )
|
||||
throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0());
|
||||
|
||||
final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
|
||||
final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0));
|
||||
final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 };
|
||||
|
||||
|
|
@ -303,7 +307,13 @@ import java.util.*;
|
|||
/**
|
||||
* Take the independent estimates of pNonRef for each alt allele and combine them into a single result
|
||||
*
|
||||
* TODO -- add more docs
|
||||
* Given n independent calculations for each of n alternate alleles create a single
|
||||
* combined AFCalcResult with:
|
||||
*
|
||||
* priors for AF == 0 equal to theta^N for the nth least likely allele
|
||||
* posteriors that reflect the combined chance that any alleles are segregating and corresponding
|
||||
* likelihoods
|
||||
* combined MLEs in the order of the alt alleles in vc
|
||||
*
|
||||
* @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
|
||||
*/
|
||||
|
|
@ -315,9 +325,11 @@ import java.util.*;
|
|||
final double[] log10PriorsOfAC = new double[2];
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
|
||||
|
||||
// this value is a sum in log space
|
||||
// the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs
|
||||
double log10PosteriorOfACEq0Sum = 0.0;
|
||||
double log10PosteriorOfACGt0Sum = 0.0;
|
||||
|
||||
boolean anyPoly = false;
|
||||
for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
|
||||
final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
|
||||
final int altI = vc.getAlleles().indexOf(altAllele) - 1;
|
||||
|
|
@ -325,12 +337,15 @@ import java.util.*;
|
|||
// MLE of altI allele is simply the MLE of this allele in altAlleles
|
||||
alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);
|
||||
|
||||
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
|
||||
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
|
||||
|
||||
// the AF > 0 case requires us to store the normalized likelihood for later summation
|
||||
if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR )
|
||||
if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) {
|
||||
anyPoly = true;
|
||||
log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0();
|
||||
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
|
||||
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
|
||||
}
|
||||
|
||||
log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0();
|
||||
|
||||
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
|
||||
log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());
|
||||
|
|
@ -339,10 +354,25 @@ import java.util.*;
|
|||
nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
|
||||
}
|
||||
|
||||
// If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation
|
||||
if ( ! anyPoly ) {
|
||||
log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0();
|
||||
log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0();
|
||||
}
|
||||
|
||||
// In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C,
|
||||
// the probability of being non-poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y).  We want to estimate confidently
|
||||
// log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0
|
||||
final double log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
|
||||
//
|
||||
// note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we
|
||||
// use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where
|
||||
// AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO
|
||||
final double log10PosteriorOfACGt0;
|
||||
if ( log10PosteriorOfACEq0Sum == 0.0 )
|
||||
log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum;
|
||||
else
|
||||
log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
|
||||
|
||||
final double[] log10LikelihoodsOfAC = new double[] {
|
||||
// L + prior = posterior => L = poster - prior
|
||||
log10PosteriorOfACEq0Sum - log10PriorsOfAC[0],
|
||||
|
|
@ -350,8 +380,10 @@ import java.util.*;
|
|||
};
|
||||
|
||||
return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
|
||||
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), // necessary to ensure all values < 0
|
||||
MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized
|
||||
// necessary to ensure all values < 0
|
||||
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true),
|
||||
// priors incorporate multiple alt alleles, must be normalized
|
||||
MathUtils.normalizeFromLog10(log10PriorsOfAC, true),
|
||||
log10pNonRefByAllele, sortedResultsWithThetaNPriors);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -11,28 +12,31 @@ import java.util.Map;
|
|||
/**
|
||||
* Original bi-allelic ~O(N) implementation. Kept here for posterity and reference
|
||||
*/
|
||||
public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
}
|
||||
|
||||
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
|
||||
return new StateTracker();
|
||||
class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
|
||||
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
|
||||
final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
|
||||
final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
final Pair<Integer, Integer> result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
final int lastK = result.getFirst();
|
||||
final int mleK = result.getSecond();
|
||||
|
||||
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
|
||||
final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1);
|
||||
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0};
|
||||
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
|
||||
final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors);
|
||||
|
||||
final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
|
||||
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
|
||||
final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO;
|
||||
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef);
|
||||
|
||||
return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(),
|
||||
MathUtils.normalizeFromLog10(log10Likelihoods, true),
|
||||
MathUtils.normalizeFromLog10(log10Priors, true),
|
||||
log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -72,11 +76,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
|||
}
|
||||
}
|
||||
|
||||
public int linearExact(final VariantContext vc,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyLikelihoods,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
|
||||
public Pair<Integer, Integer> linearExact(final VariantContext vc,
|
||||
double[] log10AlleleFrequencyPriors,
|
||||
double[] log10AlleleFrequencyLikelihoods,
|
||||
double[] log10AlleleFrequencyPosteriors) {
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
||||
|
|
@ -85,7 +89,7 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
|||
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
boolean done = false;
|
||||
int lastK = -1;
|
||||
int lastK = -1, mleK = -1;
|
||||
|
||||
for (int k=0; k <= numChr && ! done; k++ ) {
|
||||
final double[] kMinus0 = logY.getkMinus0();
|
||||
|
|
@ -131,7 +135,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
|||
|
||||
// can we abort early?
|
||||
lastK = k;
|
||||
maxLog10L = Math.max(maxLog10L, log10LofK);
|
||||
if ( log10LofK > maxLog10L ) {
|
||||
maxLog10L = log10LofK;
|
||||
mleK = k;
|
||||
}
|
||||
|
||||
if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
|
||||
done = true;
|
||||
|
|
@ -140,6 +148,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
|
|||
logY.rotate();
|
||||
}
|
||||
|
||||
return lastK;
|
||||
return new Pair<Integer, Integer>(lastK, mleK);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc {
|
||||
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, ploidy);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,35 +1,85 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Keeps track of the best state seen by the exact model and the max states to visit
|
||||
* allowing us to abort the search before we visit the entire matrix of AC x samples
|
||||
* Keeps track of the state information during the exact model AF calculation.
|
||||
*
|
||||
* Tracks things like the MLE and MAP AC values, their corresponding likelihood and posterior
|
||||
* values, the likelihood of the AF == 0 state, and the number of evaluations needed
|
||||
* by the calculation to compute the P(AF == 0)
|
||||
*/
|
||||
final class StateTracker {
|
||||
public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
final private int[] maxACsToConsider;
|
||||
|
||||
private ExactACcounts ACsAtMax = null;
|
||||
private double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
|
||||
public StateTracker() {
|
||||
this(null);
|
||||
}
|
||||
|
||||
public StateTracker(final int[] maxACsToConsider) {
|
||||
this.maxACsToConsider = maxACsToConsider;
|
||||
}
|
||||
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
|
||||
protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
/**
|
||||
* Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state
|
||||
*
|
||||
* @param log10LofKs the likelihood of our current configuration state
|
||||
* These variables are intended to contain the MLE and MAP (and their corresponding allele counts)
|
||||
* of the site over all alternate alleles
|
||||
*/
|
||||
public void update(final double log10LofKs, final ExactACcounts ACs) {
|
||||
if ( log10LofKs > getMaxLog10L()) {
|
||||
this.setMaxLog10L(log10LofKs);
|
||||
this.ACsAtMax = ACs;
|
||||
}
|
||||
protected double log10MLE;
|
||||
protected double log10MAP;
|
||||
|
||||
/**
|
||||
* Returns a vector with maxAltAlleles values containing AC values at the MLE
|
||||
*
|
||||
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
|
||||
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
|
||||
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
|
||||
* are meaningful.
|
||||
*/
|
||||
private final int[] alleleCountsOfMLE;
|
||||
private final int[] alleleCountsOfMAP;
|
||||
|
||||
/**
|
||||
* A vector of log10 likelihood values seen, for future summation. When the size of the
|
||||
* vector is exceeded -- because we've pushed more posteriors than there's space to hold
|
||||
* -- we simply sum up the existing values, make that the first value, and continue.
|
||||
*/
|
||||
private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE];
|
||||
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
|
||||
private int log10LikelihoodsForAFGt0CacheIndex = 0;
|
||||
|
||||
/**
|
||||
* The actual sum of the likelihoods. Null if the sum hasn't been computed yet
|
||||
*/
|
||||
protected Double log10LikelihoodsForAFGt0Sum = null;
|
||||
|
||||
/**
|
||||
* Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
|
||||
*/
|
||||
private double log10LikelihoodOfAFzero = 0.0;
|
||||
|
||||
/**
|
||||
* The number of evaluations we've gone through in the AFCalc
|
||||
*/
|
||||
private int nEvaluations = 0;
|
||||
|
||||
/**
|
||||
* The list of alleles actually used in computing the AF
|
||||
*/
|
||||
private List<Allele> allelesUsedInGenotyping = null;
|
||||
|
||||
/**
 * Create a results object capable of storing results for calls with up to maxAltAlleles
 *
 * Allocates the MLE and MAP allele-count vectors with maxAltAlleles entries each and then
 * calls reset() so every tracked value starts from its initial state.
 *
 * @param maxAltAlleles an integer >= 1
 * @throws IllegalArgumentException if maxAltAlleles is less than 1
 */
public StateTracker(final int maxAltAlleles) {
    // the message states the same bound that the check enforces (>= 1)
    if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);

    alleleCountsOfMLE = new int[maxAltAlleles];
    alleleCountsOfMAP = new int[maxAltAlleles];

    reset();
}
|
||||
|
||||
/**
|
||||
|
|
@ -39,58 +89,194 @@ final class StateTracker {
|
|||
* @param log10LofK the log10 likelihood of the configuration we're considering analyzing
|
||||
* @return true if the configuration cannot meaningfully contribute to our likelihood sum
|
||||
*/
|
||||
public boolean tooLowLikelihood(final double log10LofK) {
|
||||
return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY;
|
||||
private boolean tooLowLikelihood(final double log10LofK) {
|
||||
return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider?
|
||||
* @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
|
||||
*/
|
||||
private boolean isLowerAC(final ExactACcounts otherACs) {
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
|
||||
for ( int i = 0; i < otherACcounts.length; i++ ) {
|
||||
if ( alleleCountsOfMLE[i] > otherACcounts[i] )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we stop exploring paths from ACs, given it's log10LofK
|
||||
*
|
||||
* @param otherACs the set of otherACs that we want to know if we should consider analyzing
|
||||
* @return true if otherACs is a state worth considering, or false otherwise
|
||||
* @param log10LofK the log10LofK of these ACs
|
||||
* @param ACs the ACs of this state
|
||||
* @return return true if there's no reason to continue with subpaths of AC, or false otherwise
|
||||
*/
|
||||
public boolean withinMaxACs(final ExactACcounts otherACs) {
|
||||
if ( maxACsToConsider == null )
|
||||
return true;
|
||||
protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) {
|
||||
return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs));
|
||||
}
|
||||
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
@Ensures("result != null")
|
||||
protected int[] getAlleleCountsOfMAP() {
|
||||
return alleleCountsOfMAP;
|
||||
}
|
||||
|
||||
for ( int i = 0; i < maxACsToConsider.length; i++ ) {
|
||||
// consider one more than the max AC to collect a bit more likelihood mass
|
||||
if ( otherACcounts[i] > maxACsToConsider[i] + 1 )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
@Ensures("result >= 0")
|
||||
protected int getnEvaluations() {
|
||||
return nEvaluations;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
|
||||
* @return the likelihoods summed across all AC values for AC > 0
|
||||
*/
|
||||
public boolean isLowerAC(final ExactACcounts otherACs) {
|
||||
if ( ACsAtMax == null )
|
||||
return true;
|
||||
private double getLog10LikelihoodOfAFNotZero() {
|
||||
if ( log10LikelihoodsForAFGt0Sum == null ) {
|
||||
if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
|
||||
log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO;
|
||||
else
|
||||
log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
|
||||
}
|
||||
return log10LikelihoodsForAFGt0Sum;
|
||||
}
|
||||
|
||||
final int[] myACcounts = this.ACsAtMax.getCounts();
|
||||
final int[] otherACcounts = otherACs.getCounts();
|
||||
/**
|
||||
* @return the log10 likelihood of AF == 0
|
||||
*/
|
||||
private double getLog10LikelihoodOfAFzero() {
|
||||
return log10LikelihoodOfAFzero;
|
||||
}
|
||||
|
||||
for ( int i = 0; i < myACcounts.length; i++ ) {
|
||||
if ( myACcounts[i] > otherACcounts[i] )
|
||||
return false;
|
||||
/**
|
||||
* Convert this state to an corresponding AFCalcResult.
|
||||
*
|
||||
* Assumes that the values in this state have been filled in with meaningful values during the calculation.
|
||||
* For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful
|
||||
* values, etc.
|
||||
*
|
||||
* @param log10PriorsByAC the priors by AC
|
||||
*
|
||||
* @return an AFCalcResult summarizing the final results of this calculation
|
||||
*/
|
||||
@Requires("allelesUsedInGenotyping != null")
|
||||
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
|
||||
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
|
||||
final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
|
||||
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
|
||||
|
||||
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
|
||||
for ( int i = 0; i < subACOfMLE.length; i++ ) {
|
||||
final Allele allele = allelesUsedInGenotyping.get(i+1);
|
||||
final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
|
||||
log10pNonRefByAllele.put(allele, log10PNonRef);
|
||||
}
|
||||
|
||||
return true;
|
||||
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
|
||||
}
|
||||
|
||||
public boolean abort( final double log10LofK, final ExactACcounts ACs ) {
|
||||
return tooLowLikelihood(log10LofK) && isLowerAC(ACs);
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Protected mutational methods only for use within the calculation models themselves
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
|
||||
*
|
||||
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
|
||||
*/
|
||||
protected void reset() {
|
||||
log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED;
|
||||
log10LikelihoodsForAFGt0CacheIndex = 0;
|
||||
log10LikelihoodsForAFGt0Sum = null;
|
||||
allelesUsedInGenotyping = null;
|
||||
nEvaluations = 0;
|
||||
Arrays.fill(alleleCountsOfMLE, 0);
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
|
||||
}
|
||||
|
||||
public double getMaxLog10L() {
|
||||
return maxLog10L;
|
||||
/**
|
||||
* Tell this result we used one more evaluation cycle
|
||||
*/
|
||||
protected void incNEvaluations() {
|
||||
nEvaluations++;
|
||||
}
|
||||
|
||||
public void setMaxLog10L(double maxLog10L) {
|
||||
this.maxLog10L = maxLog10L;
|
||||
/**
|
||||
* Update the maximum log10 likelihoods seen, if log10LofKs is higher, and the corresponding ACs of this state
|
||||
*
|
||||
* @param log10LofK the likelihood of our current configuration state, cannot be the 0 state
|
||||
* @param alleleCountsForK the allele counts for this state
|
||||
*/
|
||||
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
|
||||
@Ensures("log10MLE == Math.max(log10LofK, log10MLE)")
|
||||
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
|
||||
addToLikelihoodsCache(log10LofK);
|
||||
|
||||
if ( log10LofK > log10MLE ) {
|
||||
log10MLE = log10LofK;
|
||||
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the maximum log10 posterior seen, if log10PofKs is higher, and the corresponding ACs of this state
|
||||
*
|
||||
* @param log10PofK the posterior of our current configuration state
|
||||
* @param alleleCountsForK the allele counts for this state
|
||||
*/
|
||||
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
|
||||
@Ensures("log10MAP == Math.max(log10PofK, log10MAP)")
|
||||
protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) {
|
||||
if ( log10PofK > log10MAP ) {
|
||||
log10MAP = log10PofK;
|
||||
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
|
||||
}
|
||||
}
|
||||
|
||||
private void addToLikelihoodsCache(final double log10LofK) {
|
||||
// add to the cache
|
||||
log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK;
|
||||
|
||||
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
|
||||
if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) {
|
||||
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
|
||||
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
|
||||
log10LikelihoodsForAFGt0[0] = temporarySum;
|
||||
log10LikelihoodsForAFGt0CacheIndex = 1;
|
||||
}
|
||||
}
|
||||
|
||||
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
|
||||
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
|
||||
if ( log10LikelihoodOfAFzero > log10MLE ) {
|
||||
log10MLE = log10LikelihoodOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMLE, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"})
|
||||
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
|
||||
if ( log10PosteriorOfAFzero > log10MAP ) {
|
||||
log10MAP = log10PosteriorOfAFzero;
|
||||
Arrays.fill(alleleCountsOfMAP, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the list of alleles used in genotyping
|
||||
*
|
||||
* @param allelesUsedInGenotyping the list of alleles, where the first allele is reference
|
||||
*/
|
||||
@Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"})
|
||||
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
|
||||
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
|
||||
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
|
||||
if ( allelesUsedInGenotyping.get(0).isNonReference() )
|
||||
throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele");
|
||||
|
||||
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.PairHMM;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel {
|
|||
public static final int BASE_QUAL_THRESHOLD = 20;
|
||||
|
||||
private boolean DEBUG = false;
|
||||
private boolean bandedLikelihoods = false;
|
||||
|
||||
private static final int MAX_CACHED_QUAL = 127;
|
||||
|
||||
|
|
@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel {
|
|||
private final byte[] GAP_OPEN_PROB_TABLE;
|
||||
private final byte[] GAP_CONT_PROB_TABLE;
|
||||
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) {
|
||||
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
|
||||
this.DEBUG = deb;
|
||||
this.bandedLikelihoods = bandedLikelihoods;
|
||||
|
||||
switch (hmmType) {
|
||||
case EXACT:
|
||||
pairHMM = new ExactPairHMM();
|
||||
break;
|
||||
case ORIGINAL:
|
||||
pairHMM = new OriginalPairHMM();
|
||||
break;
|
||||
case CACHING:
|
||||
case LOGLESS_CACHING:
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT.");
|
||||
}
|
||||
|
||||
// fill gap penalty table, affine naive model:
|
||||
this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
|
||||
this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
|
||||
|
||||
|
||||
for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
|
||||
GAP_OPEN_PROB_TABLE[i] = indelGOP;
|
||||
GAP_CONT_PROB_TABLE[i] = indelGCP;
|
||||
|
|
@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel {
|
|||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final int[] readCounts) {
|
||||
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
|
||||
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
|
||||
|
||||
int readIdx=0;
|
||||
for (PileupElement p: pileup) {
|
||||
|
|
@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel {
|
|||
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
|
||||
int j=0;
|
||||
|
||||
// initialize path metric and traceback memories for likelihood computation
|
||||
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
|
||||
byte[] previousHaplotypeSeen = null;
|
||||
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
|
||||
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
|
||||
|
|
@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel {
|
|||
final int X_METRIC_LENGTH = readBases.length+2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
|
||||
|
||||
if (matchMetricArray == null) {
|
||||
if (previousHaplotypeSeen == null) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
}
|
||||
|
||||
int startIndexInHaplotype = 0;
|
||||
|
|
@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel {
|
|||
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals,
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
(read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities),
|
||||
(read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities),
|
||||
contextLogGapContinuationProbabilities,
|
||||
startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray);
|
||||
contextLogGapContinuationProbabilities, startIndexInHaplotype, false);
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ import java.util.*;
|
|||
* <ul>
|
||||
* <li>In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.</li>
|
||||
* <li>In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.</li>
|
||||
* <li>In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.</li>
|
||||
* <li>In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.</li>
|
||||
* <li>In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.</li>
|
||||
* </ul>
|
||||
*
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CountReads \
|
||||
* -o output.txt \
|
||||
* -I input.bam \
|
||||
* [-L input.intervals]
|
||||
* </pre>
|
||||
|
|
|
|||
|
|
@ -48,11 +48,14 @@ public class GLBasedSampleSelector extends SampleSelector {
|
|||
// first subset to the samples
|
||||
VariantContext subContext = vc.subContextFromSamples(samples);
|
||||
|
||||
if ( ! subContext.isPolymorphicInSamples() )
|
||||
return false;
|
||||
|
||||
// now check to see (using EXACT model) whether this should be variant
|
||||
// do we want to apply a prior? maybe user-spec?
|
||||
if ( flatPriors == null ) {
|
||||
flatPriors = new double[1+2*samples.size()];
|
||||
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2);
|
||||
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2);
|
||||
}
|
||||
final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors);
|
||||
// do we want to let this qual go up or down?
|
||||
|
|
|
|||
|
|
@ -42,11 +42,11 @@ import org.broadinstitute.sting.utils.MendelianViolation;
|
|||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -542,9 +542,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
|||
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
|
||||
|
||||
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull());
|
||||
addAnnotations(builder, sub);
|
||||
sub = builder.make();
|
||||
synchronized (UG_engine) {
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull());
|
||||
addAnnotations(builder, sub);
|
||||
sub = builder.make();
|
||||
}
|
||||
}
|
||||
|
||||
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
|
|||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
public class Haplotype {
|
||||
|
|
@ -184,6 +185,21 @@ public class Haplotype {
|
|||
return new Haplotype(newHaplotypeBases);
|
||||
}
|
||||
|
||||
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
|
||||
@Override
|
||||
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
|
||||
final byte[] arr1 = hap1.getBases();
|
||||
final byte[] arr2 = hap2.getBases();
|
||||
// compares byte arrays using lexical ordering
|
||||
final int len = Math.min(arr1.length, arr2.length);
|
||||
for( int iii = 0; iii < len; iii++ ) {
|
||||
final int cmp = arr1[iii] - arr2[iii];
|
||||
if (cmp != 0) { return cmp; }
|
||||
}
|
||||
return arr2.length - arr1.length;
|
||||
}
|
||||
}
|
||||
|
||||
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
|
||||
final int startPos,
|
||||
final ReferenceContext ref,
|
||||
|
|
|
|||
|
|
@ -1194,6 +1194,39 @@ public class MathUtils {
|
|||
return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that the log10 prob vector vector is well formed
|
||||
*
|
||||
* @param vector
|
||||
* @param expectedSize
|
||||
* @param shouldSumToOne
|
||||
*
|
||||
* @return true if vector is well-formed, false otherwise
|
||||
*/
|
||||
public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
|
||||
if ( vector.length != expectedSize ) return false;
|
||||
|
||||
for ( final double pr : vector ) {
|
||||
if ( ! goodLog10Probability(pr) )
|
||||
return false;
|
||||
}
|
||||
|
||||
if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 )
|
||||
return false;
|
||||
|
||||
return true; // everything is good
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that the result is a well-formed log10 probability
|
||||
*
|
||||
* @param result a supposedly well-formed log10 probability value
|
||||
* @return true if result is really well formed
|
||||
*/
|
||||
public static boolean goodLog10Probability(final double result) {
|
||||
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
||||
* The number of observations does not have to be known in advance, and can be also very big (so that
|
||||
|
|
|
|||
|
|
@ -1,259 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
|
||||
* User: rpoplin
|
||||
* Date: 3/1/12
|
||||
*/
|
||||
|
||||
public class PairHMM {
|
||||
private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
|
||||
private static final byte DEFAULT_GOP = (byte) 45;
|
||||
private static final byte DEFAULT_GCP = (byte) 10;
|
||||
private static final double BANDING_TOLERANCE = 22.0;
|
||||
private static final int BANDING_CLUSTER_WINDOW = 12;
|
||||
private final boolean noBanded;
|
||||
|
||||
/**
 * Creates a PairHMM whose noBanded flag is false.
 */
public PairHMM() {
    this( false );
}

/**
 * Creates a PairHMM with an explicit noBanded setting.
 *
 * @param noBanded the value stored in the noBanded field
 */
public PairHMM( final boolean noBanded ) {
    this.noBanded = noBanded;
}
|
||||
|
||||
|
||||
public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray,
|
||||
final int X_METRIC_LENGTH) {
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
|
||||
|
||||
}
|
||||
|
||||
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
|
||||
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
|
||||
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
|
||||
return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
|
||||
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
|
||||
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
|
||||
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
if( false ) {
|
||||
final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
|
||||
final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>();
|
||||
final ArrayList<Double> calculatedValues = new ArrayList<Double>();
|
||||
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1;
|
||||
workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
|
||||
|
||||
for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
|
||||
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
|
||||
int el = 1;
|
||||
for( int work : workQueue ) {
|
||||
// choose the appropriate diagonal baseline location
|
||||
int iii = 0;
|
||||
int jjj = diag;
|
||||
if( diag > Y_METRIC_LENGTH ) {
|
||||
iii = diag - Y_METRIC_LENGTH;
|
||||
jjj = Y_METRIC_LENGTH;
|
||||
}
|
||||
// move to the starting work location along the diagonal
|
||||
iii += work;
|
||||
jjj -= work;
|
||||
while( iii >= X_METRIC_LENGTH || jjj <= 0 ) {
|
||||
iii--;
|
||||
jjj++;
|
||||
work--;
|
||||
}
|
||||
if( !detectClusteredStartLocations(workToBeAdded, work ) ) {
|
||||
workToBeAdded.add(work); // keep this thread going once it has started
|
||||
}
|
||||
|
||||
if( work >= el - 3 ) {
|
||||
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( el = work; el < numDiags + 1; el++ ) {
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
|
||||
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
|
||||
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
|
||||
calculatedValues.add(bestMetric);
|
||||
if( bestMetric > maxElement ) {
|
||||
maxElement = bestMetric;
|
||||
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
|
||||
break;
|
||||
}
|
||||
if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
|
||||
break;
|
||||
}
|
||||
if( --jjj <= 0 ) { // don't walk off the edge of the matrix
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// find a local maximum to start a new band in the work queue
|
||||
double localMaxElement = Double.NEGATIVE_INFINITY;
|
||||
int localMaxElementIndex = 0;
|
||||
for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) {
|
||||
final double bestMetric = calculatedValues.get(kkk);
|
||||
if( bestMetric > localMaxElement ) {
|
||||
localMaxElement = bestMetric;
|
||||
localMaxElementIndex = kkk;
|
||||
} else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
|
||||
if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) {
|
||||
workToBeAdded.add( work + localMaxElementIndex );
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
calculatedValues.clear();
|
||||
|
||||
// reset iii and jjj to the appropriate diagonal baseline location
|
||||
iii = 0;
|
||||
jjj = diag;
|
||||
if( diag > Y_METRIC_LENGTH ) {
|
||||
iii = diag - Y_METRIC_LENGTH;
|
||||
jjj = Y_METRIC_LENGTH;
|
||||
}
|
||||
// move to the starting work location along the diagonal
|
||||
iii += work-1;
|
||||
jjj -= work-1;
|
||||
|
||||
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
|
||||
for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) {
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
|
||||
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
|
||||
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
|
||||
if( bestMetric > maxElement ) {
|
||||
maxElement = bestMetric;
|
||||
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
workQueue.clear();
|
||||
workQueue.addAll(workToBeAdded);
|
||||
workToBeAdded.clear();
|
||||
}
|
||||
} else {
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
|
||||
}
|
||||
|
||||
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
|
||||
private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) {
|
||||
for(int x : list) {
|
||||
if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -25,8 +25,11 @@
|
|||
package org.broadinstitute.sting.utils.codecs.hapmap;
|
||||
|
||||
import org.broad.tribble.AsciiFeatureCodec;
|
||||
import org.broad.tribble.FeatureCodecHeader;
|
||||
import org.broad.tribble.annotation.Strand;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
|
@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec<RawHapMapFeature> {
|
|||
}
|
||||
return headerLine;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException {
|
||||
final AsciiLineReader br = new AsciiLineReader(stream);
|
||||
return new FeatureCodecHeader(readHeader(br), br.getPosition());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
|
|||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
|
@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
// empty set for passes filters
|
||||
List<String> fFields = new LinkedList<String>();
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
|
||||
if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
|
|
|||
|
|
@ -352,6 +352,9 @@ public class UserException extends ReviewedStingException {
|
|||
}
|
||||
|
||||
public static class CannotExecuteQScript extends UserException {
|
||||
/**
 * Creates the exception with the given explanation appended to a fixed prefix.
 *
 * @param message description of why the QScript could not be executed; used verbatim
 */
public CannotExecuteQScript(String message) {
    // Plain concatenation: the message must not be used as a format string, otherwise any '%' it
    // contains would be interpreted as a format specifier and could throw or alter the text.
    super("Unable to execute QScript: " + message);
}
|
||||
/**
 * Creates the exception with the given explanation appended to a fixed prefix, keeping the cause.
 *
 * @param message description of why the QScript could not be executed; used verbatim
 * @param e       the underlying exception, passed on to the superclass
 */
public CannotExecuteQScript(String message, Exception e) {
    // Plain concatenation: the message must not be used as a format string, otherwise any '%' it
    // contains would be interpreted as a format specifier and could throw or alter the text.
    super("Unable to execute QScript: " + message, e);
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public class ExactPairHMM extends PairHMM {
|
||||
|
||||
@Override
|
||||
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
|
||||
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
|
||||
|
||||
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// the initial condition
|
||||
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]});
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0});
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1});
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2});
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
/**
|
||||
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
|
||||
* User: rpoplin
|
||||
* Date: 3/1/12
|
||||
*/
|
||||
|
||||
public class OriginalPairHMM extends ExactPairHMM {
|
||||
|
||||
@Override
|
||||
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
final int X_METRIC_LENGTH = readBases.length + 2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
|
||||
|
||||
// ensure that all the qual scores have valid values
|
||||
for( int iii = 0; iii < readQuals.length; iii++ ) {
|
||||
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
|
||||
}
|
||||
|
||||
// simple rectangular version of update loop, slow
|
||||
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
|
||||
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
|
||||
if( (iii == 1 && jjj == 1) ) { continue; }
|
||||
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
|
||||
matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
|
||||
// final probability is the log10 sum of the last element in all three state arrays
|
||||
final int endI = X_METRIC_LENGTH - 1;
|
||||
final int endJ = Y_METRIC_LENGTH - 1;
|
||||
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
|
||||
}
|
||||
|
||||
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
|
||||
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
|
||||
final int im1 = indI - 1;
|
||||
final int jm1 = indJ - 1;
|
||||
|
||||
// update the match array
|
||||
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
|
||||
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
|
||||
final byte x = readBases[im1-1];
|
||||
final byte y = haplotypeBases[jm1-1];
|
||||
final byte qual = readQuals[im1-1];
|
||||
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
|
||||
}
|
||||
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
|
||||
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
|
||||
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
|
||||
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
|
||||
|
||||
// update the X (insertion) array
|
||||
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
|
||||
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
|
||||
|
||||
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
|
||||
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
|
||||
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
|
||||
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
|
||||
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
|
||||
public abstract class PairHMM {
|
||||
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
|
||||
protected static final byte DEFAULT_GOP = (byte) 45;
|
||||
protected static final byte DEFAULT_GCP = (byte) 10;
|
||||
|
||||
public enum HMM_IMPLEMENTATION {
|
||||
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
|
||||
EXACT,
|
||||
/* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */
|
||||
ORIGINAL,
|
||||
/* Optimized version of the PairHMM which caches per-read computations */
|
||||
CACHING,
|
||||
/* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */
|
||||
LOGLESS_CACHING
|
||||
}
|
||||
|
||||
protected double[][] matchMetricArray = null;
|
||||
protected double[][] XMetricArray = null;
|
||||
protected double[][] YMetricArray = null;
|
||||
|
||||
public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH );
|
||||
|
||||
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
|
||||
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"})
|
||||
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood
|
||||
public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues );
|
||||
}
|
||||
|
|
@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
|
@ -59,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
private String mReadString = null;
|
||||
private GATKSAMReadGroupRecord mReadGroup = null;
|
||||
private byte[] reducedReadCounts = null;
|
||||
private int softStart = -1;
|
||||
private int softEnd = -1;
|
||||
private final static int UNINITIALIZED = -1;
|
||||
private int softStart = UNINITIALIZED;
|
||||
private int softEnd = UNINITIALIZED;
|
||||
|
||||
// because some values can be null, we don't want to duplicate effort
|
||||
private boolean retrievedReadGroup = false;
|
||||
|
|
@ -385,15 +387,16 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return the unclipped start of the read taking soft clips (but not hard clips) into account
|
||||
*/
|
||||
public int getSoftStart() {
|
||||
if (softStart < 0) {
|
||||
int start = this.getUnclippedStart();
|
||||
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
|
||||
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
|
||||
start += cigarElement.getLength();
|
||||
else
|
||||
if ( softStart == UNINITIALIZED ) {
|
||||
softStart = getAlignmentStart();
|
||||
for (final CigarElement cig : getCigar().getCigarElements()) {
|
||||
final CigarOperator op = cig.getOperator();
|
||||
|
||||
if (op == CigarOperator.SOFT_CLIP)
|
||||
softStart -= cig.getLength();
|
||||
else if (op != CigarOperator.HARD_CLIP)
|
||||
break;
|
||||
}
|
||||
softStart = start;
|
||||
}
|
||||
return softStart;
|
||||
}
|
||||
|
|
@ -406,24 +409,26 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
|
||||
*/
|
||||
public int getSoftEnd() {
|
||||
if (softEnd < 0) {
|
||||
int stop = this.getUnclippedStart();
|
||||
if ( softEnd == UNINITIALIZED ) {
|
||||
boolean foundAlignedBase = false;
|
||||
softEnd = getAlignmentEnd();
|
||||
final List<CigarElement> cigs = getCigar().getCigarElements();
|
||||
for (int i = cigs.size() - 1; i >= 0; --i) {
|
||||
final CigarElement cig = cigs.get(i);
|
||||
final CigarOperator op = cig.getOperator();
|
||||
|
||||
if (ReadUtils.readIsEntirelyInsertion(this))
|
||||
return stop;
|
||||
|
||||
int shift = 0;
|
||||
CigarOperator lastOperator = null;
|
||||
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
|
||||
stop += shift;
|
||||
lastOperator = cigarElement.getOperator();
|
||||
if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP)
|
||||
shift = cigarElement.getLength();
|
||||
else
|
||||
shift = 0;
|
||||
if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read
|
||||
softEnd += cig.getLength();
|
||||
else if (op != CigarOperator.HARD_CLIP) {
|
||||
foundAlignedBase = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end
|
||||
softEnd = getAlignmentEnd();
|
||||
}
|
||||
softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
|
||||
}
|
||||
|
||||
return softEnd;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.qc.CountReads;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -62,9 +61,9 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam");
|
||||
private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta");
|
||||
private List<SAMReaderID> bamList;
|
||||
private Walker countReadWalker;
|
||||
private ReadWalker countReadWalker;
|
||||
private File output;
|
||||
private TraverseReads traversalEngine = null;
|
||||
private TraverseReadsNano traversalEngine = null;
|
||||
|
||||
private IndexedFastaSequenceFile ref = null;
|
||||
private GenomeLocParser genomeLocParser = null;
|
||||
|
|
@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
bamList.add(bam);
|
||||
countReadWalker = new CountReads();
|
||||
|
||||
traversalEngine = new TraverseReads();
|
||||
traversalEngine = new TraverseReadsNano(1);
|
||||
traversalEngine.initialize(engine);
|
||||
}
|
||||
|
||||
|
|
@ -125,7 +124,7 @@ public class TraverseReadsUnitTest extends BaseTest {
|
|||
fail("Shard == null");
|
||||
}
|
||||
|
||||
ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null);
|
||||
ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null);
|
||||
accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator);
|
||||
dataProvider.close();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.testng.annotations.Test;
|
|||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
// ********************************************************************************** //
|
||||
|
|
@ -18,6 +19,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
|
||||
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
|
||||
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
|
||||
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
@ -28,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("b3abf320f7d02d0e3b2883833419130e"));
|
||||
Arrays.asList("847605f4efafef89529fe0e496315edd"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
@ -52,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testSingleSamplePilot2() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
|
||||
Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc"));
|
||||
Arrays.asList("afb8768f31ab57eb43f75c1115eadc99"));
|
||||
executeTest("test SingleSample Pilot2", spec);
|
||||
}
|
||||
|
||||
|
|
@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultipleSNPAlleles() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
|
||||
Arrays.asList("26af30187316f742878c85f0ed091837"));
|
||||
Arrays.asList("543f68e42034bf44cfb24da8c9204320"));
|
||||
executeTest("test Multiple SNP alleles", spec);
|
||||
}
|
||||
|
||||
|
|
@ -76,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReverseTrim() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
|
||||
Arrays.asList("aa9cf96ab8f5aa844387e3aef1f27249"));
|
||||
Arrays.asList("5ce03dd9ca2d9324c1d4a9d64389beb5"));
|
||||
executeTest("test reverse trim", spec);
|
||||
}
|
||||
|
||||
|
|
@ -84,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMismatchedPLs() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
|
||||
Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a"));
|
||||
Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19"));
|
||||
executeTest("test mismatched PLs", spec);
|
||||
}
|
||||
|
||||
|
|
@ -94,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2";
|
||||
private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10";
|
||||
|
||||
@Test
|
||||
public void testCompressedOutput() {
|
||||
|
|
@ -115,7 +117,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
|
||||
|
||||
String md5 = "360d1274c1072a1ae9868e4e106c2650";
|
||||
String md5 = "d408b4661b820ed86272415b8ea08780";
|
||||
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
|
||||
|
|
@ -147,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMinBaseQualityScore() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
|
||||
Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c"));
|
||||
Arrays.asList("839ecd30d354a36b5dfa2b5e99859765"));
|
||||
executeTest("test min_base_quality_score 26", spec);
|
||||
}
|
||||
|
||||
|
|
@ -175,6 +177,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
executeTest("test using comp track", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoCmdLineHeaderStdout() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0,
|
||||
Collections.<String>emptyList());
|
||||
executeTest("testNoCmdLineHeaderStdout", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOutputParameterSitesOnly() {
|
||||
testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311");
|
||||
|
|
@ -187,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testOutputParameterAllSites() {
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf");
|
||||
testOutputParameters("--output_mode EMIT_ALL_SITES", "41c046d38ea328421df924e37e017645");
|
||||
}
|
||||
|
||||
private void testOutputParameters(final String args, final String md5) {
|
||||
|
|
@ -220,12 +230,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Test
|
||||
public void testHeterozyosity1() {
|
||||
testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" );
|
||||
testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeterozyosity2() {
|
||||
testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" );
|
||||
testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" );
|
||||
}
|
||||
|
||||
private void testHeterozosity(final double arg, final String md5) {
|
||||
|
|
@ -268,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("950fb032cc9902ae48bd21f272d2fd52"));
|
||||
Arrays.asList("98058fc913b61c22d44875da1f5ea89c"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
@ -287,7 +297,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("b3df138254ed141b61a758df87757e0d"));
|
||||
Arrays.asList("650c53774afacfc07a595675e8cdde17"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX"), spec);
|
||||
}
|
||||
|
|
@ -302,7 +312,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -minIndelCnt 1" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("63fd9488daadd4baaef0a98f02916996"));
|
||||
Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa"));
|
||||
|
||||
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
|
||||
}
|
||||
|
|
@ -315,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("52b5a432092995c92fe71e1942689ba8"));
|
||||
Arrays.asList("5f2721c3323de5390d2d47446139f32b"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -343,13 +353,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testMultiSampleIndels1() {
|
||||
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("7fc488fe16dea9f023bfcfdaa908a548"));
|
||||
Arrays.asList("a4761d7f25e7a62f34494801c98a0da7"));
|
||||
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
|
||||
|
||||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
|
||||
Arrays.asList("f3ff7fe0f15f31eadd726c711d6bf3de"));
|
||||
Arrays.asList("c526c234947482d1cd2ffc5102083a08"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
@ -371,7 +381,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 20:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("945a2f994eaced8efdf8de24b58f2680"));
|
||||
Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56"));
|
||||
|
||||
executeTest(String.format("test UG with base indel quality scores"), spec);
|
||||
}
|
||||
|
|
@ -449,8 +459,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("bbf16e1873e525ee5975021cfb8988cf"));
|
||||
Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca");
|
||||
}
|
||||
|
||||
|
||||
private void testReducedCalling(final String model, final String md5) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1,
|
||||
Arrays.asList(md5));
|
||||
executeTest("test calling on a ReducedRead BAM with " + model, spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("549321a2543608f214ab4893ab478be6")
|
||||
Arrays.asList("46ff472fc7ef6734ad01170028d5924a")
|
||||
);
|
||||
|
||||
executeTest("testRegenotype--" + testFile, spec);
|
||||
|
|
@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
|
||||
1,
|
||||
Arrays.asList("549321a2543608f214ab4893ab478be6")
|
||||
Arrays.asList("46ff472fc7ef6734ad01170028d5924a")
|
||||
);
|
||||
|
||||
executeTest("testRemoveMLEAndRegenotype--" + testFile, spec);
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
|
|||
for ( final int nct : Arrays.asList(1, 2) ) {
|
||||
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
|
||||
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "8cad82c3a5f5b932042933f136663c8a", nt, nct });
|
||||
tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct });
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest {
|
|||
return StateTest.getTests(StateTest.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND)
|
||||
// NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads
|
||||
// aren't a core GATK feature any longer. Should be re-enabled if we come to care about this capability again
|
||||
// in the future, or we can run these in parallel
|
||||
@Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND)
|
||||
public void testStateTest(final StateTest test) throws InterruptedException {
|
||||
// allows us to test blocking
|
||||
final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates());
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder
|
|||
import org.broadinstitute.sting.queue.util.QScriptUtils
|
||||
import org.broadinstitute.sting.queue.function.ListWriterFunction
|
||||
import org.broadinstitute.sting.commandline.Hidden
|
||||
import org.broadinstitute.sting.commandline
|
||||
|
||||
class DataProcessingPipeline extends QScript {
|
||||
qscript =>
|
||||
|
|
@ -41,34 +42,34 @@ class DataProcessingPipeline extends QScript {
|
|||
@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
|
||||
var bwaPath: File = _
|
||||
|
||||
@Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
|
||||
@Argument(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
|
||||
var projectName: String = "project"
|
||||
|
||||
@Input(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
|
||||
@Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
|
||||
var outputDir: String = ""
|
||||
|
||||
@Input(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
|
||||
@Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
|
||||
var intervalString: String = ""
|
||||
|
||||
@Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false)
|
||||
var intervals: File = _
|
||||
|
||||
@Input(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
|
||||
@Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
|
||||
var cleaningModel: String = "USE_READS"
|
||||
|
||||
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
|
||||
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
|
||||
var useBWAse: Boolean = false
|
||||
|
||||
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
|
||||
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
|
||||
var useBWApe: Boolean = false
|
||||
|
||||
@Input(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
|
||||
@Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
|
||||
var useBWAsw: Boolean = false
|
||||
|
||||
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
|
||||
@Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
|
||||
var bwaThreads: Int = 1
|
||||
|
||||
@Input(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
|
||||
@Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
|
||||
var validation: Boolean = false
|
||||
|
||||
|
||||
|
|
@ -76,15 +77,15 @@ class DataProcessingPipeline extends QScript {
|
|||
* Hidden Parameters
|
||||
****************************************************************************/
|
||||
@Hidden
|
||||
@Input(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
|
||||
@Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
|
||||
var nContigs: Int = -1
|
||||
|
||||
@Hidden
|
||||
@Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
|
||||
@Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
|
||||
var defaultPlatform: String = ""
|
||||
|
||||
@Hidden
|
||||
@Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
|
||||
@Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
|
||||
var testMode: Boolean = false
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -27,28 +27,28 @@ class PacbioProcessingPipeline extends QScript {
|
|||
@Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
|
||||
var dbSNP: File = _
|
||||
|
||||
@Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
|
||||
@Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
|
||||
var threads: Int = 0
|
||||
|
||||
@Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
|
||||
@Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
|
||||
var sample: String = "NA"
|
||||
|
||||
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
|
||||
var bwaPath: File = _
|
||||
|
||||
@Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
|
||||
@Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
|
||||
var BLASR_BAM: Boolean = false
|
||||
|
||||
@Hidden
|
||||
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
|
||||
@Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
|
||||
var dbq: Int = 20
|
||||
|
||||
@Hidden
|
||||
@Input(shortName="bwastring", required=false)
|
||||
@Argument(shortName="bwastring", required=false)
|
||||
var bwastring: String = ""
|
||||
|
||||
@Hidden
|
||||
@Input(shortName = "test", fullName = "test_mode", required = false)
|
||||
@Argument(shortName = "test", fullName = "test_mode", required = false)
|
||||
var testMode: Boolean = false
|
||||
|
||||
val queueLogDir: String = ".qlog/"
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@
|
|||
package org.broadinstitute.sting.queue
|
||||
|
||||
import java.io.File
|
||||
import org.broadinstitute.sting.commandline.Argument
|
||||
import org.broadinstitute.sting.commandline.{ClassType, Argument}
|
||||
|
||||
/**
|
||||
* Default settings settable on the command line and passed to CommandLineFunctions.
|
||||
|
|
@ -41,6 +41,7 @@ class QSettings {
|
|||
var jobQueue: String = _
|
||||
|
||||
@Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false)
|
||||
@ClassType(classOf[Int])
|
||||
var jobPriority: Option[Int] = None
|
||||
|
||||
@Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false)
|
||||
|
|
@ -53,15 +54,19 @@ class QSettings {
|
|||
var jobEnvironmentNames: Seq[String] = Nil
|
||||
|
||||
@Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. If not set defaults to 2GB.", required=false)
|
||||
@ClassType(classOf[Double])
|
||||
var memoryLimit: Option[Double] = Some(2)
|
||||
|
||||
@Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false)
|
||||
@ClassType(classOf[Double])
|
||||
var memoryLimitThreshold: Option[Double] = None
|
||||
|
||||
@Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false)
|
||||
@ClassType(classOf[Double])
|
||||
var residentLimit: Option[Double] = None
|
||||
|
||||
@Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false)
|
||||
@ClassType(classOf[Double])
|
||||
var residentRequest: Option[Double] = None
|
||||
|
||||
@Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false)
|
||||
|
|
|
|||
|
|
@ -159,12 +159,11 @@ object ReflectionUtils {
|
|||
private def getGenericTypes(field: Field): Option[Array[Class[_]]] = {
|
||||
// TODO: Refactor: based on java code in org.broadinstitute.sting.commandline.ArgumentTypeDescriptor
|
||||
// If this is a parameterized collection, find the contained type. If blow up if only one type exists.
|
||||
if (field.getGenericType.isInstanceOf[ParameterizedType]) {
|
||||
if (hasAnnotation(field, classOf[ClassType])) {
|
||||
Some(Array(getAnnotation(field, classOf[ClassType]).value))
|
||||
} else if (field.getGenericType.isInstanceOf[ParameterizedType]) {
|
||||
val parameterizedType = field.getGenericType.asInstanceOf[ParameterizedType]
|
||||
Some(parameterizedType.getActualTypeArguments.map(_.asInstanceOf[Class[_]]))
|
||||
} else if (hasAnnotation(field, classOf[ClassType])) {
|
||||
Some(Array(getAnnotation(field, classOf[ClassType]).value))
|
||||
}
|
||||
else None
|
||||
} else None
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import collection.JavaConversions._
|
|||
import org.broadinstitute.sting.queue.QException
|
||||
import java.lang.Class
|
||||
import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine}
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException
|
||||
import java.lang.reflect.Type
|
||||
|
||||
/**
|
||||
|
|
@ -75,6 +76,8 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
|
||||
def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = {
|
||||
val componentType = ReflectionUtils.getCollectionType(source.field)
|
||||
if (componentType == classOf[java.lang.Object])
|
||||
throw new UserException.CannotExecuteQScript("Please also include a @ClassType(classOf[<primitive type>]) annotation on field: " + source.field + ". Example: @ClassType(classOf[Double]). The scala generic type for the field was subjected to java/scala type erasure and is not available via reflection.")
|
||||
val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType)
|
||||
|
||||
if (classOf[Seq[_]].isAssignableFrom(classType)) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue