Merge branch 'master' of github.com:broadinstitute/cmi-gatk

This commit is contained in:
Guillermo del Angel 2012-11-02 06:58:21 -04:00
commit 52c635b551
103 changed files with 3330 additions and 2560 deletions

View File

@@ -1,8 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import java.util.HashMap;
import java.util.Map;
/**
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
*
@@ -10,35 +7,31 @@ import java.util.Map;
* @since 6/15/12
*/
public class BaseAndQualsCounts extends BaseCounts {
private final Map<BaseIndex, Long> sumInsertionQuals;
private final Map<BaseIndex, Long> sumDeletionQuals;
private final long[] sumInsertionQuals;
private final long[] sumDeletionQuals;
public BaseAndQualsCounts() {
super();
this.sumInsertionQuals = new HashMap<BaseIndex, Long>();
this.sumDeletionQuals = new HashMap<BaseIndex, Long>();
for (BaseIndex i : BaseIndex.values()) {
sumInsertionQuals.put(i, 0L);
sumDeletionQuals.put(i, 0L);
this.sumInsertionQuals = new long[BaseIndex.values().length];
this.sumDeletionQuals = new long[BaseIndex.values().length];
for (final BaseIndex i : BaseIndex.values()) {
sumInsertionQuals[i.index] = 0L;
sumDeletionQuals[i.index] = 0L;
}
}
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.incr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual);
sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
}
final BaseIndex i = BaseIndex.byteToBase(base);
super.incr(i, baseQual);
sumInsertionQuals[i.index] += insQual;
sumDeletionQuals[i.index] += delQual;
}
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.decr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual);
sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
}
final BaseIndex i = BaseIndex.byteToBase(base);
super.decr(i, baseQual);
sumInsertionQuals[i.index] -= insQual;
sumDeletionQuals[i.index] -= delQual;
}
public byte averageInsertionQualsOfBase(final BaseIndex base) {
@@ -49,7 +42,7 @@ public class BaseAndQualsCounts extends BaseCounts {
return getGenericAverageQualOfBase(base, sumDeletionQuals);
}
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
return (byte) (sumQuals.get(base) / getCount(base));
private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) {
return (byte) (sumQuals[base.index] / countOfBase(base));
}
}
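
The change above replaces the per-base EnumMap<BaseIndex, Long> accumulators with primitive long[] arrays indexed by BaseIndex.index, avoiding a Long boxing on every increment. A minimal self-contained sketch of the pattern (this BaseIndex is a simplified stand-in for the GATK enum, and QualAccumulator is a hypothetical name):

enum BaseIndex {
    A(0), C(1), G(2), T(3), D(4), I(5), N(6);
    public final int index;
    BaseIndex(final int index) { this.index = index; }
}

class QualAccumulator {
    // one slot per enum constant; primitive arrays avoid boxing on every update
    private final long[] sumInsertionQuals = new long[BaseIndex.values().length];
    private final long[] sumDeletionQuals = new long[BaseIndex.values().length];

    void incr(final BaseIndex i, final byte insQual, final byte delQual) {
        sumInsertionQuals[i.index] += insQual;
        sumDeletionQuals[i.index] += delQual;
    }
}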

View File

@@ -3,8 +3,6 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.EnumMap;
import java.util.Map;
/**
* An object to keep track of the number of occurrences of each base and its quality.
@@ -18,79 +16,73 @@ import java.util.Map;
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
private final Map<BaseIndex, Integer> counts; // keeps track of the base counts
private final Map<BaseIndex, Long> sumQuals; // keeps track of the quals of each base
private final int[] counts; // keeps track of the base counts
private final long[] sumQuals; // keeps track of the quals of each base
private int totalCount = 0; // keeps track of total count since this is requested so often
public BaseCounts() {
counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class);
sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class);
for (BaseIndex i : BaseIndex.values()) {
counts.put(i, 0);
sumQuals.put(i, 0L);
counts = new int[BaseIndex.values().length];
sumQuals = new long[BaseIndex.values().length];
for (final BaseIndex i : BaseIndex.values()) {
counts[i.index] = 0;
sumQuals[i.index] = 0L;
}
}
public static BaseCounts createWithCounts(int[] countsACGT) {
BaseCounts baseCounts = new BaseCounts();
baseCounts.counts.put(BaseIndex.A, countsACGT[0]);
baseCounts.counts.put(BaseIndex.C, countsACGT[1]);
baseCounts.counts.put(BaseIndex.G, countsACGT[2]);
baseCounts.counts.put(BaseIndex.T, countsACGT[3]);
baseCounts.counts[BaseIndex.A.index] = countsACGT[0];
baseCounts.counts[BaseIndex.C.index] = countsACGT[1];
baseCounts.counts[BaseIndex.G.index] = countsACGT[2];
baseCounts.counts[BaseIndex.T.index] = countsACGT[3];
baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3];
return baseCounts;
}
@Requires("other != null")
public void add(BaseCounts other) {
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) + other.counts.get(i));
public void add(final BaseCounts other) {
for (final BaseIndex i : BaseIndex.values()) {
final int otherCount = other.counts[i.index];
counts[i.index] += otherCount;
totalCount += otherCount;
}
}
@Requires("other != null")
public void sub(BaseCounts other) {
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) - other.counts.get(i));
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base) {
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) + 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base, byte qual) {
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) + 1);
sumQuals.put(i, sumQuals.get(i) + qual);
public void sub(final BaseCounts other) {
for (final BaseIndex i : BaseIndex.values()) {
final int otherCount = other.counts[i.index];
counts[i.index] -= otherCount;
totalCount -= otherCount;
}
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base) {
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(final byte base) {
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) - 1);
counts[i.index]++;
totalCount++;
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(final BaseIndex base, final byte qual) {
counts[base.index]++;
totalCount++;
sumQuals[base.index] += qual;
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base, byte qual) {
public void decr(final byte base) {
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) - 1);
sumQuals.put(i, sumQuals.get(i) - qual);
}
counts[i.index]--;
totalCount--;
}
@Ensures("result >= 0")
public int getCount(final byte base) {
return getCount(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public int getCount(final BaseIndex base) {
return counts.get(base);
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(final BaseIndex base, final byte qual) {
counts[base.index]--;
totalCount--;
sumQuals[base.index] -= qual;
}
@Ensures("result >= 0")
@@ -100,27 +92,32 @@ import java.util.Map;
@Ensures("result >= 0")
public long getSumQuals(final BaseIndex base) {
return sumQuals.get(base);
return sumQuals[base.index];
}
@Ensures("result >= 0")
public byte averageQuals(final byte base) {
return (byte) (getSumQuals(base) / getCount(base));
return (byte) (getSumQuals(base) / countOfBase(base));
}
@Ensures("result >= 0")
public byte averageQuals(final BaseIndex base) {
return (byte) (getSumQuals(base) / getCount(base));
return (byte) (getSumQuals(base) / countOfBase(base));
}
@Ensures("result >= 0")
public int countOfBase(final byte base) {
return countOfBase(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public int countOfBase(final BaseIndex base) {
return counts.get(base);
return counts[base.index];
}
@Ensures("result >= 0")
public long sumQualsOfBase(final BaseIndex base) {
return sumQuals.get(base);
return sumQuals[base.index];
}
@Ensures("result >= 0")
@@ -131,44 +128,36 @@ import java.util.Map;
@Ensures("result >= 0")
public int totalCount() {
int sum = 0;
for (int c : counts.values())
sum += c;
return sum;
return totalCount;
}
/**
* Given a base, it returns the proportional count of this base compared to all other bases
*
* @param base
* @param base base
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(final byte base) {
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
return baseCountProportion(BaseIndex.byteToBase(base));
}
/**
* Given a base, it returns the proportional count of this base compared to all other bases
*
* @param baseIndex
* @param baseIndex base
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(final BaseIndex baseIndex) {
int total = totalCount();
if (total == 0)
return 0.0;
return (double) counts.get(baseIndex) / totalCount();
return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount;
}
@Ensures("result != null")
public String toString() {
StringBuilder b = new StringBuilder();
for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
for (final BaseIndex i : BaseIndex.values()) {
b.append(i.toString()).append("=").append(counts[i.index]).append(",");
}
return b.toString();
}
@@ -180,9 +169,9 @@ import java.util.Map;
@Ensures("result != null")
public BaseIndex baseIndexWithMostCounts() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
for (final BaseIndex i : BaseIndex.values()) {
if (counts[i.index] > counts[maxI.index])
maxI = i;
}
return maxI;
}
@@ -190,17 +179,17 @@ import java.util.Map;
@Ensures("result != null")
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
for (final BaseIndex i : BaseIndex.values()) {
if (i.isNucleotide() && counts[i.index] > counts[maxI.index])
maxI = i;
}
return maxI;
}
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
final int targetCount = counts.get(targetIndex);
final int testCount = counts.get(testIndex);
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
final int targetCount = counts[targetIndex.index];
final int testCount = counts[testIndex.index];
return ( targetCount > testCount || (targetCount == testCount && sumQuals[targetIndex.index] > sumQuals[testIndex.index]) );
}
public byte baseWithMostProbability() {
@@ -210,48 +199,42 @@ import java.util.Map;
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbability() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
for (final BaseIndex i : BaseIndex.values()) {
if (sumQuals[i.index] > sumQuals[maxI.index])
maxI = i;
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts());
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
for (final BaseIndex i : BaseIndex.values()) {
if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index])
maxI = i;
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
}
@Ensures("result >=0")
public int totalCountWithoutIndels() {
int sum = 0;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet())
if (entry.getKey().isNucleotide())
sum += entry.getValue();
return sum;
return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index];
}
/**
* Calculates the proportional count of a base compared to all other bases except indels (I and D)
*
* @param index
* @param base base
* @return the proportion of this base over all other bases except indels
*/
@Requires("index.isNucleotide()")
@Requires("base.isNucleotide()")
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportionWithoutIndels(final BaseIndex index) {
public double baseCountProportionWithoutIndels(final BaseIndex base) {
final int total = totalCountWithoutIndels();
if (total == 0)
return 0.0;
return (double) counts.get(index) / totalCountWithoutIndels();
return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total;
}
public Object[] countsArray() {
return counts.values().toArray();
public int[] countsArray() {
return counts.clone();
}
}
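
Assuming the accessors shown in this diff (incr, countOfBase, baseCountProportion, totalCount), usage of the refactored class looks roughly like the sketch below; note that totalCount() is now O(1) because it reads the cached field instead of summing the map.

// Hedged usage sketch; bases are plain ASCII bytes as elsewhere in the diff.
final BaseCounts counts = new BaseCounts();
counts.incr((byte) 'A');
counts.incr((byte) 'A');
counts.incr((byte) 'C');
final int aCount = counts.countOfBase((byte) 'A');                 // 2
final double aProportion = counts.baseCountProportion((byte) 'A'); // 2.0 / 3.0
final int total = counts.totalCount();                             // 3, from the cached field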

View File

@@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/**
* Simple byte / base index conversions
*
@@ -56,7 +58,7 @@ public enum BaseIndex {
case 'N':
case 'n':
return N;
default: return null;
default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base);
}
}
@@ -68,7 +70,7 @@
* @return whether or not it is a nucleotide, given the definition above
*/
public boolean isNucleotide() {
return this == A || this == C || this == G || this == T || this == N;
return !isIndel();
}
/**

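Two behavioral changes land in this file: byteToBase now throws a ReviewedStingException instead of returning null for an unrecognized byte (which is why the if (i != null) guards disappear in the counting classes above), and isNucleotide() is simplified to !isIndel(), so it continues to cover A/C/G/T as well as N. A hedged sketch of the new contract:

// Sketch of the new contract; constants are those referenced in this commit.
final BaseIndex g = BaseIndex.byteToBase((byte) 'G'); // returns G
final boolean nuc = g.isNucleotide();                 // true: now simply !isIndel()
// BaseIndex.byteToBase((byte) 'X');                  // would throw ReviewedStingException
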
View File

@@ -0,0 +1,21 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.GenomeLocComparator;
import java.util.TreeSet;
/**
* A stash of regions that must be kept uncompressed in all samples
*
* In general, these are regions that were kept uncompressed by a tumor sample and we want to force
* all other samples (normals and/or tumors) to also keep these regions uncompressed
*
* User: carneiro
* Date: 10/15/12
* Time: 4:08 PM
*/
public class CompressionStash extends TreeSet<SimpleGenomeLoc> {
public CompressionStash() {
super(new GenomeLocComparator());
}
}
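
A quick usage sketch: SimpleGenomeLoc is constructed the same way as at the call sites in SlidingWindow below (contig name, contig index, start, stop, finished flag), and the GenomeLocComparator keeps iteration in genomic order.

// Hedged usage sketch; the SimpleGenomeLoc argument order mirrors the
// SlidingWindow call sites in this commit.
final CompressionStash stash = new CompressionStash();
stash.add(new SimpleGenomeLoc("1", 0, 100, 200, true));
stash.add(new SimpleGenomeLoc("1", 0, 50, 80, true));
for (final SimpleGenomeLoc region : stash) {
    // visits 50-80 before 100-200, i.e. in genomic order
}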

View File

@@ -157,11 +157,9 @@ public class HeaderElement {
* @return whether or not the HeaderElement is variant due to excess insertions
*/
private boolean isVariantFromInsertions(double minIndelProportion) {
int numberOfBases = consensusBaseCounts.totalCount();
if (numberOfBases == 0 && insertionsToTheRight > 0)
return true; // we only have insertions
else if (numberOfBases == 0)
return false; // we don't have anything
final int numberOfBases = consensusBaseCounts.totalCount();
if (numberOfBases == 0)
return (insertionsToTheRight > 0); // do we only have insertions?
// if we have bases and insertions, check the ratio
return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
@@ -215,11 +213,11 @@ public class HeaderElement {
if (totalCount == 0)
return 0;
Object[] countsArray = consensusBaseCounts.countsArray();
int[] countsArray = consensusBaseCounts.countsArray();
Arrays.sort(countsArray);
for (int i = countsArray.length-1; i>=0; i--) {
nHaplotypes++;
runningCount += (Integer) countsArray[i];
runningCount += countsArray[i];
if (runningCount/totalCount > minVariantProportion)
break;
}
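
With countsArray() now returning a primitive int[], the haplotype-counting loop above reads: sort ascending, then walk from the largest count down until the running proportion exceeds the threshold. A standalone sketch of that logic (countHaplotypes and the double parameter types are illustrative):

// Standalone sketch of the loop above; countHaplotypes is a hypothetical name.
static int countHaplotypes(final int[] countsArray, final double totalCount, final double minVariantProportion) {
    java.util.Arrays.sort(countsArray);                 // ascending order
    int nHaplotypes = 0;
    double runningCount = 0;
    for (int i = countsArray.length - 1; i >= 0; i--) { // biggest counts first
        nHaplotypes++;
        runningCount += countsArray[i];
        if (runningCount / totalCount > minVariantProportion)
            break;
    }
    return nHaplotypes;
}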

View File

@@ -55,11 +55,12 @@ public class MultiSampleCompressor implements Compressor {
final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy,
final int nContigs,
final boolean allowPolyploidReduction) {
final boolean allowPolyploidReduction,
final CompressionStash compressionStash) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name,
new SingleSampleCompressor(contextSize, downsampleCoverage,
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction, compressionStash));
}
}

View File

@@ -222,6 +222,8 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
HashMap<String, Long> readNameHash; // This hash maps the name of the original read to the new compressed read name (a number).
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
CompressionStash compressionStash = new CompressionStash();
SortedSet<GenomeLoc> intervalList;
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
@@ -328,7 +330,7 @@
*/
@Override
public ReduceReadsStash reduceInit() {
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION, compressionStash));
}
/**

View File

@@ -20,6 +20,7 @@ public class SingleSampleCompressor implements Compressor {
final private ReduceReads.DownsampleStrategy downsampleStrategy;
final private int nContigs;
final private boolean allowPolyploidReduction;
final CompressionStash compressionStash;
private SlidingWindow slidingWindow;
private int slidingWindowCounter;
@@ -33,7 +34,8 @@ public class SingleSampleCompressor implements Compressor {
final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy,
final int nContigs,
final boolean allowPolyploidReduction) {
final boolean allowPolyploidReduction,
final CompressionStash compressionStash) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
this.minMappingQuality = minMappingQuality;
@@ -44,6 +46,7 @@ public class SingleSampleCompressor implements Compressor {
this.downsampleStrategy = downsampleStrategy;
this.nContigs = nContigs;
this.allowPolyploidReduction = allowPolyploidReduction;
this.compressionStash = compressionStash;
}
/**
@@ -65,7 +68,7 @@
}
if ( slidingWindow == null) { // this is the first read
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction, compressionStash);
slidingWindowCounter++;
}

View File

@@ -6,7 +6,6 @@ import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
@@ -24,7 +23,7 @@ import java.util.*;
public class SlidingWindow {
// Sliding Window data
final private LinkedList<GATKSAMRecord> readsInWindow;
final private TreeSet<GATKSAMRecord> readsInWindow;
final private LinkedList<HeaderElement> windowHeader;
protected int contextSize; // the largest context size (between mismatches and indels)
protected String contig;
@@ -56,6 +55,7 @@ public class SlidingWindow {
private final int nContigs;
private boolean allowPolyploidReductionInGeneral;
private CompressionStash compressionStash;
/**
* The types of synthetic reads to use in the finalizeAndAdd method
@@ -87,7 +87,7 @@
}
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction, CompressionStash compressionStash) {
this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage;
@@ -97,7 +97,13 @@ public class SlidingWindow {
this.MIN_MAPPING_QUALITY = minMappingQuality;
this.windowHeader = new LinkedList<HeaderElement>();
this.readsInWindow = new LinkedList<GATKSAMRecord>();
this.readsInWindow = new TreeSet<GATKSAMRecord>(new Comparator<GATKSAMRecord>() {
@Override
public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
final int difference = read1.getSoftEnd() - read2.getSoftEnd();
return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
}
});
this.contig = contig;
this.contigIndex = contigIndex;
@@ -118,6 +124,7 @@ public class SlidingWindow {
this.nContigs = nContigs;
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
this.compressionStash = compressionStash;
}
/**
@@ -145,7 +152,7 @@ public class SlidingWindow {
* @param variantSite boolean array with true marking variant regions
* @return null if nothing is variant, a finished SimpleGenomeLoc if there is a complete variant region, or an unfinished SimpleGenomeLoc (stopping at the end of the checked range) if the variant region is incomplete.
*/
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
private SimpleGenomeLoc getNextVariantRegion(int from, int to, boolean[] variantSite) {
boolean foundStart = false;
int variantRegionStartIndex = 0;
for (int i=from; i<to; i++) {
@@ -154,10 +161,10 @@ public class SlidingWindow {
foundStart = true;
}
else if(!variantSite[i] && foundStart) {
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
return(new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, i-1, true));
}
}
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
return (foundStart) ? new SimpleGenomeLoc(contig, contigIndex, variantRegionStartIndex, to-1, false) : null;
}
/**
@@ -168,23 +175,22 @@ public class SlidingWindow {
* @param variantSite boolean array with true marking variant regions
* @return a CompressionStash with the start/stops of variant regions following the getNextVariantRegion description
*/
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
private CompressionStash getVariantRegionsFromThisSample(int from, int to, boolean[] variantSite) {
CompressionStash regions = new CompressionStash();
int index = from;
while(index < to) {
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
SimpleGenomeLoc result = getNextVariantRegion(index, to, variantSite);
if (result == null)
break;
regions.add(result);
if (result.getSecond() < 0)
if (result.getStop() < 0)
break;
index = result.getSecond() + 1;
index = result.getStop() + 1;
}
return regions;
}
/**
* Determines if the window can be slid given the new incoming read.
*
@@ -195,55 +201,105 @@ public class SlidingWindow {
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
* @return all reads that have fallen to the left of the sliding window after the slide
*/
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) {
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader);
boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex);
final int windowHeaderStartLocation = getStartLocation(windowHeader);
if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
markSites(incomingReadUnclippedStart);
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
CompressionStash regions = getVariantRegionsFromThisSample(0, breakpoint, markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, false);
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
final int windowHeaderStartLoc = getStartLocation(windowHeader);
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getSoftEnd() < windowHeaderStartLoc) {
readsToRemove.add(read);
}
}
for (GATKSAMRecord read : readsToRemove) {
readsInWindow.remove(read);
while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
readsInWindow.pollFirst();
}
}
return finalizedReads;
}
private final class MarkedSites {
private boolean[] siteIsVariant = new boolean[0];
private int startLocation = 0;
public MarkedSites() {}
public boolean[] getVariantSiteBitSet() { return siteIsVariant; }
/**
* Updates the variant site bitset given the new start location and the size of the region to mark.
*
* @param newStartLocation the new start location of the bitset
* @param sizeOfRegion the new size of the region to be represented
*
* @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder.
*/
public int updateRegion(final int newStartLocation, final int sizeOfRegion) {
int lastPositionMarked = sizeOfRegion;
// if this is the first time we set the array and we can't reuse anything, just create a new array from scratch
if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) {
siteIsVariant = new boolean[sizeOfRegion];
lastPositionMarked = 0;
}
// if the dimensions change, copy what we can and continue
else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) {
final boolean[] tempArray = new boolean[sizeOfRegion];
final int differenceInStartPositions = newStartLocation - this.startLocation;
lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion);
System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked);
siteIsVariant = null; // explicitly allow garbage collection
siteIsVariant = tempArray;
}
this.startLocation = newStartLocation;
return lastPositionMarked + newStartLocation;
}
}
private final MarkedSites markedSites = new MarkedSites();
/**
* marks the window with variant and non-variant regions (it uses
* markVariantRegion to make the marks), recording the result in the
* markedSites member rather than returning a new array
*
* @param stop check the window from start to stop (not-inclusive)
*/
protected boolean[] markSites(int stop) {
protected void markSites(final int stop) {
boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1];
final int windowHeaderStartLocation = getStartLocation(windowHeader);
final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1;
// copy over as many bits as we can from the previous calculation. Note that we can't trust the
// last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there.
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
// update the iterator to the correct position
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
for (int i = getStartLocation(windowHeader); i < stop; i++) {
for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
if (headerElementIterator.hasNext())
headerElementIterator.next();
}
// process a contextSize worth of region from scratch in case there's a variant there
for (int i = locationToProcess; i < stop; i++) {
if (headerElementIterator.hasNext()) {
HeaderElement headerElement = headerElementIterator.next();
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
markVariantRegion(markedSites, i - getStartLocation(windowHeader));
markVariantRegion(markedSites, i - windowHeaderStartLocation);
} else
break;
}
return markedSites;
}
/**
@@ -252,11 +308,11 @@ public class SlidingWindow {
* @param markedSites the MarkedSites object to bear the marks
* @param variantSiteLocation the location where a variant site was found
*/
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
for (int i = from; i < to; i++)
markedSites[i] = true;
markedSites.getVariantSiteBitSet()[i] = true;
}
/**
@@ -567,26 +623,31 @@ public class SlidingWindow {
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
return result; // finalized reads will be downsampled if necessary
return result; // finalized reads will be downsampled if necessary
}
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
private List<GATKSAMRecord> closeVariantRegions(CompressionStash regions, boolean forceClose) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
if (!regions.isEmpty()) {
int lastStop = -1;
for (Pair<Integer, Integer> region : regions) {
int start = region.getFirst();
int stop = region.getSecond();
if (stop < 0 && forceClose)
stop = windowHeader.size() - 1;
if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
lastStop = stop;
for (SimpleGenomeLoc region : regions) {
int start = region.getStart();
int stop = region.getStop();
if (!region.isFinished()) {
if(forceClose) // region is unfinished but we're forcing the close of this window
stop = windowHeader.size() - 1;
else
continue; // region is unfinished and we're not forcing the close of this window
}
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
lastStop = stop;
}
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
}
return allReads;
}
@@ -625,8 +686,8 @@ public class SlidingWindow {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (!windowHeader.isEmpty()) {
boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1);
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
markSites(getStopLocation(windowHeader) + 1);
CompressionStash regions = getVariantRegionsFromThisSample(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, true);
if (!windowHeader.isEmpty()) {
@@ -635,6 +696,7 @@ public class SlidingWindow {
}
}
return finalizedReads;
}
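
The new MarkedSites cache is the heart of this SlidingWindow change: when the window slides forward, updateRegion copies the still-valid marks with System.arraycopy instead of recomputing the whole bitset, and markSites then re-examines only the tail that cannot be trusted. A small worked example of the copy logic, under the assumption that the start moves from 100 to 103 while the region size stays at 10:

// Worked example of MarkedSites.updateRegion's copy step (values illustrative).
final boolean[] old = new boolean[10];             // marks for positions 100..109
old[5] = true;                                     // a variant mark at position 105
final int newStart = 103, sizeOfRegion = 10;
final boolean[] fresh = new boolean[sizeOfRegion]; // marks for positions 103..112
final int shift = newStart - 100;                  // start positions differ by 3
final int copied = Math.min(old.length - shift, sizeOfRegion); // 7 reusable slots
System.arraycopy(old, shift, fresh, 0, copied);
// fresh[2] == true: the mark for position 105 survived the slide; the caller
// still has to re-mark positions 110..112 plus the untrusted contextSize tail.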

View File

@@ -72,7 +72,7 @@ public class ErrorModel {
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
}
}

View File

@@ -245,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
// find the alternate allele(s) that we should be using
final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
if (alleles == null || alleles.isEmpty())
if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference()))
return null;
// start making the VariantContext
final GenomeLoc loc = ref.getLocus();

View File

@@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
}

View File

@@ -5,23 +5,22 @@ import org.apache.log4j.Logger;
import org.apache.log4j.TTCCLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 10/2/12
* Time: 10:25 AM
* To change this template use File | Settings | File Templates.
* A simple GATK utility (i.e., one that runs from the command line) for assessing the performance of
* the exact model
*/
public class AFCalcPerformanceTest {
final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);
@@ -190,7 +189,8 @@ public class AFCalcPerformanceTest {
public enum Operation {
ANALYZE,
SINGLE
SINGLE,
EXACT_LOG
}
public static void main(final String[] args) throws Exception {
final TTCCLayout layout = new TTCCLayout();
@@ -204,10 +204,49 @@ public class AFCalcPerformanceTest {
switch ( op ) {
case ANALYZE: analyze(args); break;
case SINGLE: profileBig(args); break;
case EXACT_LOG: exactLog(args); break;
default: throw new IllegalAccessException("unknown operation " + op);
}
}
private static void exactLog(final String[] args) throws Exception {
final File ref = new File(args[1]);
final File exactLogFile = new File(args[2]);
final List<Integer> startsToUse = new LinkedList<Integer>();
for ( int i = 3; i < args.length; i++ )
startsToUse.add(Integer.valueOf(args[i]));
final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref);
final GenomeLocParser parser = new GenomeLocParser(seq);
final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile));
final List<ExactCallLogger.ExactCall> loggedCalls = ExactCallLogger.readExactLog(reader, startsToUse, parser);
for ( final ExactCallLogger.ExactCall call : loggedCalls ) {
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcTestBuilder.PriorType.human);
logger.info(call);
final SimpleTimer timer = new SimpleTimer().start();
final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors());
final long newNanoTime = timer.getElapsedTimeNano();
if ( call.originalCall.anyPolymorphic(-1) || result.anyPolymorphic(-1) ) {
logger.info("**** ONE IS POLY");
}
logger.info("\t\t getLog10PosteriorOfAFGT0: " + call.originalCall.getLog10PosteriorOfAFGT0() + " vs " + result.getLog10PosteriorOfAFGT0());
final double speedup = call.runtime / (1.0 * newNanoTime);
logger.info("\t\t runtime: " + call.runtime + " vs " + newNanoTime + " speedup " + String.format("%.2f", speedup) + "x");
for ( final Allele a : call.originalCall.getAllelesUsedInGenotyping() ) {
if ( a.isNonReference() ) {
final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : "";
logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE);
final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost);
}
}
}
}
private static void profileBig(final String[] args) throws Exception {
final int nSamples = Integer.valueOf(args[1]);
final int ac = Integer.valueOf(args[2]);
@@ -234,7 +273,6 @@ public class AFCalcPerformanceTest {
final List<ModelParams> modelParams = Arrays.asList(
new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
// new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100),
new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));
final boolean ONLY_HUMAN_PRIORS = false;
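
Based on the argument indices in exactLog() above (args[1] is the reference FASTA, args[2] the exact-call log written by ExactCallLogger, args[3...] the starts to replay), and assuming main() parses the Operation from args[0], driving the new mode would presumably look like the sketch below; the paths and position are placeholders.

// Hypothetical invocation of the new EXACT_LOG mode; argument order is
// inferred from exactLog() and args[0] parsing is an assumption.
AFCalcPerformanceTest.main(new String[]{
    "EXACT_LOG",                // Operation name
    "/path/to/reference.fasta", // args[1]: reference for the GenomeLocParser
    "/path/to/exact_calls.log", // args[2]: exact-call log to replay
    "1000123"                   // args[3+]: starts to use
});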

View File

@@ -45,12 +45,16 @@ public class AFCalcTestBuilder {
human
}
public int getNumAltAlleles() {
return numAltAlleles;
}
public int getnSamples() {
return nSamples;
}
public AFCalc makeModel() {
return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2);
return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2);
}
public double[] makePriors() {

View File

@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -41,22 +40,20 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
this.ploidy = ploidy;
}
@Override
protected VariantContext reduceScope(VariantContext vc) {
final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > maxAltAlleles) {
logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) {
logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
final List<Allele> alleles = new ArrayList<Allele>(maxAltAlleles + 1);
final List<Allele> alleles = new ArrayList<Allele>(getMaxAltAlleles() + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy));
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy));
VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
@@ -69,8 +66,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
return resultFromTracker(vc, log10AlleleFrequencyPriors);
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
/**
@@ -128,6 +125,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @return list of numAllelesToChoose most likely alleles
*/
private static final int PL_INDEX_OF_HOM_REF = 0;
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
@@ -135,7 +133,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), false);
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
@@ -143,7 +141,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
for (int k=1; k < acCount.length;k++) {
if (acCount[k] > 0)
likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]);
}
}
@@ -170,13 +168,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param numAlleles Number of alternate alleles
* @param ploidyPerPool Number of samples per pool
* @param log10AlleleFrequencyPriors Frequency priors
* @param resultTracker object to fill with output values
*/
protected static void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
protected void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
@@ -195,24 +191,24 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
if ( genotypeLikelihoods.size() <= 1 ) {
// no meaningful GLs at all, just set the tracker to non poly values
resultTracker.reset(); // just mimic-ing call below
resultTracker.setLog10LikelihoodOfAFzero(0.0);
getStateTracker().reset(); // just mimic-ing call below
getStateTracker().setLog10LikelihoodOfAFzero(0.0);
} else {
for (int p=1; p<genotypeLikelihoods.size(); p++) {
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, resultTracker);
getStateTracker().reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p),
combinedPloidy, ploidyPerPool, numAlleles, log10AlleleFrequencyPriors);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
}
}
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
public CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool,
double[] newGL,
int originalPloidy,
int newGLPloidy,
int numAlleles,
final double[] log10AlleleFrequencyPriors) {
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
@@ -229,16 +225,11 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
StateTracker stateTracker = new StateTracker();
while ( !ACqueue.isEmpty() ) {
resultTracker.incNEvaluations();
getStateTracker().incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
// adjust max likelihood seen if needed
if ( log10LofKs > stateTracker.getMaxLog10L())
stateTracker.update(log10LofKs, ACset.getACcounts());
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
// clean up memory
indexesToACset.remove(ACset.getACcounts());
@@ -259,39 +250,32 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector
* @param resultTracker AFResult object
* @param stateTracker max likelihood observed so far
* @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue
* @return max log likelihood
*/
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final AFCalcResultTracker resultTracker,
final StateTracker stateTracker,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
private double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute the likelihood of the new set ("set") based on the original likelihoods
final int numAlleles = set.getACcounts().getCounts().length;
final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
// add to new pool
if (!Double.isInfinite(log10LofK))
newPool.add(set);
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
// TODO -- change false to true on this line (the correct behavior) when the implementation of this model is optimized (it's too slow now to handle this fix)
if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
return log10LofK;
}
@@ -322,67 +306,67 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
}
/**
* Naive combiner of two multiallelic pools - number of alt alleles must be the same.
* Math is generalization of biallelic combiner.
*
* For vector K representing an allele count conformation,
* Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
* where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
* @param originalPool First log-likelihood pool GL vector
* @param yy Second pool GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param numAlleles Number of alleles
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param resultTracker Af calculation result object
*/
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
/*
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
if (dim1 != originalPool.getLength() || dim2 != yy.length)
throw new ReviewedStingException("BUG: Inconsistent vector length");
if (ploidy2 == 0)
return;
final int newPloidy = ploidy1 + ploidy2;
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// and L2(K) = Pr(D|AC2=K) * choose(m2,K)
GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
final double[] x = originalPool.getLikelihoodsAsVector(true);
while(firstIterator.hasNext()) {
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
firstIterator.next();
}
GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
final double[] y = yy.clone();
while(secondIterator.hasNext()) {
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
secondIterator.next();
}
// initialize output to -log10(choose(m1+m2,[k1 k2...])
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
while(outputIterator.hasNext()) {
final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
originalPool.add(likelihood, set, outputIterator.getLinearIndex());
outputIterator.next();
}
*/
}
// /**
// * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
// * Math is generalization of biallelic combiner.
// *
// * For vector K representing an allele count conformation,
// * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
// * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
// * @param originalPool First log-likelihood pool GL vector
// * @param yy Second pool GL vector
// * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
// * @param ploidy2 Ploidy of second pool
// * @param numAlleles Number of alleles
// * @param log10AlleleFrequencyPriors Array of biallelic priors
// * @param resultTracker Af calculation result object
// */
// public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
// final double[] log10AlleleFrequencyPriors,
// final AFCalcResultTracker resultTracker) {
///*
// final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
// final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
//
// if (dim1 != originalPool.getLength() || dim2 != yy.length)
// throw new ReviewedStingException("BUG: Inconsistent vector length");
//
// if (ploidy2 == 0)
// return;
//
// final int newPloidy = ploidy1 + ploidy2;
//
// // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
// GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
// final double[] x = originalPool.getLikelihoodsAsVector(true);
// while(firstIterator.hasNext()) {
// x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
// firstIterator.next();
// }
//
// GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
// final double[] y = yy.clone();
// while(secondIterator.hasNext()) {
// y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
// secondIterator.next();
// }
//
// // initialize output to -log10(choose(m1+m2,[k1 k2...])
// final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
// final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
//
//
// // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
// while(outputIterator.hasNext()) {
// final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
// double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
//
// originalPool.add(likelihood, set, outputIterator.getLinearIndex());
// outputIterator.next();
// }
//*/
// }
/**
* Compute likelihood of a particular AC conformation and update the state tracker
@@ -393,15 +377,13 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
* @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool
* @param resultTracker AFResult object
* @return log-likehood of requested conformation
*/
private static double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2,
final AFCalcResultTracker resultTracker) {
private double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2) {
final int newPloidy = ploidy1 + ploidy2;
@ -419,8 +401,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.getLog10Likelihoods()[0] = log10Lof0;
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return log10Lof0;
} else {
@ -463,14 +445,16 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
// TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
return log10LofK;
}
@ -493,99 +477,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
return (sum == ploidy);
}
/**
* Combines naively two biallelic pools (of arbitrary size).
* For two pools of size m1 and m2, we can compute the combined likelihood as:
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
* @param originalPool Pool likelihood vector, x[k] = Pr(D|AC_i = k) for alt allele i
* @param newPLVector Second GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param resultTracker AF calculation result object
* @return Combined likelihood vector
*/
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final int newPloidy = ploidy1 + ploidy2;
final double[] combinedLikelihoods = new double[1+newPloidy];
/** Pre-fill result array and incorporate weights into input vectors
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
* equation reduces to
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(j) L2(k-j)
* which is just plain convolution of L1 and L2 (with pre-existing vector)
*/
// initialize result vector to -infinity
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
x[k] = Double.NEGATIVE_INFINITY;
final double[] y = newPLVector.clone();
final double log10Lof0 = x[0]+y[0];
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
double maxElement = log10Lof0;
int maxElementIdx = 0;
int[] alleleCounts = new int[1];
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
double[] acc = new double[k+1];
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
double innerMax = Double.NEGATIVE_INFINITY;
for (int j=0; j <=k; j++) {
double x1,y1;
if (k-j>=0 && k-j < y.length)
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
else
continue;
if (j < x.length)
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
else
continue;
if (Double.isInfinite(x1) || Double.isInfinite(y1))
continue;
acc[j] = x1 + y1;
if (acc[j] > innerMax)
innerMax = acc[j];
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
break;
}
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
maxElementIdx = k;
double maxDiff = combinedLikelihoods[k] - maxElement;
if (maxDiff > 0)
maxElement = combinedLikelihoods[k];
else if (maxDiff < -MAX_LOG10_ERROR_TO_STOP_EARLY) {
break;
}
alleleCounts[0] = k;
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
}
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
}
/**
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
* including updating the PL's, and assign genotypes accordingly
@ -674,10 +565,10 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc {
*
* @return genotype
*/
private static void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
private void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
final int numNewAltAlleles = allelesToUse.size() - 1;

View File

@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -114,6 +115,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
protected PrintStream graphWriter = null;
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;
@ -234,14 +241,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
samplesList.addAll( samples );
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = UAC.clone();
UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
@ -287,7 +294,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
}
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
}
@ -400,6 +407,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
// evaluate each sample's reads against all haplotypes
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );

View File

@ -30,6 +30,9 @@ import com.google.java.contract.Requires;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -44,8 +47,25 @@ public class LikelihoodCalculationEngine {
private final boolean DEBUG;
private final PairHMM pairHMM;
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
pairHMM = new PairHMM( noBanded );
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
switch (hmmType) {
case EXACT:
pairHMM = new ExactPairHMM();
break;
case ORIGINAL:
pairHMM = new OriginalPairHMM();
break;
case CACHING:
pairHMM = new CachingPairHMM();
break;
case LOGLESS_CACHING:
pairHMM = new LoglessCachingPairHMM();
break;
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
}
this.constantGCP = constantGCP;
DEBUG = debug;
}
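A minimal usage sketch of the new enum-driven constructor; the (byte) 10 gap continuation penalty is a hypothetical value, not taken from this diff:

    final LikelihoodCalculationEngine engine =
            new LikelihoodCalculationEngine( (byte) 10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING );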
@ -69,23 +89,18 @@ public class LikelihoodCalculationEngine {
X_METRIC_LENGTH += 2;
Y_METRIC_LENGTH += 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
// for each sample's reads
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray );
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
}
}
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
final int numHaplotypes = haplotypes.size();
final int numReads = reads.size();
@ -113,9 +128,8 @@ public class LikelihoodCalculationEngine {
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype;
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP,
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
readCounts[jjj][iii] = readCount;
}
}
@ -130,7 +144,7 @@ public class LikelihoodCalculationEngine {
return iii;
}
}
return b1.length;
return Math.min(b1.length, b2.length);
}
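A self-contained sketch of the corrected helper, illustrating the one-line fix above (firstDifferingPosition is the editor's name for the snippet):

    static int firstDifferingPosition(final byte[] b1, final byte[] b2) {
        final int limit = Math.min(b1.length, b2.length); // the fix: bound by the shorter array
        for (int i = 0; i < limit; i++) {
            if (b1[i] != b2[i])
                return i; // first mismatching index
        }
        // one array is a prefix of the other, e.g. ("ACGT","AC") -> 2; returning b1.length (4)
        // here could index past the end of b2 downstream
        return limit;
    }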
@Requires({"haplotypes.size() > 0"})
@ -280,7 +294,7 @@ public class LikelihoodCalculationEngine {
final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
// set up the default 1-to-1 haplotype mapping object
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) {
@ -322,6 +336,13 @@ public class LikelihoodCalculationEngine {
return bestHaplotypes;
}
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
for( final Haplotype h : haplotypes ) {
if( h.isReference() ) { return haplotypes.indexOf(h); }
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
@ -348,12 +369,14 @@ public class LikelihoodCalculationEngine {
}
}
}
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getFirst().getAlleles() )
for( final Allele a : call.getFirst().getAlleles() ) {
likelihoodMap.add(read, a, 0.0);
}
}
}

View File

@ -0,0 +1,181 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class CachingPairHMM extends OriginalPairHMM {
double[][] constantMatrix = null; // cache of the per-read transition constants (filled by initializeConstants)
double[][] distanceMatrix = null; // cache of the read-vs-haplotype base match/mismatch priors (filled by initializeDistanceMatrix)
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProbLog10(DEFAULT_GCP),
QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
0.0,
0.0
};
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
constantMatrix = new double[X_METRIC_LENGTH][6];
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
// fill in the first row
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
}
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
if( recacheReadValues ) {
initializeConstants( insertionGOP, deletionGOP, overallGCP );
}
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
for (int i = 2; i < X_METRIC_LENGTH; i++) {
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
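The recacheReadValues/hapStartIndex machinery above pays off because HaplotypeCaller now sorts haplotypes by base sequence (the Collections.sort call earlier in this diff): consecutive haplotypes share the longest possible common prefix, so matrix columns before hapStartIndex are reused from the previous evaluation. For example, haplotypes ACGTA and ACGTT first differ at position 4, so only the final column block is recomputed.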
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
public void initializeConstants( final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
}
constantMatrix[l+1][4] = 0.0;
constantMatrix[l+1][5] = 0.0;
}
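For reference, the six cached constants per read row map onto pair-HMM transitions as follows; the transition names are the editor's reading of initializeConstants above, not labels from the commit:

    // constantMatrix[i][0]  match -> match          qualToProbLog10(insertionGOP + deletionGOP)
    // constantMatrix[i][1]  gap -> match            qualToProbLog10(GCP)
    // constantMatrix[i][2]  match -> insertion      qualToErrorProbLog10(insertionGOP)
    // constantMatrix[i][3]  insertion -> insertion  qualToErrorProbLog10(GCP)
    // constantMatrix[i][4]  match -> deletion       qualToErrorProbLog10(deletionGOP)
    // constantMatrix[i][5]  deletion -> deletion    qualToErrorProbLog10(GCP)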
/**
* Updates a cell in the HMM matrix
*
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior +
MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
XMetricArray[indI - 1][indJ - 1] + constants[1],
YMetricArray[indI - 1][indJ - 1] + constants[1] );
XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
XMetricArray[indI - 1][indJ] + constants[3]);
YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
YMetricArray[indI][indJ - 1] + constants[5]);
}
}
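In standard pair-HMM notation, updateCell above implements the following log10-space recurrence, where \pi_{i,j} is the cached base match/mismatch prior and c_0..c_5 are the cached transition constants for row i (approximateLog10SumLog10 approximates the log10 of the sum):

    M_{i,j} = \pi_{i,j} + \log_{10}\big(10^{M_{i-1,j-1}+c_0} + 10^{X_{i-1,j-1}+c_1} + 10^{Y_{i-1,j-1}+c_1}\big)
    X_{i,j} = \log_{10}\big(10^{M_{i-1,j}+c_2} + 10^{X_{i-1,j}+c_3}\big)
    Y_{i,j} = \log_{10}\big(10^{M_{i,j-1}+c_4} + 10^{Y_{i,j-1}+c_5}\big)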

View File

@ -0,0 +1,187 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class LoglessCachingPairHMM extends CachingPairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProb(DEFAULT_GCP),
QualityUtils.qualToErrorProb(DEFAULT_GOP),
QualityUtils.qualToErrorProb(DEFAULT_GCP),
1.0,
1.0
};
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], 0.0);
Arrays.fill(XMetricArray[iii], 0.0);
Arrays.fill(YMetricArray[iii], 0.0);
}
// the initial condition
matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // the linear-space 1.0, pre-scaled to avoid underflow
constantMatrix = new double[X_METRIC_LENGTH][6];
distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
// fill in the first row
for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
}
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
if( recacheReadValues ) {
initializeConstants( insertionGOP, deletionGOP, overallGCP );
}
initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
for (int i = 2; i < X_METRIC_LENGTH; i++) {
for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
}
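Why the 10^300 scale: the logless variant stores raw probabilities rather than log10 values, so the chain of products over a long read would underflow a double (smallest normal value ~1e-308). Scaling the initial condition by 10^SCALE_FACTOR_LOG10 keeps every cell in range, and the scale is removed exactly once at the end:

    \log_{10} L(\mathrm{read} \mid \mathrm{hap}) = \log_{10}\big(M_{I,J} + X_{I,J} + Y_{I,J}\big) - \mathrm{SCALE\_FACTOR\_LOG10}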
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
public void initializeConstants( final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
}
constantMatrix[l+1][4] = 1.0;
constantMatrix[l+1][5] = 1.0;
}
/**
* Updates a cell in the HMM matrix
*
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
XMetricArray[indI - 1][indJ - 1] * constants[1] +
YMetricArray[indI - 1][indJ - 1] * constants[1] );
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
}
}

View File

@ -14,6 +14,9 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam";
final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals";
final String L = " -L 20:10,100,000-10,120,000 ";
final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam";
final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
private void RRTest(String testName, String args, String md5) {
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
@ -21,36 +24,36 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
@Test(enabled = false)
@Test(enabled = true)
public void testDefaultCompression() {
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
RRTest("testDefaultCompression ", L, "46ea88e32bae3072f5cd68a0db4b55f1");
}
@Test(enabled = false)
@Test(enabled = true)
public void testMultipleIntervals() {
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
RRTest("testMultipleIntervals ", intervals, "c3784a0b42f5456b705f9b152a4b697a");
}
@Test(enabled = false)
@Test(enabled = true)
public void testHighCompression() {
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e385eb0ae5768f8507671d5303a212d5");
}
@Test(enabled = false)
@Test(enabled = true)
public void testLowCompression() {
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "6b5546be9363e493b9838542f5dc8cae");
}
@Test(enabled = false)
@Test(enabled = true)
public void testIndelCompression() {
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f6c9ea83608f35f113cf1f62a77ee6d0");
}
@Test(enabled = false)
@Test(enabled = true)
public void testFilteredDeletionCompression() {
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("122e4e60c4412a31d0aeb3cce879e841")));
}
/**
@ -61,20 +64,26 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
*
* This bam is simplified to replicate the exact bug with the three provided intervals.
*/
@Test(enabled = false)
@Test(enabled = true)
public void testAddingReadAfterTailingTheStash() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("647b0f0f95730de8e6bc4f74186ad4df")));
}
/**
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
* filtered out.
*/
@Test(enabled = false)
@Test(enabled = true)
public void testDivideByZero() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("2c87985972dd43ee9dd50b463d93a511")));
}
@Test(enabled = true)
public void testCoReduction() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("")));
}
}

View File

@ -60,27 +60,27 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a");
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","9ce24f4ff787aed9d3754519a60ef49f");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4");
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","492c8ba9a80a902097ff15bbeb031592");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","7d6f319b9edcb1ff8c290fef150a2df8");
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","848e1092b5cd57b0da5f1187e67134e7");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","dd02890123e07e7412a49475cb6280f1");
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","51a7b51d82a341adec0e6510f5dfadd8");
}
@Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() {
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d");
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","0a8c3b06243040b743dd90d497bb3f83");
}
@Test(enabled = true)

View File

@ -0,0 +1,87 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class AFCalcPerformanceUnitTest extends BaseTest {
@DataProvider(name = "ScalingTests")
public Object[][] makepolyTestProviderLotsOfAlleles() {
List<Object[]> tests = new ArrayList<Object[]>();
// list of all high-quality models in the system
final List<AFCalcFactory.Calculation> biAllelicModels = Arrays.asList(
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcFactory.Calculation.EXACT_REFERENCE);
final List<AFCalcFactory.Calculation> multiAllelicModels = Arrays.asList(
AFCalcFactory.Calculation.EXACT_INDEPENDENT);
// for ( final int nonTypePLs : Arrays.asList(100) ) {
// for ( final int nSamples : Arrays.asList(10000) ) {
// final List<Integer> alleleCounts = Arrays.asList(50);
// for ( final int nAltAlleles : Arrays.asList(1) ) {
for ( final int nonTypePLs : Arrays.asList(100) ) {
for ( final int nSamples : Arrays.asList(100, 1000) ) {
final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 50, 500);
for ( final int nAltAlleles : Arrays.asList(1, 2, 3) ) {
final List<AFCalcFactory.Calculation> models = nAltAlleles > 1 ? multiAllelicModels : biAllelicModels;
for ( final AFCalcFactory.Calculation model : models ) {
for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAltAlleles, true) ) {
if ( MathUtils.sum(ACs) < nSamples * 2 ) {
final AFCalcTestBuilder testBuilder
= new AFCalcTestBuilder(nSamples, nAltAlleles, model, AFCalcTestBuilder.PriorType.human);
tests.add(new Object[]{testBuilder, ACs, nonTypePLs});
}
}
}
}
}
}
return tests.toArray(new Object[][]{});
}
private Pair<Integer, Integer> estNumberOfEvaluations(final AFCalcTestBuilder testBuilder, final VariantContext vc, final int nonTypePL) {
final int evalOverhead = 2;
final int maxEvalsPerSamplePerAC = 3;
int minEvals = 0, maxEvals = 0;
for ( final Allele alt : vc.getAlternateAlleles() ) {
final int AC = vc.getCalledChrCount(alt);
minEvals += AC + evalOverhead; // everyone is hom-var
maxEvals += AC * maxEvalsPerSamplePerAC + 10;
}
return new Pair<Integer, Integer>(minEvals, maxEvals);
}
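As a worked instance of these bounds: a biallelic call with AC = 4 gives minEvals = 4 + 2 = 6 and maxEvals = 4 * 3 + 10 = 22, so the assertions below accept any evaluation count in [6, 22].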
@Test(dataProvider = "ScalingTests")
private void testScaling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL) {
final AFCalc calc = testBuilder.makeModel();
final double[] priors = testBuilder.makePriors();
final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
final AFCalcResult result = calc.getLog10PNonRef(vc, priors);
final Pair<Integer, Integer> expectedNEvaluation = estNumberOfEvaluations(testBuilder, vc, nonTypePL);
final int minEvals = expectedNEvaluation.getFirst();
final int maxEvals = expectedNEvaluation.getSecond();
logger.warn(" min " + minEvals + " obs " + result.getnEvaluations() + " max " + maxEvals + " for test " + testBuilder + " sum(ACs)=" + (int)MathUtils.sum(ACs));
Assert.assertTrue(result.getnEvaluations() >= minEvals,
"Actual number of evaluations " + result.getnEvaluations() + " < min number of evals " + minEvals);
Assert.assertTrue(result.getnEvaluations() <= maxEvals,
"Actual number of evaluations " + result.getnEvaluations() + " > max number of evals " + minEvals);
}
}

View File

@ -0,0 +1,82 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class AFCalcResultUnitTest extends BaseTest {
private static class MyTest {
final double[] Ls, expectedPosteriors;
private MyTest(double[] ls, double[] expectedPosteriors) {
Ls = ls;
this.expectedPosteriors = expectedPosteriors;
}
@Override
public String toString() {
return "Ls [" + Utils.join(",", Ls) + "] expectedPosteriors [" + Utils.join(",", expectedPosteriors) + "]";
}
}
@DataProvider(name = "TestComputePosteriors")
public Object[][] makeTestCombineGLs() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{new MyTest(log10Even, log10Even)});
for ( double L0 = -1e9; L0 < 0.0; L0 /= 10.0 ) {
for ( double L1 = -1e2; L1 < 0.0; L1 /= 100.0 ) {
final double[] input = new double[]{L0, L1};
final double[] expected = MathUtils.normalizeFromLog10(input, true);
tests.add(new Object[]{new MyTest(input, expected)});
}
}
for ( double bigBadL = -1e50; bigBadL > -1e200; bigBadL *= 10 ) {
// test that a huge bad likelihood remains, even with a massive better result
for ( final double betterL : Arrays.asList(-1000.0, -100.0, -10.0, -1.0, -0.1, -0.01, -0.001, 0.0)) {
tests.add(new Object[]{new MyTest(new double[]{bigBadL, betterL}, new double[]{bigBadL, 0.0})});
tests.add(new Object[]{new MyTest(new double[]{betterL, bigBadL}, new double[]{0.0, bigBadL})});
}
}
// test that a modest bad likelihood with an ~0.0 value doesn't get lost
for ( final double badL : Arrays.asList(-10000.0, -1000.0, -100.0, -10.0)) {
tests.add(new Object[]{new MyTest(new double[]{badL, -1e-9}, new double[]{badL, 0.0})});
tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})});
}
// test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost
for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) {
tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})});
}
return tests.toArray(new Object[][]{});
}
final static double[] log10Even = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true);
final static Allele C = Allele.create("C");
final static List<Allele> alleles = Arrays.asList(Allele.create("A", true), C);
@Test(enabled = true, dataProvider = "TestComputePosteriors")
private void testComputingPosteriors(final MyTest data) {
final AFCalcResult result = new AFCalcResult(new int[]{0}, 1, alleles, data.Ls, log10Even, Collections.singletonMap(C, -1.0));
Assert.assertEquals(result.getLog10PosteriorOfAFEq0(), data.expectedPosteriors[0], 1e-3, "AF = 0 not expected");
Assert.assertEquals(result.getLog10PosteriorOfAFGT0(), data.expectedPosteriors[1], 1e-3, "AF > 0 not expected");
final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()};
Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision");
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
@ -124,12 +125,7 @@ public class AFCalcUnitTest extends BaseTest {
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
Arrays.asList(
AFCalcFactory.Calculation.EXACT_REFERENCE,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
), 4, 2, 2, 2);
List<AFCalc> calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2);
final int nPriorValues = 2*nSamples+1;
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
@ -146,7 +142,7 @@ public class AFCalcUnitTest extends BaseTest {
new GetGLsTest(model, 1, genotypes, priors, priorName);
// tri-allelic
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) )
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
new GetGLsTest(model, 2, genotypes, priors, priorName);
}
@ -156,22 +152,28 @@ public class AFCalcUnitTest extends BaseTest {
return GetGLsTest.getTests(GetGLsTest.class);
}
@DataProvider(name = "badGLs")
public Object[][] createBadGLs() {
final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
final int nSamples = genotypes.size();
// @DataProvider(name = "badGLs")
// public Object[][] createBadGLs() {
// final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
// final int nSamples = genotypes.size();
//
// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
//
// final int nPriorValues = 2*nSamples+1;
// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
// for ( AFCalc model : Arrays.asList(indCalc) ) {
// final String priorName = "flat";
// new GetGLsTest(model, 2, genotypes, priors, priorName);
// }
//
// return GetGLsTest.getTests(GetGLsTest.class);
// }
final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
final int nPriorValues = 2*nSamples+1;
final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
for ( AFCalc model : Arrays.asList(indCalc) ) {
final String priorName = "flat";
new GetGLsTest(model, 2, genotypes, priors, priorName);
}
return GetGLsTest.getTests(GetGLsTest.class);
}
//
// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
// public void testBadGLs(GetGLsTest cfg) {
// testResultSimple(cfg);
// }
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testBiallelicGLs(GetGLsTest cfg) {
@ -185,11 +187,6 @@ public class AFCalcUnitTest extends BaseTest {
testResultSimple(cfg);
}
@Test(enabled = true, dataProvider = "badGLs")
public void testBadGLs(GetGLsTest cfg) {
testResultSimple(cfg);
}
private static class NonInformativeData {
final Genotype nonInformative;
final List<Genotype> called;
@ -218,16 +215,14 @@ public class AFCalcUnitTest extends BaseTest {
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
final int nSamples = samples.size();
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
Arrays.asList(
AFCalcFactory.Calculation.EXACT_REFERENCE,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
), 4, 2, 2, 2);
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2);
final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors
for ( AFCalc model : calcs ) {
if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc )
continue;
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
@ -372,16 +367,14 @@ public class AFCalcUnitTest extends BaseTest {
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;
final List<AFCalcFactory.Calculation> constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED);
final double TOLERANCE = 0.5;
final List<PNonRefData> initialPNonRefData = Arrays.asList(
// bi-allelic sites
new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false),
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false),
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false),
new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),
@ -430,6 +423,94 @@ public class AFCalcUnitTest extends BaseTest {
"Actual pNonRef not within tolerance " + tolerance + " of expected");
}
@DataProvider(name = "PNonRefBiallelicSystematic")
public Object[][] makePNonRefBiallelicSystematic() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Integer> bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000);
final List<List<Integer>> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true));
for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) {
if ( false ) { // for testing only
tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))});
} else {
if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK
// test all combinations of PLs for 1 sample
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) {
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
}
final List<List<Integer>> smallDiploidPLs = new LinkedList<List<Integer>>();
for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) {
for ( int i = 0; i < 2; i++ ) {
List<Integer> pls = new ArrayList<Integer>(Collections.nCopies(3, nonRefPL));
pls.set(i, 0);
smallDiploidPLs.add(pls);
}
}
for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) {
tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
}
}
}
return tests.toArray(new Object[][]{});
}
final List<List<Integer>> removeBadPLs(List<List<Integer>> listOfPLs) {
List<List<Integer>> clean = new LinkedList<List<Integer>>();
for ( final List<Integer> PLs : listOfPLs ) {
int x = PLs.get(0);
boolean bad = false;
for ( int pl1 : PLs )
if ( pl1 > x )
bad = true;
else
x = pl1;
if ( ! bad ) clean.add(PLs);
}
return clean;
}
private List<Genotype> toGenotypes(final List<List<Integer>> PLsPerSample) {
final List<Allele> nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
final List<Genotype> genotypes = new ArrayList<Genotype>(PLsPerSample.size());
for ( final List<Integer> PLs : PLsPerSample ) {
final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3]));
final int min = MathUtils.arrayMin(pls);
for ( int i = 0; i < pls.length; i++ ) pls[i] -= min;
genotypes.add(makePL(nocall, pls));
}
return genotypes;
}
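For example, PLs {35, 5, 15} are shifted to {30, 0, 10}, so the most likely genotype carries PL 0 as the VCF convention requires.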
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic")
private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List<Genotype> genotypes) {
//logger.warn("Running " + modelType + " with " + genotypes);
final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human);
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human);
final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C));
vcb.genotypes(genotypes);
final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
final double tolerance = 1e-3;
Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance,
"Actual pNonRef not within tolerance " + tolerance + " of expected");
Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(),
"Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE()));
}
// --------------------------------------------------------------------------------
//
// Test priors
@ -448,7 +529,7 @@ public class AFCalcUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@Test(enabled = true & ! DEBUG_ONLY, dataProvider = "Models")
@Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
public void testBiallelicPriors(final AFCalc model) {
for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
@ -456,26 +537,29 @@ public class AFCalcUnitTest extends BaseTest {
for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) {
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true);
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
final AFCalcResult resultTracker = cfg.execute();
final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];
final double nonRefPrior = (1-refPrior) / 2;
final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior}), true);
if ( ! Double.isInfinite(priors[1]) ) {
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
final AFCalcResult resultTracker = cfg.execute();
final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
final double log10NonRefPost = Math.log10(nonRefPost);
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
final double log10NonRefPost = Math.log10(nonRefPost);
if ( ! Double.isInfinite(log10NonRefPost) )
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);
if ( ! Double.isInfinite(log10NonRefPost) )
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);
if ( nonRefPost >= 0.9 )
Assert.assertTrue(resultTracker.isPolymorphic(C, -1));
if ( nonRefPost >= 0.9 )
Assert.assertTrue(resultTracker.isPolymorphic(C, -1));
final int expectedMLEAC = 1; // the MLE is independent of the prior
Assert.assertEquals(actualAC, expectedMLEAC,
"actual AC with priors " + log10NonRefPrior + " not expected "
+ expectedMLEAC + " priors " + Utils.join(",", priors));
final int expectedMLEAC = 1; // the MLE is independent of the prior
Assert.assertEquals(actualAC, expectedMLEAC,
"actual AC with priors " + log10NonRefPrior + " not expected "
+ expectedMLEAC + " priors " + Utils.join(",", priors));
}
}
}
}
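// A minimal standalone sketch (not from the original test class) of the posterior
// arithmetic asserted above, isolated into a helper; the name and parameters are
// illustrative, but it mirrors the pRef/pHet computation exactly, including the
// het ordering factor.
private static double log10NonRefPosterior(final double log10LikRef, final double log10LikHet,
                                           final double log10PriorRef, final double log10PriorHet) {
    final double pRef = log10LikRef + log10PriorRef;
    final double pHet = log10LikHet + log10PriorHet - Math.log10(0.5);
    return Math.log10(Math.pow(10, pHet) / (Math.pow(10, pRef) + Math.pow(10, pHet)));
}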
@ -494,7 +578,7 @@ public class AFCalcUnitTest extends BaseTest {
// list of all high-quality models in the system
final List<AFCalcFactory.Calculation> models = Arrays.asList(
AFCalcFactory.Calculation.EXACT,
AFCalcFactory.Calculation.getDefaultModel(),
AFCalcFactory.Calculation.EXACT_REFERENCE,
AFCalcFactory.Calculation.EXACT_INDEPENDENT);

View File

@ -1,124 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class ConstrainedAFCalculationModelUnitTest extends BaseTest {
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
return AFCalcUnitTest.makePL(expectedGT, pls);
}
@DataProvider(name = "MaxACsToVisit")
public Object[][] makeMaxACsToVisit() {
List<Object[]> tests = new ArrayList<Object[]>();
final int nSamples = 10;
for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) {
final int nChrom = (nSamples - nNonInformative) * 2;
for ( int i = 0; i < nChrom; i++ ) {
// bi-allelic
tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
// tri-allelic
for ( int j = 0; j < (nChrom - i); j++)
tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
}
}
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "MaxACsToVisit")
public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) {
final int nAlts = requestedACs.size();
final AFCalcTestBuilder testBuilder
= new AFCalcTestBuilder(nSamples, nAlts, modelType,
AFCalcTestBuilder.PriorType.human);
final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
testExpectedACs(vc, maxACsToVisit);
}
private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
// this is necessary because we cannot ensure that the tester gives us back the
// requested ACs due to rounding errors
final List<Integer> ACs = new ArrayList<Integer>();
for ( final Allele a : vc.getAlternateAlleles() )
ACs.add(vc.getCalledChrCount(a));
for ( int i = 0; i < maxACsToVisit.length; i++ ) {
Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
}
}
@DataProvider(name = "MaxACsGenotypes")
public Object[][] makeMaxACsForGenotype() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Allele> AA = Arrays.asList(A, A);
final List<Allele> AC = Arrays.asList(A, C);
final List<Allele> CC = Arrays.asList(C, C);
final List<Allele> AG = Arrays.asList(A, G);
final List<Allele> GG = Arrays.asList(G, G);
final List<Allele> CG = Arrays.asList(C, G);
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});
// make sure non-informative => 0
tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});
// multi-allelics
tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});
// deal with non-informatives third alleles
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
final AFCalcTestBuilder testBuilder
= new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED,
AFCalcTestBuilder.PriorType.human);
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
testExpectedACs(vc, maxACsToVisit);
}
}

View File

@ -137,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy);
calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele];
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}

View File

@ -55,48 +55,14 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "TestCombineGLsWithDrops")
public Object[][] makeTestCombineGLsWithDrops() {
List<Object[]> tests = new ArrayList<Object[]>();
final Set<Integer> noDrops = Collections.emptySet();
final Set<Integer> drop1 = Collections.singleton(1);
final Set<Integer> drop2 = Collections.singleton(2);
// AA AB BB AC BC CC
// drop1 (B): AA AC CC
// drop2 (C): AA AB BB
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops});
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops});
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2});
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1});
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops});
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops});
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2});
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1});
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops});
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops});
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2});
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1});
return tests.toArray(new Object[][]{});
}
private Genotype makePL(final int ... PLs) {
return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
}
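// For reference: with alleles A,B,C the diploid PL vectors above follow the VCF
// genotype ordering AA,AB,BB,AC,BC,CC, i.e. genotype (i,j) with i <= j sits at
// index j*(j+1)/2 + i; e.g. BC = (1,2) -> 2*3/2 + 1 = 4, matching the layout
// sketched in the comment at the top of the data provider.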
@Test(enabled = true, dataProvider = "TestCombineGLs")
private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.<Integer>emptySet());
}
@Test(enabled = true, dataProvider = "TestCombineGLsWithDrops")
private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set<Integer> allelesToDrop) {
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts);
final Genotype combined = calc.combineGLs(testg, altIndex, nAlts);
Assert.assertEquals(combined.getPL(), expected.getPL(),
"Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
@ -120,22 +86,21 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
final Genotype gACcombined = makePL(0, 2, 5);
final Genotype gACcombined2 = makePL(0, 1, 4);
final Genotype gAGcombined = makePL(0, 4, 9);
final Genotype gACdropped = makePL(0, 1, 2);
final Genotype gAGdropped = makePL(0, 3, 5);
// biallelic
tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});
// tri-allelic
tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())});
tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())});
tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())});
tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())});
return tests.toArray(new Object[][]{});
}
@Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts")
@Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts")
private void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);
@ -148,7 +113,8 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
for ( int j = 0; j < actual.getNSamples(); j++ )
Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL());
Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(),
"expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL()));
}
}

View File

@ -21,7 +21,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "8c52c0955099cca3215a0d78fd455894");
HCTest(CEUTRIO_BAM, "", "ee866a8694a6f6c77242041275350ab9");
}
@Test
@ -31,7 +31,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "61c1a0fb62d909229af6b5a91dad8b35");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "53caa950535749f99d3c5b9bb61c7b60");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b4ea70a446e4782bd3700ca14dd726ff");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -64,13 +64,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c54c0c9411054bf629bfd98b616e53fc"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}

View File

@ -23,24 +23,26 @@
*/
// our package
package org.broadinstitute.sting.utils;
package org.broadinstitute.sting.utils.pairhmm;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class PairHMMUnitTest extends BaseTest {
final static boolean EXTENSIVE_TESTING = true;
PairHMM hmm = new PairHMM( false ); // reference implementation
PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding
PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
PairHMM cachingHMM = new CachingPairHMM();
PairHMM loglessHMM = new LoglessCachingPairHMM();
// --------------------------------------------------------------------------------
//
@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
}
@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
}
public double expectedLogL() {
return expectedQual / -10.0;
return (expectedQual / -10.0) + 0.03;
}
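// Illustrative arithmetic: a Phred-scaled expectation of Q = 30 corresponds to a
// log10 likelihood of 30 / -10 = -3.0; the +0.03 above is a small constant offset
// so the theoretical expectation lines up with what the HMM implementations report.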
public double tolerance() {
return 0.1; // TODO FIXME arbitrary
public double toleranceFromTheoretical() {
return 0.2;
}
public double calcLogL() {
public double toleranceFromReference() {
return 1E-4;
}
double logL = hmm.computeReadLikelihoodGivenHaplotype(
public double toleranceFromExact() {
return 1E-9;
}
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
refBasesWithContext, readBasesWithContext,
qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true),
qualAsBytes(gcp, false));
return logL;
qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
qualAsBytes(gcp, false, anchorIndel), 0, true);
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
private byte[] qualAsBytes(final int phredQual, final boolean doGOP) {
private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
final byte phredQuals[] = new byte[readBasesWithContext.length];
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
Arrays.fill(phredQuals, (byte)100);
// update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
}
if( anchorIndel ) {
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
Arrays.fill(phredQuals, (byte)100);
return phredQuals;
}
}
final Random random = new Random(87865573);
private class BandedLikelihoodTestProvider extends TestDataProvider {
final String ref, read;
final byte[] refBasesWithContext, readBasesWithContext;
final int baseQual, insQual, delQual, gcp;
final int expectedQual;
final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
final byte[] baseQuals, insQuals, delQuals, gcps;
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
}
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
this.baseQual = baseQual;
this.delQual = delQual;
this.insQual = insQual;
this.gcp = gcp;
this.read = read;
this.ref = ref;
this.expectedQual = expectedQual;
refBasesWithContext = asBytes(ref, left, right);
readBasesWithContext = asBytes(read, false, false);
baseQuals = qualAsBytes(baseQual);
insQuals = qualAsBytes(insQual);
delQuals = qualAsBytes(delQual);
gcps = qualAsBytes(gcp, false);
}
public double expectedLogL() {
double logL = hmm.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
public double tolerance() {
return 0.2; // TODO FIXME arbitrary
}
public double calcLogL() {
double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
private byte[] qualAsBytes(final int phredQual) {
return qualAsBytes(phredQual, true);
}
private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
final byte phredQuals[] = new byte[readBasesWithContext.length];
Arrays.fill(phredQuals, (byte)phredQual);
if(addRandom) {
for( int iii = 0; iii < phredQuals.length; iii++) {
phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
// update just the bases corresponding to the provided micro read with the quality scores
if( doGOP ) {
phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
}
} else {
Arrays.fill(phredQuals, (byte)phredQual);
}
return phredQuals;
}
}
@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
// test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) {
@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size);
final String big = Utils.dupString((char) base, size);
final String ref = insertionP ? small : big;
final String read = insertionP ? big : small;
@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
}
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
double calculatedLogL = cfg.calcLogL();
double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
}
@DataProvider(name = "BandedLikelihoodTestProvider")
public Object[][] makeBandedLikelihoodTests() {
final Random random = new Random(87860573);
@DataProvider(name = "OptimizedLikelihoodTestProvider")
public Object[][] makeOptimizedLikelihoodTests() {
// context on either side is ACGTTGCA REF ACGTTGCA
// test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) {
for ( final int gcp : gcps ) {
// test substitutions
for ( final byte refBase : BaseUtils.BASES ) {
for ( final byte readBase : BaseUtils.BASES ) {
final String ref = new String(new byte[]{refBase});
final String read = new String(new byte[]{readBase});
final int expected = refBase == readBase ? 0 : baseQual;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
}
}
// test insertions and deletions
for ( final int size : sizes ) {
for ( final byte base : BaseUtils.BASES ) {
final int expected = indelQual + (size - 2) * gcp;
for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size);
final String ref = insertionP ? small : big;
final String read = insertionP ? big : small;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
for ( final int refSize : sizes ) {
for ( final int readSize : sizes ) {
String ref = "";
String read = "";
for( int iii = 0; iii < refSize; iii++) {
ref += (char) BaseUtils.BASES[random.nextInt(4)];
}
for( int iii = 0; iii < readSize; iii++) {
read += (char) BaseUtils.BASES[random.nextInt(4)];
}
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
}
}
}
}
}
return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class);
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
}
@Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true)
public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) {
double calculatedLogL = cfg.calcLogL();
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
double exactLogL = cfg.calcLogL( exactHMM, true );
double calculatedLogL = cfg.calcLogL( originalHMM, true );
double optimizedLogL = cfg.calcLogL( cachingHMM, true );
double loglessLogL = cfg.calcLogL( loglessHMM, true );
double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
}
@Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
double exactLogL = cfg.calcLogL( exactHMM, false );
double calculatedLogL = cfg.calcLogL( originalHMM, false );
double optimizedLogL = cfg.calcLogL( cachingHMM, false );
double loglessLogL = cfg.calcLogL( loglessHMM, false );
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
}
@Test
@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
// change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread,
quals, gop, gop,
gcp);
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
// change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype(
originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread,
quals, gop, gop,
gcp);
gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);

View File

@ -62,7 +62,7 @@ public @interface Argument {
* --help argument is specified.
* @return Doc string associated with this command-line argument.
*/
String doc();
String doc() default "Undocumented option";
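// With the default above in place, a declaration may omit doc entirely; a minimal
// hypothetical example (field and flag names are placeholders):
//   @Argument(fullName = "my_flag", shortName = "mf", required = false)
//   public boolean MY_FLAG = false;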
/**
* Is this argument required. If true, the command-line argument system will

View File

@ -532,7 +532,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
Object[] vals = type.getEnumConstants();
Object defaultEnumeration = null; // as we look at options, record the default option if it exists
for (Object val : vals) {
if (String.valueOf(val).equalsIgnoreCase(value.asString())) return val;
if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val;
try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + " doesn't contain the field " + val.toString()); }
}
@ -546,10 +546,10 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
else
throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
} else if (type.equals(File.class)) {
result = value.asFile();
result = value == null ? null : value.asFile();
} else {
Constructor ctor = type.getConstructor(String.class);
result = ctor.newInstance(value.asString());
result = ctor.newInstance(value == null ? null : value.asString());
}
} catch (UserException e) {
throw e;

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
@ -54,22 +55,60 @@ public class StandardCallerArgumentCollection {
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 3;
public int MAX_ALTERNATE_ALLELES = 6;
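// A hypothetical command line overriding the new default (file paths are
// placeholders; the long flag name comes from the annotation above):
//   java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper -R ref.fasta -I sample.bam \
//        --max_alternate_alleles 4 -o calls.vcf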
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*
* This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on
*/
@Advanced
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
@Deprecated
@Hidden
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "This argument has been retired in GATK 2.2. Please specify just maxAltAlleles from now on, which will apply to any variant, regardless of type", required = false)
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = -1;
/**
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
* will try to remove (N * contamination fraction) bases for each alternate allele.
*/
@Hidden
@Argument(fullName = "contamination_percentage_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_PERCENTAGE = 0.0;
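// Illustrative arithmetic for the doc above: with a pileup of N = 200 total bases
// and -contamination 0.05, the caller attempts to drop 200 * 0.05 = 10 bases for
// each alternate allele.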
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
public StandardCallerArgumentCollection() { }
// Developers must remember to add any newly added arguments to the list here as well, otherwise they won't get changed from their default value!
public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
this.alleles = SCAC.alleles;
this.GenotypingMode = SCAC.GenotypingMode;
this.heterozygosity = SCAC.heterozygosity;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_PERCENTAGE = SCAC.CONTAMINATION_PERCENTAGE;
this.exactCallsLog = SCAC.exactCallsLog;
this.AFmodel = SCAC.AFmodel;
}
/**
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
}

View File

@ -74,8 +74,6 @@ import java.util.*;
*
*/
public abstract class MicroScheduler implements MicroSchedulerMBean {
// TODO -- remove me and retire non nano scheduled versions of traversals
private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true;
protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
/**
@ -157,18 +155,22 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
if ( ! (walker instanceof TreeReducible) ) {
throw badNT("nt", engine, walker);
} else {
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
}
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) {
throw badNT("nct", engine, walker);
}
if ( threadAllocation.getNumDataThreads() > 1 ) {
return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
} else {
if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) )
throw badNT("nct", engine, walker);
return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
}
}
private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
throw new UserException.BadArgumentValue("nt",
throw new UserException.BadArgumentValue(parallelArg,
String.format("The analysis %s currently does not support parallel execution with %s. " +
"Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
}
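// Informal summary of the dispatch above, as read from the code: -nt > 1 requires
// a TreeReducible walker and selects the HierarchicalMicroScheduler; on the
// single-data-thread path, -nct > 1 additionally requires a NanoSchedulable walker,
// and the LinearMicroScheduler is used.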
@ -234,15 +236,9 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
@Ensures("result != null")
private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) {
if (walker instanceof ReadWalker) {
if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 )
return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread());
else
return new TraverseReads();
return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread());
} else if (walker instanceof LocusWalker) {
if ( USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 )
return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread());
else
return new TraverseLociLinear();
return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread());
} else if (walker instanceof DuplicateWalker) {
return new TraverseDuplicates();
} else if (walker instanceof ReadPairWalker) {

View File

@ -123,13 +123,13 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
// This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
// therefore, the user must have failed to specify a type default
if(writerFileName.asFile() == null && generateMD5)
if(writerFileName != null && writerFileName.asFile() == null && generateMD5)
throw new ArgumentException("MD5 generation specified, but no output file specified. If md5 generation is desired, please specify a BAM output file and an md5 file will be written alongside.");
// Create the stub and set parameters.
SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream);
if ( writerFileName.asFile() != null ) {
if (writerFileName != null && writerFileName.asFile() != null ) {
stub = new SAMFileWriterStub(engine, writerFileName.asFile());
if ( compressionLevel != null )

View File

@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
? new VariantContextWriterStub(engine, writerFile, argumentSources)
: new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
stub.setCompressed(isCompressed(writerFileName.asString()));
stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString()));
stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches));
stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches));
stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches));

View File

@ -104,10 +104,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
prevLoc = location;
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus.getLocation());
}
updateCumulativeMetrics(dataProvider.getShard());
// Take the individual isActive calls and integrate them into contiguous active regions and
// add these blocks of work to the work queue
// band-pass filter the list of isActive probabilities and turn into active regions
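// A generic sketch of the idea described above, not the engine's actual band-pass
// filter: threshold per-locus isActive probabilities and merge consecutive loci
// above the threshold into contiguous [start, end] regions. Names and the
// threshold value are illustrative; assumes java.util.List and java.util.ArrayList.
private static List<int[]> probabilitiesToRegions(final double[] isActiveProbs, final double threshold) {
    final List<int[]> regions = new ArrayList<int[]>();
    int start = -1;
    for ( int i = 0; i < isActiveProbs.length; i++ ) {
        if ( isActiveProbs[i] > threshold ) {
            if ( start < 0 ) start = i;               // open a new region
        } else if ( start >= 0 ) {
            regions.add(new int[]{start, i - 1});     // close the current region
            start = -1;
        }
    }
    if ( start >= 0 ) regions.add(new int[]{start, isActiveProbs.length - 1});
    return regions;
}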

View File

@ -1,103 +0,0 @@
package org.broadinstitute.sting.gatk.traversals;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
/**
* A simple solution to iterating over all reference positions across a series of genomic locations.
*/
public abstract class TraverseLociBase<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
/**
* our log, which we want to capture anything from this class
*/
protected static final Logger logger = Logger.getLogger(TraversalEngine.class);
@Override
public final String getTraversalUnits() {
return "sites";
}
protected static class TraverseResults<T> {
final int numIterations;
final T reduceResult;
public TraverseResults(int numIterations, T reduceResult) {
this.numIterations = numIterations;
this.reduceResult = reduceResult;
}
}
protected abstract TraverseResults<T> traverse( final LocusWalker<M,T> walker,
final LocusView locusView,
final LocusReferenceView referenceView,
final ReferenceOrderedView referenceOrderedDataView,
final T sum);
@Override
public T traverse( LocusWalker<M,T> walker,
LocusShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
else
referenceOrderedDataView = (RodLocusView)locusView;
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
sum = result.reduceResult;
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
updateCumulativeMetrics(dataProvider.getShard());
}
// We have a final map call to execute here to clean up the skipped bases from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
// only do this if the walker isn't done!
final RodLocusView rodLocusView = (RodLocusView)locusView;
final long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
final M x = walker.map(null, null, ac);
sum = walker.reduce(x, sum);
}
}
return sum;
}
/**
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
* of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype
* that comes along.
* @param walker walker to interrogate.
* @param dataProvider Data with which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
return new AllLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
return new RodLocusView(dataProvider);
else
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
}

View File

@ -1,47 +0,0 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* A simple solution to iterating over all reference positions across a series of genomic locations.
*/
public class TraverseLociLinear<M,T> extends TraverseLociBase<M,T> {
@Override
protected TraverseResults<T> traverse(LocusWalker<M, T> walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) {
// We keep processing while the next reference location is within the interval
boolean done = false;
int numIterations = 0;
while( locusView.hasNext() && ! done ) {
numIterations++;
final AlignmentContext locus = locusView.next();
final GenomeLoc location = locus.getLocation();
// create reference context. Note that if we have a pileup of "extended events", the context will
// hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
final ReferenceContext refContext = referenceView.getReferenceContext(location);
// Iterate forward to get all reference ordered data covering this location
final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext);
final boolean keepMeP = walker.filter(tracker, refContext, locus);
if (keepMeP) {
final M x = walker.map(tracker, refContext, locus);
sum = walker.reduce(x, sum);
done = walker.isDone();
}
printProgress(locus.getLocation());
}
return new TraverseResults<T>(numIterations, sum);
}
}

View File

@ -1,24 +1,26 @@
package org.broadinstitute.sting.gatk.traversals;
import org.broadinstitute.sting.gatk.WalkerManager;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.LocusView;
import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.*;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import java.util.Iterator;
/**
* A simple solution to iterating over all reference positions across a series of genomic locations.
*/
public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
public class TraverseLociNano<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,LocusShardDataProvider> {
/** our log, which we want to capture anything from this class */
private static final boolean DEBUG = false;
@ -30,6 +32,81 @@ public class TraverseLociNano<M,T> extends TraverseLociBase<M,T> {
}
@Override
public final String getTraversalUnits() {
return "sites";
}
protected static class TraverseResults<T> {
final int numIterations;
final T reduceResult;
public TraverseResults(int numIterations, T reduceResult) {
this.numIterations = numIterations;
this.reduceResult = reduceResult;
}
}
@Override
public T traverse( LocusWalker<M,T> walker,
LocusShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider));
final LocusView locusView = getLocusView( walker, dataProvider );
if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all
//ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider );
ReferenceOrderedView referenceOrderedDataView = null;
if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA )
referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider );
else
referenceOrderedDataView = (RodLocusView)locusView;
final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider );
final TraverseResults<T> result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum );
sum = result.reduceResult;
dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations);
updateCumulativeMetrics(dataProvider.getShard());
}
// We have a final map call to execute here to clean up the skipped bases from the
// last position in the ROD to that in the interval
if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) {
// only do this if the walker isn't done!
final RodLocusView rodLocusView = (RodLocusView)locusView;
final long nSkipped = rodLocusView.getLastSkippedBases();
if ( nSkipped > 0 ) {
final GenomeLoc site = rodLocusView.getLocOneBeyondShard();
final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped);
final M x = walker.map(null, null, ac);
sum = walker.reduce(x, sum);
}
}
return sum;
}
/**
* Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track'
* of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype
* that comes along.
* @param walker walker to interrogate.
* @param dataProvider Data with which to drive the locus view.
* @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal.
*/
private LocusView getLocusView( Walker<M,T> walker, LocusShardDataProvider dataProvider ) {
final DataSource dataSource = WalkerManager.getWalkerDataSource(walker);
if( dataSource == DataSource.READS )
return new CoveredLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers )
return new AllLocusView(dataProvider);
else if( dataSource == DataSource.REFERENCE_ORDERED_DATA )
return new RodLocusView(dataProvider);
else
throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource);
}
protected TraverseResults<T> traverse(final LocusWalker<M, T> walker,
final LocusView locusView,
final LocusReferenceView referenceView,

View File

@ -42,7 +42,7 @@ public class TraverseReadPairs<M,T> extends TraversalEngine<M,T, ReadPairWalker<
public T traverse(ReadPairWalker<M, T> walker,
ReadShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider));
logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider));
if( !dataProvider.hasReads() )
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");

View File

@ -1,111 +0,0 @@
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.traversals;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
/**
* @author aaron
* @version 1.0
* @date Apr 24, 2009
* <p/>
* Class TraverseReads
* <p/>
* This class handles traversing by reads in the new shardable style
*/
public class TraverseReads<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,ReadShardDataProvider> {
/** our log, which we want to capture anything from this class */
protected static final Logger logger = Logger.getLogger(TraverseReads.class);
@Override
public String getTraversalUnits() {
return "reads";
}
/**
* Traverse by reads, given the data and the walker
*
* @param walker the walker to traverse with
* @param dataProvider the provider of the reads data
* @param sum the value of type T, specified by the walker, to feed to the walkers reduce function
* @return the reduce variable of the read walker
*/
public T traverse(ReadWalker<M,T> walker,
ReadShardDataProvider dataProvider,
T sum) {
logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider));
if( !dataProvider.hasReads() )
throw new IllegalArgumentException("Unable to traverse reads; no read data is available.");
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
// get the reference ordered data
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
boolean done = walker.isDone();
// while we still have more reads
for (final SAMRecord read : reads) {
if ( done ) break;
// ReferenceContext -- the reference bases covered by the read
final ReferenceContext refContext = ! read.getReadUnmappedFlag() && dataProvider.hasReference()
? reference.getReferenceContext(read)
: null;
// update the number of reads we've seen
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// if the read is mapped, create a metadata tracker
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 ? rodView.getReferenceOrderedDataForRead(read) : null;
final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read);
if (keepMeP) {
M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null
sum = walker.reduce(x, sum);
}
final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart());
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus);
done = walker.isDone();
}
return sum;
}
}

View File

@ -48,7 +48,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
private Map<String, Object> calculateIC(final VariantContext vc) {
final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds);
if ( genotypes == null || genotypes.size() < MIN_SAMPLES )
if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant())
return null;
int idxAA = 0, idxAB = 1, idxBB = 2;
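// For orientation (standard definition, not visible in this hunk): the inbreeding
// coefficient is F = 1 - observedHets / expectedHets, where expectedHets =
// 2 * p * q * nSamples under Hardy-Weinberg equilibrium for allele frequencies p and q.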

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
* with them.
*
* However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot
* be created easily for small tasks that do not require the rigors of GenomeLocParser validation.
*
* SimpleGenomeLoc is a simple utility to create GenomeLocs without going through the parser. It should
* only be used outside of the engine.
*
* User: carneiro
* Date: 10/16/12
* Time: 2:07 PM
*/
public class SimpleGenomeLoc extends GenomeLoc {
private boolean finished;
public SimpleGenomeLoc(String contigName, int contigIndex, int start, int stop, boolean finished) {
super(contigName, contigIndex, start, stop);
this.finished = finished;
}
public boolean isFinished() {
return finished;
}
}
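// Hypothetical usage outside the engine, as the class doc suggests; the contig
// name/index and coordinates below are placeholders:
//   SimpleGenomeLoc loc = new SimpleGenomeLoc("20", 19, 1000000, 1000100, false);
//   if ( !loc.isFinished() ) { /* keep accumulating data for this region */ }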

View File

@ -57,7 +57,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
@ -231,7 +231,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
int count = 0;
for (PileupElement p : pileup) {
if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase()))
count++;
count += p.getRepresentativeCount();
}
return count;
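// Note: the switch from count++ to p.getRepresentativeCount() matters for reduced
// reads, where one consensus pileup element can stand in for many original reads;
// e.g. a consensus element representing 7 underlying reads now contributes 7 to
// the count instead of 1 (counts here are invented for illustration).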

View File

@ -113,23 +113,20 @@ public class PerReadAlleleLikelihoodMap {
return likelihoodReadMap.get(p.getRead());
}
public static Allele getMostLikelyAllele(Map<Allele,Double> alleleMap) {
double minLike = Double.POSITIVE_INFINITY, maxLike = Double.NEGATIVE_INFINITY;
public static Allele getMostLikelyAllele( final Map<Allele,Double> alleleMap ) {
double maxLike = Double.NEGATIVE_INFINITY;
double prevMaxLike = Double.NEGATIVE_INFINITY;
Allele mostLikelyAllele = Allele.NO_CALL;
for (Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
for (final Map.Entry<Allele,Double> el : alleleMap.entrySet()) {
if (el.getValue() > maxLike) {
prevMaxLike = maxLike;
maxLike = el.getValue();
mostLikelyAllele = el.getKey();
} else if( el.getValue() > prevMaxLike ) {
prevMaxLike = el.getValue();
}
if (el.getValue() < minLike)
minLike = el.getValue();
}
if (maxLike-minLike > INDEL_LIKELIHOOD_THRESH)
return mostLikelyAllele;
else
return Allele.NO_CALL;
return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? mostLikelyAllele : Allele.NO_CALL );
}
}
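A sketch of the reworked selection rule; refAllele and altAllele stand for Allele instances built elsewhere, and the winner is returned only when its log10 likelihood beats the runner-up by more than INDEL_LIKELIHOOD_THRESH:

final Map<Allele, Double> alleleMap = new HashMap<Allele, Double>();
alleleMap.put(refAllele, -10.0); // runner-up
alleleMap.put(altAllele, -1.0);  // winner, by a margin of 9
final Allele best = PerReadAlleleLikelihoodMap.getMostLikelyAllele(alleleMap); // altAllele
// a winner at -9.9 would leave only a 0.1 margin, yielding Allele.NO_CALL
// whenever INDEL_LIKELIHOOD_THRESH exceeds 0.1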

View File

@ -41,19 +41,20 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.*;
public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final boolean useAlleleFromVCF;
private final double[] likelihoodSums = new double[4];
private final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
}
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
@ -78,8 +79,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
ArrayList<SampleGenotypeData> GLs = new ArrayList<SampleGenotypeData>(contexts.size());
for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
if ( UAC.CONTAMINATION_PERCENTAGE > 0.0 )
pileup = createDecontaminatedPileup(pileup, UAC.CONTAMINATION_PERCENTAGE);
if ( useBAQedPileup )
pileup = createBAQedPileup( pileup );
pileup = createBAQedPileup(pileup);
// create the GenotypeLikelihoods object
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error);
@ -150,8 +153,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// create the genotypes; no-call everyone for now
final GenotypesContext genotypes = GenotypesContext.create();
final List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
for ( SampleGenotypeData sampleData : GLs ) {
final double[] allLikelihoods = sampleData.GL.getLikelihoods();
@ -202,6 +203,42 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
return allelesToUse;
}
public ReadBackedPileup createDecontaminatedPileup(final ReadBackedPileup pileup, final double contaminationPercentage) {
// special case removal of all reads
if ( contaminationPercentage >= 1.0 )
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
// start by stratifying the reads by the alleles they represent at this position
for( final PileupElement pe : pileup ) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
int numReadsToRemove = (int)Math.ceil((double)pileup.getNumberOfElements() * contaminationPercentage);
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
@Override
public int compare(PileupElement element1, PileupElement element2) {
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
}
});
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
if ( alleleList.size() > numReadsToRemove )
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove));
}
// clean up pointers so memory can be garbage collected if needed
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i].clear();
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
}
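// Worked example of the logic above: with CONTAMINATION_PERCENTAGE = 0.05 and a
// 100-element pileup, numReadsToRemove = ceil(100 * 0.05) = 5, so each allele with
// more than 5 supporting elements keeps all but 5 randomly chosen ones, while an
// allele with 5 or fewer supporting elements is dropped from the pileup entirely.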
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
for( final PileupElement PE : pileup ) {
@ -220,6 +257,22 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
}
private List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove) {
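// randomly choose numElementsToRemove indices to drop, then copy the
// survivors in their original pileup order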
final int pileupSize = elements.size();
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( !itemsToRemove.get(i) )
elementsToKeep.add(elements.get(i));
}
return elementsToKeep;
}
private static class SampleGenotypeData {
public final String name;

View File

@ -27,23 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
/**
* Controls the model used to calculate the probability that a site is variant, together with the sample genotypes, at a given locus.
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot
* reliably distinguish between PCR errors and sequencing errors. The practical implication for this value is that it
@ -65,6 +57,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
/**
* The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL;
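// For reference, a command line exercising this argument might look like the
// following (file names illustrative; ORIGINAL is the default shown above):
//   java -jar GenomeAnalysisTK.jar -T UnifiedGenotyper -R ref.fasta -I sample.bam \
//        -glm INDEL --pair_hmm_implementation ORIGINAL -o calls.vcf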
/**
* The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base
* is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
@ -112,10 +110,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false)
public int INDEL_HAPLOTYPE_SIZE = 80;
@Hidden
@Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false)
public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false;
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@ -183,63 +177,57 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
// Developers must remember to add any newly added arguments to the list here as well, otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
/**
* Create a new UAC with defaults for all UAC arguments
*/
public UnifiedArgumentCollection() {
super();
}
uac.GLmodel = GLmodel;
uac.AFmodel = AFmodel;
uac.heterozygosity = heterozygosity;
uac.PCR_error = PCR_error;
uac.GenotypingMode = GenotypingMode;
uac.OutputMode = OutputMode;
uac.NO_SLOD = NO_SLOD;
uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE;
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.alleles = alleles;
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
uac.GLmodel = GLmodel;
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
uac.referenceSampleRod = referenceSampleRod;
uac.referenceSampleName = referenceSampleName;
uac.samplePloidy = samplePloidy;
uac.maxQualityScore = minQualityScore;
uac.phredScaledPrior = phredScaledPrior;
uac.minPower = minPower;
uac.minReferenceDepth = minReferenceDepth;
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
uac.exactCallsLog = exactCallsLog;
/**
* Create a new UAC based only on the information in our super-class scac, with defaults for all UAC-specific arguments
* @param scac the standard caller argument collection to copy values from
*/
public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) {
super(scac);
}
/**
* Create a new UAC with all parameters having the values in uac
*
* @param uac the argument collection whose values are copied into the new instance
*/
public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) {
// Developers must remember to add any newly added arguments to the list here as well, otherwise they won't get changed from their default value!
super(uac);
this.GLmodel = uac.GLmodel;
this.AFmodel = uac.AFmodel;
this.PCR_error = uac.PCR_error;
this.NO_SLOD = uac.NO_SLOD;
this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE;
this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION;
this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING;
this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE;
this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY;
this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY;
this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY;
this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO;
this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE;
this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL;
this.referenceSampleRod = uac.referenceSampleRod;
this.referenceSampleName = uac.referenceSampleName;
this.samplePloidy = uac.samplePloidy;
this.maxQualityScore = uac.minQualityScore;
this.phredScaledPrior = uac.phredScaledPrior;
this.minPower = uac.minPower;
this.minReferenceDepth = uac.minReferenceDepth;
this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
this.pairHMM = uac.pairHMM;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION;
return uac;
}
public UnifiedArgumentCollection() { }
public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) {
super();
this.alleles = SCAC.alleles;
this.GenotypingMode = SCAC.GenotypingMode;
this.heterozygosity = SCAC.heterozygosity;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.exactCallsLog = SCAC.exactCallsLog;
this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
}
}

View File

@ -29,54 +29,62 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* Generic interface for calculating the probability of alleles segregating, given priors and genotype likelihoods
*
*/
public abstract class AFCalc implements Cloneable {
private final static Logger defaultLogger = Logger.getLogger(AFCalc.class);
protected final int nSamples;
protected final int maxAlternateAllelesToGenotype;
protected final int maxAlternateAllelesForIndels;
protected Logger logger = defaultLogger;
private SimpleTimer callTimer = new SimpleTimer();
private PrintStream callReport = null;
private final AFCalcResultTracker resultTracker;
private final StateTracker stateTracker;
private ExactCallLogger exactCallLogger = null;
protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
/**
* Create a new AFCalc object capable of calculating the probability that alleles are
* segregating among nSamples samples of the given ploidy, considering up to maxAltAlleles
* alternate alleles
*
* @param nSamples number of samples, must be >= 0
* @param maxAltAlleles the maximum number of alternate alleles to consider for SNPs, must be >= 1
* @param ploidy the sample ploidy, must be >= 1
*/
protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than or equal to zero, saw " + nSamples);
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy);
this.nSamples = nSamples;
this.maxAlternateAllelesToGenotype = maxAltAlleles;
this.maxAlternateAllelesForIndels = maxAltAllelesForIndels;
this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
this.stateTracker = new StateTracker(maxAltAlleles);
}
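// Construction sketch against the new three-argument signature, via the factory
// updated below (sample count illustrative):
//   final AFCalc calc = AFCalcFactory.createAFCalc(100, 2, 2); // 100 samples, up to 2 alt alleles, diploid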
/**
* Enable exact call logging to file
*
* @param exactCallsLog the destination file
*/
public void enableProcessLog(final File exactCallsLog) {
initializeOutputFile(exactCallsLog);
exactCallLogger = new ExactCallLogger(exactCallsLog);
}
/**
* Use this logger instead of the default logger
*
* @param logger the logger to use from now on
*/
public void setLogger(Logger logger) {
this.logger = logger;
}
@ -91,10 +99,10 @@ public abstract class AFCalc implements Cloneable {
public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
if ( resultTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null");
// reset the result, so we can store our new result there
resultTracker.reset();
stateTracker.reset();
final VariantContext vcWorking = reduceScope(vc);
@ -102,16 +110,26 @@ public abstract class AFCalc implements Cloneable {
final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors);
final long nanoTime = callTimer.getElapsedTimeNano();
if ( callReport != null )
printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero());
if ( exactCallLogger != null )
exactCallLogger.printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result);
return result;
}
@Deprecated
protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors);
/**
* Convert the final state of the state tracker into our result as an AFCalcResult
*
* Assumes that stateTracker has been updated accordingly
*
* @param vcWorking the VariantContext we actually used as input to the calc model (after reduction)
* @param log10AlleleFrequencyPriors the priors by AC vector
* @return an AFCalcResult describing the result of this calculation
*/
@Requires("stateTracker.getnEvaluations() >= 0")
@Ensures("result != null")
protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors);
}
// ---------------------------------------------------------------------------
@ -142,11 +160,13 @@ public abstract class AFCalc implements Cloneable {
* @param log10AlleleFrequencyPriors priors
* @return an AFCalcResult object describing the results of this calculation
*/
// TODO -- add consistent requires among args
@Requires({"vc != null", "log10AlleleFrequencyPriors != null"})
protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors);
/**
* Subset VC to just the allelesToUse, updating genotype likelihoods
*
* Must be overridden by concrete subclasses
*
* @param vc variant context with alleles and genotype likelihoods
@ -167,58 +187,11 @@ public abstract class AFCalc implements Cloneable {
// ---------------------------------------------------------------------------
public int getMaxAltAlleles() {
return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels);
return maxAlternateAllelesToGenotype;
}
// ---------------------------------------------------------------------------
//
// Print information about the call to the calls log
//
// ---------------------------------------------------------------------------
private void initializeOutputFile(final File outputFile) {
try {
if (outputFile != null) {
callReport = new PrintStream( new FileOutputStream(outputFile) );
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
protected StateTracker getStateTracker() {
return stateTracker;
}
private void printCallInfo(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final double log10PosteriorOfAFzero) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for ( final Allele a : vc.getAlleles() )
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for ( final Genotype g : vc.getGenotypes() )
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);
callReport.flush();
}
private void printCallElement(final VariantContext vc,
final Object variable,
final Object key,
final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
public AFCalcResultTracker getResultTracker() {
return resultTracker;
}
}

View File

@ -24,19 +24,12 @@ public class AFCalcFactory {
* the needs of the request (i.e., considering ploidy).
*/
public enum Calculation {
/** The default implementation */
EXACT(ReferenceDiploidExactAFCalc.class, 2, -1),
/** reference implementation of multi-allelic EXACT model */
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
/** expt. implementation */
@Deprecated
EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1),
/** expt. implementation -- for testing only */
EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),
/** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
/** original biallelic exact model, for testing only */
EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),
@ -64,6 +57,8 @@ public class AFCalcFactory {
return (requiredPloidy == -1 || requiredPloidy == requestedPloidy)
&& (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles);
}
public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; }
}
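// Worked example of usableForParams above: EXACT_ORIGINAL declares requiredPloidy 2
// and maxAltAlleles 2, so usableForParams(2, 2) is true while usableForParams(2, 3)
// is false; a model declaring -1 for either field accepts any value for that field.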
private static final Map<String, Class<? extends AFCalc>> afClasses;
@ -96,7 +91,7 @@ public class AFCalcFactory {
public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC,
final int nSamples,
final Logger logger) {
final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS);
final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES;
if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) {
logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");
final List<Calculation> supportingCalculations = new LinkedList<Calculation>();
@ -114,7 +109,7 @@ public class AFCalcFactory {
logger.info("Selecting model " + UAC.AFmodel);
}
final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy);
final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy);
if ( logger != null ) calc.setLogger(logger);
if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);
@ -131,7 +126,7 @@ public class AFCalcFactory {
* @return an initialized AFCalc
*/
public static AFCalc createAFCalc(final int nSamples) {
return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2);
return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2);
}
/**
@ -144,7 +139,7 @@ public class AFCalcFactory {
* @return an initialized AFCalc
*/
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) {
return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2);
return createAFCalc(calc, nSamples, maxAltAlleles, 2);
}
/**
@ -152,14 +147,12 @@ public class AFCalcFactory {
*
* @param nSamples the number of samples we'll be using
* @param maxAltAlleles the max. alt alleles to consider for SNPs
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
* @param ploidy the sample ploidy. Must be consistent with the calc
*
* @return an initialized AFCalc
*/
public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy);
}
/**
@ -186,20 +179,17 @@ public class AFCalcFactory {
* @param calc the calculation to use
* @param nSamples the number of samples we'll be using
* @param maxAltAlleles the max. alt alleles to consider for SNPs
* @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
* @param ploidy the sample ploidy. Must be consistent with the calc
*
* @return an initialized AFCalc
*/
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) {
if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null");
if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than or equal to zero, saw " + nSamples);
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy);
final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
if ( ! calc.usableForParams(ploidy, maxAlt) )
if ( ! calc.usableForParams(ploidy, maxAltAlleles) )
throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy);
final Class<? extends AFCalc> afClass = getClassByName(calc.className);
@ -207,19 +197,19 @@ public class AFCalcFactory {
throw new IllegalArgumentException("Unexpected AFCalc " + calc);
try {
Object args[] = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy};
Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class);
Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy};
Constructor c = afClass.getDeclaredConstructor(int.class, int.class, int.class);
return (AFCalc)c.newInstance(args);
} catch (Exception e) {
throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e);
}
}
protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int ploidy) {
final List<AFCalc> AFCalcs = new LinkedList<AFCalc>();
for ( final Calculation calc : calcs )
AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy));
AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy));
return AFCalcs;
}

View File

@ -31,10 +31,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* Describes the results of the AFCalc
@ -86,8 +83,8 @@ public class AFCalcResult {
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
this.alleleCountsOfMLE = alleleCountsOfMLE;
this.nEvaluations = nEvaluations;
@ -150,7 +147,7 @@ public class AFCalcResult {
* Due to computational / implementation constraints this may be smaller than
* the actual list of alleles requested
*
* @return a non-empty list of alleles used during genotyping
* @return a non-empty list of alleles used during genotyping, the first of which is the reference allele
*/
@Ensures({"result != null", "! result.isEmpty()"})
public List<Allele> getAllelesUsedInGenotyping() {
@ -162,7 +159,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PosteriorOfAFEq0() {
return log10PosteriorsOfAC[AF0];
}
@ -172,7 +169,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PosteriorOfAFGT0() {
return log10PosteriorsOfAC[AF1p];
}
@ -182,7 +179,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFEq0() {
return log10LikelihoodsOfAC[AF0];
}
@ -192,7 +189,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFGT0() {
return log10LikelihoodsOfAC[AF1p];
}
@ -202,7 +199,7 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PriorOfAFEq0() {
return log10PriorsOfAC[AF0];
}
@ -212,11 +209,19 @@ public class AFCalcResult {
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
@Ensures({"MathUtils.goodLog10Probability(result)"})
public double getLog10PriorOfAFGT0() {
return log10PriorsOfAC[AF1p];
}
@Override
public String toString() {
final List<String> byAllele = new LinkedList<String>();
for ( final Allele a : getAllelesUsedInGenotyping() )
if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a)));
return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele));
}
/**
* Are we sufficiently confident in being non-ref that the site is considered polymorphic?
*
@ -233,6 +238,19 @@ public class AFCalcResult {
return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef;
}
/**
* Are any of the alleles polymorphic w.r.t. #isPolymorphic?
*
* @param log10minPNonRef the confidence threshold, in log10 space
* @return true if any allele is polymorphic, false otherwise
*/
public boolean anyPolymorphic(final double log10minPNonRef) {
for ( final Allele a : getAllelesUsedInGenotyping() )
if ( a.isNonReference() && isPolymorphic(a, log10minPNonRef) )
return true;
return false;
}
/**
* Returns the log10 probability that allele is segregating
*
@ -245,7 +263,7 @@ public class AFCalcResult {
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
* @return the log10 probability that allele is segregating at this site
*/
@Ensures("goodLog10Probability(result)")
@Ensures("MathUtils.goodLog10Probability(result)")
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
@ -261,43 +279,12 @@ public class AFCalcResult {
* @return freshly allocated log10 normalized posteriors vector
*/
@Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length")
@Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
@Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) {
final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length];
for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ )
log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i];
// necessary because the posteriors may be so skewed that the log-space normalized value isn't
// good, so we have to try both log-space normalization as well as the real-space normalization if the
// result isn't good
final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true);
if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) )
return logNormalized;
else
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
}
/**
* Check that the log10 prob vector vector is well formed
*
* @param vector
* @param expectedSize
* @param shouldSumToOne
*
* @return true if vector is well-formed, false otherwise
*/
private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
if ( vector.length != expectedSize ) return false;
for ( final double pr : vector ) {
if ( ! goodLog10Probability(pr) )
return false;
}
if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 )
return false;
return true; // everything is good
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
}
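// In log space the computation above reduces to
//   log10 posterior_i = (log10 L_i + log10 prior_i) - log10( sum_j 10^(log10 L_j + log10 prior_j) )
// with the normalizing sum handled inside MathUtils.normalizeFromLog10.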
/**
@ -321,14 +308,4 @@ public class AFCalcResult {
else
return index - 1;
}
/**
* Checks that the result is a well-formed log10 probability
*
* @param result a supposedly well-formed log10 probability value
* @return true if result is really well formed
*/
private static boolean goodLog10Probability(final double result) {
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
}
}

View File

@ -1,256 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by IntelliJ IDEA.
* User: ebanks
* Date: Dec 14, 2011
*
* Useful helper class to communicate the results of the allele frequency calculation
*
* TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
*/
class AFCalcResultTracker {
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
protected double log10MLE;
protected double log10MAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// The posteriors seen, not including that of AF=0
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE];
private int currentLikelihoodsCacheIndex = 0;
protected Double log10LikelihoodsMatrixSum = null;
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
private int[] AClimits;
int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capable of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public AFCalcResultTracker(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*
* @return a vector with allele counts, not all of which may be meaningful
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MAP
*
* @see #getAlleleCountsOfMLE() for the encoding of results in this vector
*
* @return a non-null vector of ints
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
/**
* Returns the likelihoods summed across all AC values for AC > 0
*
* @return
*/
public double getLog10LikelihoodOfAFNotZero() {
if ( log10LikelihoodsMatrixSum == null ) {
if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO;
else
log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
}
return log10LikelihoodsMatrixSum;
}
public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
/**
* TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
*
* @return
*/
public double getLog10PosteriorOfAFzero() {
return log10PosteriorOfAFzero;
}
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
// TODO -- replace with more meaningful computation
// TODO -- refactor this calculation into the ref calculation
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
for ( int i = 0; i < subACOfMLE.length; i++ ) {
final Allele allele = allelesUsedInGenotyping.get(i+1);
final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
log10pNonRefByAllele.put(allele, log10PNonRef);
}
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
*
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
*/
protected void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED;
for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
currentLikelihoodsCacheIndex = 0;
log10LikelihoodsMatrixSum = null;
allelesUsedInGenotyping = null;
nEvaluations = 0;
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
}
/**
* Tell this result we used one more evaluation cycle
*/
protected void incNEvaluations() {
nEvaluations++;
}
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToLikelihoodsCache(log10LofK);
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMLE[i] = alleleCountsForK[i];
}
}
protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
alleleCountsOfMAP[i] = alleleCountsForK[i];
}
}
private void addToLikelihoodsCache(final double log10LofK) {
// add to the cache
log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
log10LikelihoodsMatrixValues[0] = temporarySum;
currentLikelihoodsCacheIndex = 1;
}
}
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
Arrays.fill(alleleCountsOfMLE, 0);
}
}
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
protected void setAClimits(int[] AClimits) {
this.AClimits = AClimits;
}
}

View File

@ -1,107 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@Deprecated
public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc {
protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
final int[] maxACsToConsider = computeMaxACs(vc);
resultTracker.setAClimits(maxACsToConsider);
return new StateTracker(maxACsToConsider);
}
/**
* Computes the maximum ACs we need to consider for each alt allele
*
* Walks over the genotypes in VC, and computes for each alt allele the maximum
* AC we need to consider in that alt allele dimension. The calculation is based on the
* PLs in each genotype g, updating the max AC for the alt alleles corresponding to the
* lowest PL. If multiple genotype configurations share the same lowest PL value, only
* the first is taken, in the order of the alt alleles.
*
* @param vc the variant context we will compute max alt alleles for
* @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the
* first alt allele.
*/
@Ensures("result != null")
protected final int[] computeMaxACs(final VariantContext vc) {
final int[] maxACs = new int[vc.getNAlleles()-1];
for ( final Genotype g : vc.getGenotypes() )
updateMaxACs(g, maxACs);
return maxACs;
}
/**
* Update the maximum achievable allele counts in maxAC according to the PLs in g
*
* Selects the maximum genotype configuration from the PLs in g, and updates
* the maxAC for this configure. For example, if the lowest PL is for 0/1, updates
* the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for
* many number of alt alleles (determined by length of maxACs).
*
* If the max PL occurs at 0/0, updates nothing
* Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have
* the same PL value, then updates the first one.
*
* Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL,
* then only first one (1) will be updated
*
* @param g the genotype to update
* @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele)
*/
@Requires({
"g != null",
"maxACs != null",
"goodMaxACs(maxACs)"})
private void updateMaxACs(final Genotype g, final int[] maxACs) {
final int[] PLs = g.getLikelihoods().getAsPLs();
int minPLi = 0;
int minPL = PLs[0];
for ( int i = 0; i < PLs.length; i++ ) {
if ( PLs[i] < minPL ) {
minPL = PLs[i];
minPLi = i;
}
}
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi);
updateMaxACs(maxACs, pair.alleleIndex1);
updateMaxACs(maxACs, pair.alleleIndex2);
}
/**
* Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref)
*
* If alleleI == 0 => doesn't update anything
* else maxACs[alleleI - 1]++
*
* @param maxACs array of max alt allele ACs
* @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles.
*/
@Requires({
"alleleI >= 0",
"(alleleI - 1) < maxACs.length",
"goodMaxACs(maxACs)"})
private void updateMaxACs(final int[] maxACs, final int alleleI) {
if ( alleleI > 0 )
maxACs[alleleI-1]++;
}
private static boolean goodMaxACs(final int[] maxACs) {
return MathUtils.sum(maxACs) >= 0;
}
}

View File

@ -31,13 +31,11 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public abstract class DiploidExactAFCalc extends ExactAFCalc {
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
}
protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker);
@Override
protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
@ -58,43 +56,33 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker());
while ( !ACqueue.isEmpty() ) {
getResultTracker().incNEvaluations(); // keep track of the number of evaluations
getStateTracker().incNEvaluations(); // keep track of the number of evaluations
// compute log10Likelihoods
final ExactACset set = ACqueue.remove();
if ( stateTracker.withinMaxACs(set.getACcounts()) ) {
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker());
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors);
// adjust max likelihood seen if needed
stateTracker.update(log10LofKs, set.getACcounts());
// clean up memory
indexesToACset.remove(set.getACcounts());
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
// clean up memory
indexesToACset.remove(set.getACcounts());
//if ( DEBUG )
// System.out.printf(" *** removing used set=%s%n", set.ACcounts);
}
return resultFromTracker(vc, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
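// For orientation: the loop above expands allele-count conformations breadth-first
// from the all-zero AC set, computing each conformation's likelihood; the
// StateTracker records the running MLE/MAP and aborts the expansion once a
// conformation's likelihood falls far enough below the best seen.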
@Override
protected VariantContext reduceScope(final VariantContext vc) {
final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) {
logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
VariantContextBuilder builder = new VariantContextBuilder(vc);
List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
List<Allele> alleles = new ArrayList<Allele>(getMaxAltAlleles() + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles()));
builder.alleles(alleles);
builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
return builder.make();
@ -151,23 +139,21 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
private double calculateAlleleCountConformation(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final StateTracker stateTracker,
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final double[] log10AlleleFrequencyPriors) {
//if ( DEBUG )
// System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
// compute the log10Likelihoods
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker);
computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors);
final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// can we abort early because the log10Likelihoods are so small?
if ( stateTracker.abort(log10LofK, set.getACcounts()) ) {
if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) {
//if ( DEBUG )
// System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK;
@ -186,7 +172,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
ACcountsClone[allele]++;
// to get to this conformation, a sample would need to be AB (remember that ref=0)
final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
// add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
@ -211,9 +197,9 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
// IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
for ( DependentSet dependent : differentAlleles )
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
for ( DependentSet dependent : sameAlleles )
updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
}
return log10LofK;
@ -221,8 +207,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also pushes its value to the given callingSetIndex.
private void updateACset(final StateTracker stateTracker,
final int[] newSetCounts,
private void updateACset(final int[] newSetCounts,
final int numChr,
final ExactACset dependentSet,
final int PLsetIndex,
@ -244,8 +229,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
private void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[] log10AlleleFrequencyPriors,
final AFCalcResultTracker resultTracker) {
final double[] log10AlleleFrequencyPriors) {
set.getLog10Likelihoods()[0] = 0.0; // the zero case
final int totalK = set.getACsum();
@ -256,8 +240,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
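// The AF == 0 likelihood above is just the running log10 product of each sample's
// hom-ref GL. A standalone sketch of the same accumulation (illustrative values,
// not part of this commit):
final double[] homRefGLs = { -0.05, -0.10, -0.02 }; // log10 P(reads_i | hom-ref) per sample
double log10Lof0Sketch = 0.0;
for ( final double gl : homRefGLs )
    log10Lof0Sketch += gl; // a product in real space is a sum in log10 space: -0.17 total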
@ -279,14 +263,15 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc {
double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
// update the MLE if necessary
resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());
// apply the priors over each alternate allele
for ( final int ACcount : set.getACcounts().getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
}
private void pushData(final ExactACset targetSet,

View File

@ -39,8 +39,8 @@ import java.util.ArrayList;
abstract class ExactAFCalc extends AFCalc {
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
}
/**

View File

@ -0,0 +1,179 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
import java.util.*;
/**
* Allows us to write out and read in information about exact calls (site, alleles, PLs, etc.) in tabular format.
*
* Once opened, calls can be written to disk with printCallInfo.
*/
public class ExactCallLogger implements Cloneable {
private PrintStream callReport = null;
/**
* Create a new ExactCallLogger writing its output to outputFile
*
* @param outputFile the file where the tabular call report will be written
*/
public ExactCallLogger(final File outputFile) {
try {
callReport = new PrintStream(new BufferedOutputStream(new FileOutputStream(outputFile), 10000000));
callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
}
}
/**
* Summarizes information about an exact call that happened
*/
public static class ExactCall {
final VariantContext vc;
final long runtime;
final AFCalcResult originalCall;
public ExactCall(VariantContext vc, final long runtime, final AFCalcResult originalCall) {
this.vc = vc;
this.runtime = runtime;
this.originalCall = originalCall;
}
@Override
public String toString() {
return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s",
vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(),
originalCall.getLog10PosteriorOfAFGT0(),
new AutoFormattingTime(runtime / 1e9).toString());
}
}
protected final void printCallInfo(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final long runtimeNano,
final AFCalcResult result) {
printCallElement(vc, "type", "ignore", vc.getType());
int allelei = 0;
for (final Allele a : vc.getAlleles())
printCallElement(vc, "allele", allelei++, a.getDisplayString());
for (final Genotype g : vc.getGenotypes())
printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());
for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++)
printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);
printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0());
printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0());
for ( final Allele allele : result.getAllelesUsedInGenotyping() ) {
if ( allele.isNonReference() ) {
printCallElement(vc, "MLE", allele, result.getAlleleCountAtMLE(allele));
printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele));
}
}
callReport.flush();
}
@Requires({"vc != null", "variable != null", "key != null", "value != null", "callReport != null"})
private void printCallElement(final VariantContext vc,
final Object variable,
final Object key,
final Object value) {
final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
/**
* Read in a list of ExactCall objects from reader, keeping only those whose
* start positions appear in startsToKeep, or all calls if startsToKeep is empty
*
* @param reader a just-opened reader sitting at the start of the file
* @param startsToKeep a list of start position of the calls to keep, or empty if all calls should be kept
* @param parser a genome loc parser to create genome locs
* @return a list of ExactCall objects in reader
* @throws IOException if an error occurs while reading from reader
*/
public static List<ExactCall> readExactLog(final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException {
if ( reader == null ) throw new IllegalArgumentException("reader cannot be null");
if ( startsToKeep == null ) throw new IllegalArgumentException("startsToKeep cannot be null");
if ( parser == null ) throw new IllegalArgumentException("GenomeLocParser cannot be null");
List<ExactCall> calls = new LinkedList<ExactCall>();
// skip the header line
reader.readLine();
// skip the first "type" line
reader.readLine();
while (true) {
final VariantContextBuilder builder = new VariantContextBuilder();
final List<Allele> alleles = new ArrayList<Allele>();
final List<Genotype> genotypes = new ArrayList<Genotype>();
final double[] posteriors = new double[2];
final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true);
final List<Integer> mle = new ArrayList<Integer>();
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>();
long runtimeNano = -1;
GenomeLoc currentLoc = null;
while (true) {
final String line = reader.readLine();
if (line == null)
return calls;
final String[] parts = line.split("\t");
final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
final String variable = parts[1];
final String key = parts[2];
final String value = parts[3];
if (currentLoc == null)
currentLoc = lineLoc;
if (variable.equals("type")) {
if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) {
builder.alleles(alleles);
final int stop = currentLoc.getStart() + alleles.get(0).length() - 1;
builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
builder.genotypes(genotypes);
final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{}));
final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele);
calls.add(new ExactCall(builder.make(), runtimeNano, result));
}
break;
} else if (variable.equals("allele")) {
final boolean isRef = key.equals("0");
alleles.add(Allele.create(value, isRef));
} else if (variable.equals("PL")) {
final GenotypeBuilder gb = new GenotypeBuilder(key);
gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
genotypes.add(gb.make());
} else if (variable.equals("log10PosteriorOfAFEq0")) {
posteriors[0] = Double.valueOf(value);
} else if (variable.equals("log10PosteriorOfAFGt0")) {
posteriors[1] = Double.valueOf(value);
} else if (variable.equals("MLE")) {
mle.add(Integer.valueOf(value));
} else if (variable.equals("pNonRefByAllele")) {
final Allele a = Allele.create(key);
log10pNonRefByAllele.put(a, Double.valueOf(value));
} else if (variable.equals("runtime.nano")) {
runtimeNano = Long.valueOf(value);
} else {
// nothing to do
}
}
}
}
}
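// A minimal usage sketch (not part of this commit) for reading the tabular log back in.
// The file name and the in-scope GenomeLocParser are assumptions for illustration;
// passing an empty startsToKeep list keeps every call in the file.
final BufferedReader exactLogReader = new BufferedReader(new FileReader("exact.calls.table"));
final List<ExactCallLogger.ExactCall> loggedCalls =
        ExactCallLogger.readExactLog(exactLogReader, Collections.<Integer>emptyList(), genomeLocParser);
for ( final ExactCallLogger.ExactCall loggedCall : loggedCalls )
    System.out.println(loggedCall);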

View File

@ -32,31 +32,97 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
/**
* Computes the conditional bi-allelic exact results
*
* Suppose vc contains 2 alt alleles: A* with C and T. This function first computes:
*
* (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
*
* it then computes the conditional probability on AF_c == 0:
*
* (2) P(D | AF_t > 0 && AF_c == 0)
*
* Thinking about this visually, we have the following likelihood matrix where each cell is
* the P(D | AF_c == i && AF_t == j):
*
* 0 AF_c > 0
* -----------------
* 0 | |
* |--|-------------
* a | |
* f | |
* _ | |
* t | |
* > | |
* 0 | |
*
* What we really want to know is how
*
* (3) P(D | AF_c == 0 & AF_t == 0)
*
* compares with
*
* (4) P(D | AF_c > 0 || AF_t > 0)
*
* This is effectively asking for the value in the upper left vs. the sum of all cells.
*
* This class implements the conditional likelihoods summation for any number of alt
* alleles, where each alt allele has its EXACT probability of segregating calculated by
* reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows:
*
* Suppose we have for a A/B/C site the following GLs:
*
* AA AB BB AC BC CC
*
* and we want to get the bi-allelic GLs for X/B, where X is everything not B
*
* XX = AA + AC + CC (since X = A or C)
* XB = AB + BC
* BB = BB
*
* After each allele has its probability calculated we compute the joint posterior
* as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i
* prior for the ith least likely allele.
*/
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
/**
* The min. confidence of an allele to be included in the joint posterior.
*/
private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10);
private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0};
private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
/**
* Sorts AFCalcResults by their posteriors of AF > 0, so that the most likely allele comes first
*/
private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
@Override
public int compare(AFCalcResult o1, AFCalcResult o2) {
return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0());
return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0());
}
}
private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();
final ReferenceDiploidExactAFCalc refModel;
/**
* The AFCalc model we are using to do the bi-allelic computation
*/
final AFCalc biAlleleExactModel;
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
}
@Override
protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
return refModel.makeMaxLikelihood(vc, resultTracker);
protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy);
}
/**
* Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call
*/
private static class MyAFCalcResult extends AFCalcResult {
/**
* List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call
*/
final List<AFCalcResult> supporting;
private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
@ -68,121 +134,89 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc);
final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
final List<AFCalcResult> independentResultTrackers = computeAlleleIndependentExact(vc, log10AlleleFrequencyPriors);
final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors);
return combineIndependentPNonRefs(vc, withMultiAllelicPriors);
}
protected final double computelog10LikelihoodOfRef(final VariantContext vc) {
// this value is just the likelihood of AF == 0 in the special constrained multi-allelic calculation
final List<double[]> allGLs = getGLs(vc.getGenotypes(), false);
double log10LikelihoodOfHomRef = 0.0;
// TODO -- can be easily optimized (currently looks at all GLs via getGLs)
for ( int i = 0; i < allGLs.size(); i++ ) {
final double[] GLs = allGLs.get(i);
log10LikelihoodOfHomRef += GLs[0];
//log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0];
}
return log10LikelihoodOfHomRef;
}
/**
* Computes the conditional bi-allelic exact results
* Compute the conditional exact AFCalcResult for each allele in vc independently, returning
* the result of each, in order of the alt alleles in VC
*
* Suppose vc contains 2 alt alleles: A* with C and T. This function first computes:
*
* (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
*
* it then computes the conditional probability on AF_c == 0:
*
* (2) P(D | AF_t > 0 && AF_c == 0)
*
* Thinking about this visually, we have the following likelihood matrix where each cell is
* the P(D | AF_c == i && AF_t == j):
*
* 0 AF_c > 0
* -----------------
* 0 | |
* |--|-------------
* a | |
* f | |
* _ | |
* t | |
* > | |
* 0 | |
*
* What we really want to know is how
*
* (3) P(D | AF_c == 0 & AF_t == 0)
*
* compares with
*
* (4) P(D | AF_c > 0 || AF_t > 0)
*
* This is effectively asking for the value in the upper left vs. the sum of all cells.
*
* The quantity (1) is the sum of all cells except those with AF_c == 0, while (2) is the
* band at the top where AF_t > 0 and AF_c == 0
*
* So (4) is actually (1) + (2).
*
* (3) is the direct inverse of (1) and (2), as we are simultaneously calculating
*
* (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything]
* (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use]
*
* This function implements the conditional likelihoods summation for any number of alt
* alleles (not just the tri-allelic case), where each subsequent variant context is
* further constrained such that each already considered allele x has AF_x == 0 in the
* compute.
*
* @param vc
* @param log10AlleleFrequencyPriors
* @return
* @param vc the VariantContext we want to analyze
* @param log10AlleleFrequencyPriors the priors
* @return a list of the AFCalcResults for each bi-allelic sub context of vc
*/
protected List<AFCalcResult> computeAlleleConditionalExact(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
@Requires({"vc != null", "log10AlleleFrequencyPriors != null"})
@Ensures("goodIndependentResult(vc, result)")
protected final List<AFCalcResult> computeAlleleIndependentExact(final VariantContext vc,
final double[] log10AlleleFrequencyPriors) {
final List<AFCalcResult> results = new LinkedList<AFCalcResult>();
for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) {
final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
results.add(resultTracker);
}
return results;
}
protected List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
/**
* Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results
*/
private static boolean goodIndependentResult(final VariantContext vc, final List<AFCalcResult> results) {
if ( results.size() != vc.getNAlleles() - 1) return false;
for ( int i = 0; i < results.size(); i++ ) {
if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 )
return false;
if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) )
return false;
}
return true;
}
/**
* Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order
*
* @param vc the variant context to split. Must have n.alt.alleles > 1
* @return a bi-allelic variant context for each alt allele in vc
*/
@Requires({"vc != null", "vc.getNAlleles() > 1"})
@Ensures("result.size() == vc.getNAlleles() - 1")
protected final List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
final int nAltAlleles = vc.getNAlleles() - 1;
final List<VariantContext> vcs = new LinkedList<VariantContext>();
final List<Allele> afZeroAlleles = new LinkedList<Allele>();
for ( int altI = 0; altI < nAltAlleles; altI++ ) {
final Allele altAllele = vc.getAlternateAllele(altI);
final List<Allele> biallelic = Arrays.asList(vc.getReference(), altAllele);
vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1));
//afZeroAlleles.add(altAllele);
vcs.add(biallelicCombinedGLs(vc, altI + 1));
}
return vcs;
}
protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List<Allele> biallelic, final List<Allele> afZeroAlleles, final int allele2) {
/**
* Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex
*
* @param rootVC the root (potentially multi-allelic) variant context
* @param altAlleleIndex index of the alt allele, from 0 == first alt allele
* @return a bi-allelic variant context based on rootVC
*/
@Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"})
@Ensures({"result.isBiallelic()"})
protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) {
if ( rootVC.isBiallelic() ) {
if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles);
return rootVC;
} else {
final Set<Integer> allelesToDiscard = new HashSet<Integer>(rootVC.getAlleleIndices(afZeroAlleles));
final int nAlts = rootVC.getNAlleles() - 1;
final List<Genotype> biallelicGenotypes = new ArrayList<Genotype>(rootVC.getNSamples());
for ( final Genotype g : rootVC.getGenotypes() )
biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts));
biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts));
final VariantContextBuilder vcb = new VariantContextBuilder(rootVC);
vcb.alleles(biallelic);
final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1);
vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele));
vcb.genotypes(biallelicGenotypes);
return vcb.make();
}
@ -203,30 +237,16 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
* XB = AB + BC
* BB = BB
*
* Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard. This is
* useful in the case where you want to drop alleles (not combine them), such as above:
*
* AA AB BB AC BC CC
*
* and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2)
*
* XX = AA (since X = A and C is dropped)
* XB = AB
* BB = BB
*
* This allows us to recover partial GLs that correspond to any allele in allelesToDiscard having strictly
* AF == 0.
*
* @param original the original multi-allelic genotype
* @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0
* @param nAlts the total number of alt alleles
* @return a new biallelic genotype with appropriate PLs
*/
@Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"})
@Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"})
@Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
protected Genotype combineGLs(final Genotype original, final int altIndex, final Set<Integer> allelesToDiscard, final int nAlts ) {
protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) {
if ( original.isNonInformative() )
return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make();
return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make();
if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts);
@ -236,10 +256,6 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
for ( int index = 0; index < normalizedPr.length; index++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);
// just continue if we shouldn't include the pair because it's in the discard set
if ( discardAllelePair(pair, allelesToDiscard) )
continue;
if ( pair.alleleIndex1 == altIndex ) {
if ( pair.alleleIndex2 == altIndex )
// hom-alt case
@ -263,20 +279,20 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
}
protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set<Integer> allelesToDiscard) {
return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2);
}
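// A worked sketch (not part of this commit) of the X/B reduction documented above,
// assuming log10-scaled GLs for an A/B/C site in the canonical PL order
// AA, AB, BB, AC, BC, CC, keeping allele B:
final double[] log10GLs = { -0.1, -1.0, -2.0, -1.5, -2.5, -3.0 }; // illustrative values
final double xxGL = MathUtils.log10sumLog10(new double[]{ log10GLs[0], log10GLs[3], log10GLs[5] }); // XX = AA + AC + CC
final double xbGL = MathUtils.log10sumLog10(new double[]{ log10GLs[1], log10GLs[4] });              // XB = AB + BC
final double bbGL = log10GLs[2];                                                                    // BB = BB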
protected List<AFCalcResult> applyMultiAllelicPriors(final List<AFCalcResult> conditionalPNonRefResults) {
protected final List<AFCalcResult> applyMultiAllelicPriors(final List<AFCalcResult> conditionalPNonRefResults) {
final ArrayList<AFCalcResult> sorted = new ArrayList<AFCalcResult>(conditionalPNonRefResults);
// sort the results, so the most likely allele is first
Collections.sort(sorted, compareAFCalcResultsByPNonRef);
double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0();
final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0();
for ( int i = 0; i < sorted.size(); i++ ) {
final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 )
throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0());
final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0));
final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 };
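// A quick numeric illustration (assumed theta, not part of this commit) of the scaled
// priors above: with a per-allele log10 P(AF > 0) of -3, the most likely allele keeps
// prior 1e-3, the second gets 1e-6, the third 1e-9.
final double assumedLog10SingleAllelePrior = -3.0;
for ( int rank = 0; rank < 3; rank++ ) {
    final double log10Gt0Prior = (rank + 1) * assumedLog10SingleAllelePrior;  // -3, -6, -9
    final double log10Eq0Prior = Math.log10(1 - Math.pow(10, log10Gt0Prior)); // complement in log10 space
    System.out.printf("rank %d: log10 P(AF>0)=%.1f log10 P(AF==0)=%.3e%n", rank, log10Gt0Prior, log10Eq0Prior);
}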
@ -291,10 +307,17 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
/**
* Take the independent estimates of pNonRef for each alt allele and combine them into a single result
*
* Given n independent calculations, one for each of the n alternate alleles, create a single
* combined AFCalcResult with:
*
* priors for AF == 0 equal to theta^n for the nth least likely allele
* posteriors and likelihoods that reflect the combined chance that any
* allele is segregating
* combined MLEs in the order of the alt alleles in vc
*
* @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
*/
protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
final double log10LikelihoodsOfACEq0,
final List<AFCalcResult> sortedResultsWithThetaNPriors) {
int nEvaluations = 0;
final int nAltAlleles = sortedResultsWithThetaNPriors.size();
@ -302,9 +325,11 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
final double[] log10PriorsOfAC = new double[2];
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);
// this value is a sum in real space so we need to store values to sum up later
final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles];
// the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs
double log10PosteriorOfACEq0Sum = 0.0;
double log10PosteriorOfACGt0Sum = 0.0;
boolean anyPoly = false;
for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
final int altI = vc.getAlleles().indexOf(altAllele) - 1;
@ -312,11 +337,15 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
// MLE of altI allele is simply the MLE of this allele in altAlleles
alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
// the AF > 0 case requires us to store the normalized likelihood for later summation
log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0();
if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) {
anyPoly = true;
log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0();
log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();
}
log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0();
// bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());
@ -325,14 +354,36 @@ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
}
// the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles
final double[] log10LikelihoodsOfAC = new double[]{
log10LikelihoodsOfACEq0,
MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)};
// If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation
if ( ! anyPoly ) {
log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0();
log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0();
}
// In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C,
// the probability that neither is poly (AF == 0 overall) is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y).
// We want log10((1 - x) * (1 - y)), which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0Sum
//
// note we need to handle the case where the posterior of AF == 0 is exactly 0.0, in which case we
// use the summed log10PosteriorOfACGt0Sum directly. This happens when the per-allele posteriors are
// AF > 0 : -16 and AF == 0 : ~0.0; the inverse calculation would then give log10(1 - 10^0.0) and
// clamp to MathUtils.LOG10_P_OF_ZERO, losing the informative -16
final double log10PosteriorOfACGt0;
if ( log10PosteriorOfACEq0Sum == 0.0 )
log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum;
else
log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO);
final double[] log10LikelihoodsOfAC = new double[] {
// L + prior = posterior => L = poster - prior
log10PosteriorOfACEq0Sum - log10PriorsOfAC[0],
log10PosteriorOfACGt0 - log10PriorsOfAC[1]
};
return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true), // necessary to ensure all values < 0
MathUtils.normalizeFromLog10(log10PriorsOfAC, true), // priors incorporate multiple alt alleles, must be normalized
// necessary to ensure all values < 0
MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true),
// priors incorporate multiple alt alleles, must be normalized
MathUtils.normalizeFromLog10(log10PriorsOfAC, true),
log10pNonRefByAllele, sortedResultsWithThetaNPriors);
}
}
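// A standalone numeric sketch (not part of this commit) of the joint P(AF > 0)
// combination above: if the per-allele log10 posteriors of AF == 0 sum to -16, the
// chance that every allele is monomorphic is 1e-16, so P(AF > 0) is essentially 1.
final double log10Eq0Sum = -16.0;
final double log10Gt0 = Math.log10(1 - Math.pow(10, log10Eq0Sum)); // effectively 0.0
// When log10Eq0Sum is exactly 0.0 this expression is log10(0) == -Infinity, which is
// why the code above falls back to the summed AF > 0 posteriors in that degenerate case.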

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -11,28 +12,31 @@ import java.util.Map;
/**
* Original bi-allelic ~O(N) implementation. Kept here for posterity and reference
*/
public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
return new StateTracker();
class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
}
@Override
protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
final Pair<Integer, Integer> result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
final int lastK = result.getFirst();
final int mleK = result.getSecond();
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1);
final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0};
final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};
final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors);
final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 0.0 : MathUtils.LOG10_P_OF_ZERO;
final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef);
return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(),
MathUtils.normalizeFromLog10(log10Likelihoods, true),
MathUtils.normalizeFromLog10(log10Priors, true),
log10pNonRefByAllele);
}
/**
@ -72,11 +76,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
}
}
public int linearExact(final VariantContext vc,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyLikelihoods,
double[] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
public Pair<Integer, Integer> linearExact(final VariantContext vc,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyLikelihoods,
double[] log10AlleleFrequencyPosteriors) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
final int numSamples = genotypeLikelihoods.size()-1;
final int numChr = 2*numSamples;
@ -85,7 +89,7 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
double maxLog10L = Double.NEGATIVE_INFINITY;
boolean done = false;
int lastK = -1;
int lastK = -1, mleK = -1;
for (int k=0; k <= numChr && ! done; k++ ) {
final double[] kMinus0 = logY.getkMinus0();
@ -131,7 +135,11 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
// can we abort early?
lastK = k;
maxLog10L = Math.max(maxLog10L, log10LofK);
if ( log10LofK > maxLog10L ) {
maxLog10L = log10LofK;
mleK = k;
}
if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
done = true;
@ -140,6 +148,6 @@ public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
logY.rotate();
}
return lastK;
return new Pair<Integer, Integer>(lastK, mleK);
}
}

View File

@ -1,13 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc {
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
return new StateTracker();
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) {
super(nSamples, maxAltAlleles, ploidy);
}
}

View File

@ -1,35 +1,85 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Keeps track of the best state seen by the exact model and the max states to visit
* allowing us to abort the search before we visit the entire matrix of AC x samples
* Keeps track of the state information during the exact model AF calculation.
*
* Tracks things like the MLE and MAP AC values, their corresponding likelihood and posterior
* values, the likelihood of the AF == 0 state, and the number of evaluations needed
* by the calculation to compute P(AF == 0)
*/
final class StateTracker {
public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
final private int[] maxACsToConsider;
private ExactACcounts ACsAtMax = null;
private double maxLog10L = Double.NEGATIVE_INFINITY;
public StateTracker() {
this(null);
}
public StateTracker(final int[] maxACsToConsider) {
this.maxACsToConsider = maxACsToConsider;
}
protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
/**
* Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state
*
* @param log10LofKs the likelihood of our current configuration state
* These variables are intended to contain the MLE and MAP (and their corresponding allele counts)
* of the site over all alternate alleles
*/
public void update(final double log10LofKs, final ExactACcounts ACs) {
if ( log10LofKs > getMaxLog10L()) {
this.setMaxLog10L(log10LofKs);
this.ACsAtMax = ACs;
}
protected double log10MLE;
protected double log10MAP;
/**
* A vector with maxAltAlleles values containing the AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*/
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
/**
* A vector of log10 likelihood values seen, for future summation. When the size of the
* vector is exceeded -- because we've pushed more posteriors than there's space to hold
* -- we simply sum up the existing values, make that the first value, and continue.
*/
private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE];
private static final int LIKELIHOODS_CACHE_SIZE = 5000;
private int log10LikelihoodsForAFGt0CacheIndex = 0;
/**
* The actual sum of the likelihoods. Null if the sum hasn't been computed yet
*/
protected Double log10LikelihoodsForAFGt0Sum = null;
/**
* Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
*/
private double log10LikelihoodOfAFzero = 0.0;
/**
* The number of evaluations we've gone through in the AFCalc
*/
private int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capable of storing results for calls with up to maxAltAlleles
*
* @param maxAltAlleles an integer >= 1
*/
public StateTracker(final int maxAltAlleles) {
if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/**
@ -39,58 +89,194 @@ final class StateTracker {
* @param log10LofK the log10 likelihood of the configuration we're considering analyzing
* @return true if the configuration cannot meaningfully contribute to our likelihood sum
*/
public boolean tooLowLikelihood(final double log10LofK) {
return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY;
private boolean tooLowLikelihood(final double log10LofK) {
return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY;
}
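// A quick numeric check of the early-stop rule (values are illustrative): with a
// running log10MLE of -10.0, a conformation at log10 likelihood -16.5 contributes
// less than one part in 10^6 of the maximum, so it can safely be skipped.
final boolean skippable = -16.5 < -10.0 - MAX_LOG10_ERROR_TO_STOP_EARLY; // true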
/**
* Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider?
* @return true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
*/
private boolean isLowerAC(final ExactACcounts otherACs) {
final int[] otherACcounts = otherACs.getCounts();
for ( int i = 0; i < otherACcounts.length; i++ ) {
if ( alleleCountsOfMLE[i] > otherACcounts[i] )
return false;
}
return true;
}
/**
* Should we stop exploring paths from ACs, given its log10LofK
*
* @param otherACs the set of otherACs that we want to know if we should consider analyzing
* @return true if otherACs is a state worth considering, or false otherwise
* @param log10LofK the log10 likelihood of these ACs
* @param ACs the ACs of this state
* @return true if there's no reason to continue with subpaths of AC, or false otherwise
*/
public boolean withinMaxACs(final ExactACcounts otherACs) {
if ( maxACsToConsider == null )
return true;
protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) {
return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs));
}
final int[] otherACcounts = otherACs.getCounts();
@Ensures("result != null")
protected int[] getAlleleCountsOfMAP() {
return alleleCountsOfMAP;
}
for ( int i = 0; i < maxACsToConsider.length; i++ ) {
// consider one more than the max AC to collect a bit more likelihood mass
if ( otherACcounts[i] > maxACsToConsider[i] + 1 )
return false;
}
return true;
@Ensures("result >= 0")
protected int getnEvaluations() {
return nEvaluations;
}
/**
* returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
* @return the likelihoods summed across all AC values for AC > 0
*/
public boolean isLowerAC(final ExactACcounts otherACs) {
if ( ACsAtMax == null )
return true;
private double getLog10LikelihoodOfAFNotZero() {
if ( log10LikelihoodsForAFGt0Sum == null ) {
if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO;
else
log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
}
return log10LikelihoodsForAFGt0Sum;
}
final int[] myACcounts = this.ACsAtMax.getCounts();
final int[] otherACcounts = otherACs.getCounts();
/**
* @return the log10 likelihood of AF == 0
*/
private double getLog10LikelihoodOfAFzero() {
return log10LikelihoodOfAFzero;
}
for ( int i = 0; i < myACcounts.length; i++ ) {
if ( myACcounts[i] > otherACcounts[i] )
return false;
/**
* Convert this state to a corresponding AFCalcResult.
*
* Assumes that the values in this state have been filled in with meaningful values during the calculation.
* For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful
* values, etc.
*
* @param log10PriorsByAC the priors by AC
*
* @return an AFCalcResult summarizing the final results of this calculation
*/
@Requires("allelesUsedInGenotyping != null")
protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
for ( int i = 0; i < subACOfMLE.length; i++ ) {
final Allele allele = allelesUsedInGenotyping.get(i+1);
final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
log10pNonRefByAllele.put(allele, log10PNonRef);
}
return true;
return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
}
public boolean abort( final double log10LofK, final ExactACcounts ACs ) {
return tooLowLikelihood(log10LofK) && isLowerAC(ACs);
// --------------------------------------------------------------------------------
//
// Protected mutational methods only for use within the calculation models themselves
//
// --------------------------------------------------------------------------------
/**
* Reset the data in this results object, so that it can be used in a subsequent AF calculation
*
* Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
*/
protected void reset() {
log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED;
log10LikelihoodsForAFGt0CacheIndex = 0;
log10LikelihoodsForAFGt0Sum = null;
allelesUsedInGenotyping = null;
nEvaluations = 0;
Arrays.fill(alleleCountsOfMLE, 0);
Arrays.fill(alleleCountsOfMAP, 0);
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
}
public double getMaxLog10L() {
return maxLog10L;
/**
* Tell this result we used one more evaluation cycle
*/
protected void incNEvaluations() {
nEvaluations++;
}
public void setMaxLog10L(double maxLog10L) {
this.maxLog10L = maxLog10L;
/**
* Update the maximum log10 likelihood seen, if log10LofK is higher, and the corresponding ACs of this state
*
* @param log10LofK the likelihood of our current configuration state, cannot be the 0 state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
@Ensures("log10MLE == Math.max(log10LofK, log10MLE)")
protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
addToLikelihoodsCache(log10LofK);
if ( log10LofK > log10MLE ) {
log10MLE = log10LofK;
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
}
}
/**
* Update the maximum log10 posterior seen, if log10PofK is higher, and the corresponding ACs of this state
*
* @param log10PofK the posterior of our current configuration state
* @param alleleCountsForK the allele counts for this state
*/
@Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
@Ensures("log10MAP == Math.max(log10PofK, log10MAP)")
protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) {
if ( log10PofK > log10MAP ) {
log10MAP = log10PofK;
System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
}
}
private void addToLikelihoodsCache(final double log10LofK) {
// add to the cache
log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) {
final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
log10LikelihoodsForAFGt0[0] = temporarySum;
log10LikelihoodsForAFGt0CacheIndex = 1;
}
}
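// A small standalone check (illustrative values, not part of this commit) that the
// condensation above preserves the total: folding cached values into one slot leaves
// the eventual log10 sum unchanged.
final double[] cacheSketch = { -1.0, -2.0, -3.0 };
final double condensed = MathUtils.log10sumLog10(cacheSketch, 0, 3); // same total whether condensed now or summed later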
protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {
log10MLE = log10LikelihoodOfAFzero;
Arrays.fill(alleleCountsOfMLE, 0);
}
}
@Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"})
protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
if ( log10PosteriorOfAFzero > log10MAP ) {
log10MAP = log10PosteriorOfAFzero;
Arrays.fill(alleleCountsOfMAP, 0);
}
}
/**
* Set the list of alleles used in genotyping
*
* @param allelesUsedInGenotyping the list of alleles, where the first allele is reference
*/
@Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"})
protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
if ( allelesUsedInGenotyping.get(0).isNonReference() )
throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele");
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
}
}

View File

@ -30,8 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.PairHMM;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM;
import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -48,7 +51,6 @@ public class PairHMMIndelErrorModel {
public static final int BASE_QUAL_THRESHOLD = 20;
private boolean DEBUG = false;
private boolean bandedLikelihoods = false;
private static final int MAX_CACHED_QUAL = 127;
@ -67,6 +69,8 @@ public class PairHMMIndelErrorModel {
private final byte[] GAP_OPEN_PROB_TABLE;
private final byte[] GAP_CONT_PROB_TABLE;
private final PairHMM pairHMM;
/////////////////////////////
// Private Member Variables
/////////////////////////////
@ -85,15 +89,26 @@ public class PairHMMIndelErrorModel {
}
}
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) {
public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
this.DEBUG = deb;
this.bandedLikelihoods = bandedLikelihoods;
switch (hmmType) {
case EXACT:
pairHMM = new ExactPairHMM();
break;
case ORIGINAL:
pairHMM = new OriginalPairHMM();
break;
case CACHING:
case LOGLESS_CACHING:
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT.");
}
// fill gap penalty table, affine naive model:
this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX];
for (int i = 0; i < START_HRUN_GAP_IDX; i++) {
GAP_OPEN_PROB_TABLE[i] = indelGOP;
GAP_CONT_PROB_TABLE[i] = indelGCP;
@ -190,7 +205,6 @@ public class PairHMMIndelErrorModel {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
final int[] readCounts) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final PairHMM pairHMM = new PairHMM(bandedLikelihoods);
int readIdx=0;
for (PileupElement p: pileup) {
@ -303,8 +317,6 @@ public class PairHMMIndelErrorModel {
final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases);
int j=0;
// initialize path metric and traceback memories for likelihood computation
double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null;
byte[] previousHaplotypeSeen = null;
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
@ -341,14 +353,9 @@ public class PairHMMIndelErrorModel {
final int X_METRIC_LENGTH = readBases.length+2;
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
if (matchMetricArray == null) {
if (previousHaplotypeSeen == null) {
//no need to reallocate arrays for each new haplotype, as length won't change
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
}
int startIndexInHaplotype = 0;
@ -356,11 +363,10 @@ public class PairHMMIndelErrorModel {
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
previousHaplotypeSeen = haplotypeBases.clone();
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals,
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
(read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities),
(read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities),
contextLogGapContinuationProbabilities,
startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray);
contextLogGapContinuationProbabilities, startIndexInHaplotype, false);
if (DEBUG) {

View File

@ -36,7 +36,7 @@ import java.util.*;
* <ul>
* <li>In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.</li>
* <li>In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.</li>
* <li>In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.</li>
* <li>In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.</li>
* <li>In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.</li>
* </ul>
*

View File

@ -33,7 +33,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountReads \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>

View File

@ -48,11 +48,14 @@ public class GLBasedSampleSelector extends SampleSelector {
// first subset to the samples
VariantContext subContext = vc.subContextFromSamples(samples);
if ( ! subContext.isPolymorphicInSamples() )
return false;
// now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec?
if ( flatPriors == null ) {
flatPriors = new double[1+2*samples.size()];
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2);
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2);
}
final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors);
// do we want to let this qual go up or down?

View File

@ -42,11 +42,11 @@ import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.io.File;
import java.io.FileNotFoundException;
@ -542,9 +542,11 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull());
addAnnotations(builder, sub);
sub = builder.make();
synchronized (UG_engine) {
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull());
addAnnotations(builder, sub);
sub = builder.make();
}
}
if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) {

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.Serializable;
import java.util.*;
public class Haplotype {
@ -184,6 +185,21 @@ public class Haplotype {
return new Haplotype(newHaplotypeBases);
}
public static class HaplotypeBaseComparator implements Comparator<Haplotype>, Serializable {
@Override
public int compare( final Haplotype hap1, final Haplotype hap2 ) {
final byte[] arr1 = hap1.getBases();
final byte[] arr2 = hap2.getBases();
// compares byte arrays using lexical ordering
final int len = Math.min(arr1.length, arr2.length);
for( int iii = 0; iii < len; iii++ ) {
final int cmp = arr1[iii] - arr2[iii];
if (cmp != 0) { return cmp; }
}
return arr1.length - arr2.length; // if one is a prefix of the other, the shorter sorts first (lexical order)
}
}
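// A hedged usage sketch (the source collection below is hypothetical): sorting with
// this comparator gives haplotype collections a deterministic, base-sequence order.
final List<Haplotype> sortedHaps = new ArrayList<Haplotype>(someHaplotypes);
Collections.sort(sortedHaps, new Haplotype.HaplotypeBaseComparator());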
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(final List<Allele> alleleList,
final int startPos,
final ReferenceContext ref,

View File

@ -596,7 +596,6 @@ public class MathUtils {
if (keepInLogSpace) {
for (int i = 0; i < array.length; i++) {
array[i] -= maxValue;
array[i] = Math.max(array[i], LOG10_P_OF_ZERO);
}
return array;
}
@ -613,8 +612,11 @@ public class MathUtils {
sum += normalized[i];
for (int i = 0; i < array.length; i++) {
double x = normalized[i] / sum;
if (takeLog10OfOutput)
x = Math.max(Math.log10(x), LOG10_P_OF_ZERO);
if (takeLog10OfOutput) {
x = Math.log10(x);
if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) )
x = array[i] - maxValue;
}
normalized[i] = x;
}
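// A hedged sketch of the underflow the guard above handles: in {0.0, -400.0},
// 10^-400 underflows to 0.0 in double precision, so Math.log10 of the normalized
// value would be -Infinity; the fallback keeps array[i] - maxValue == -400.0 instead.
final double[] normalizedSketch = MathUtils.normalizeFromLog10(new double[]{ 0.0, -400.0 }, true);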
@ -1192,6 +1194,39 @@ public class MathUtils {
return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
}
/**
* Check that the log10 prob vector is well formed
*
* @param vector the vector of log10 probabilities to check
* @param expectedSize the expected length of vector
* @param shouldSumToOne if true, additionally require that the probabilities sum to 1 in real space
*
* @return true if vector is well-formed, false otherwise
*/
public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
if ( vector.length != expectedSize ) return false;
for ( final double pr : vector ) {
if ( ! goodLog10Probability(pr) )
return false;
}
if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 )
return false;
return true; // everything is good
}
/**
* Checks that the result is a well-formed log10 probability
*
* @param result a supposedly well-formed log10 probability value
* @return true if result is a well-formed log10 probability, false otherwise
*/
public static boolean goodLog10Probability(final double result) {
return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result);
}
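A quick usage sketch of these two checks (values illustrative, and assuming sumLog10 returns the real-space sum of the log10 entries):

final double[] log10Probs = { -0.30103, -0.30103 };      // two entries of log10(0.5)
MathUtils.goodLog10Probability(log10Probs[0]);           // true: finite, not NaN, <= 0
MathUtils.goodLog10ProbVector(log10Probs, 2, true);      // true: right size, and 0.5 + 0.5 == 1.0 within 1e-4
MathUtils.goodLog10ProbVector(log10Probs, 3, false);     // false: length 2 != expectedSize 3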
/**
* A utility class that computes on the fly average and standard deviation for a stream of numbers.
* The number of observations does not have to be known in advance, and can be also very big (so that

View File

@ -1,259 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import java.util.*;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* User: rpoplin
* Date: 3/1/12
*/
public class PairHMM {
private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
private static final byte DEFAULT_GOP = (byte) 45;
private static final byte DEFAULT_GCP = (byte) 10;
private static final double BANDING_TOLERANCE = 22.0;
private static final int BANDING_CLUSTER_WINDOW = 12;
private final boolean noBanded;
public PairHMM() {
noBanded = false;
}
public PairHMM( final boolean noBanded ) {
this.noBanded = noBanded;
}
public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray,
final int X_METRIC_LENGTH) {
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
}
// the initial condition
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray);
}
@Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability
public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals,
final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
if( false ) { // banded approach below is hard-disabled; note the noBanded flag is never actually consulted
final ArrayList<Integer> workQueue = new ArrayList<Integer>(); // holds a queue of starting work location (indices along the diagonal). Will be sorted each step
final ArrayList<Integer> workToBeAdded = new ArrayList<Integer>();
final ArrayList<Double> calculatedValues = new ArrayList<Double>();
final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1;
workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype
for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over
//Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order
int el = 1;
for( int work : workQueue ) {
// choose the appropriate diagonal baseline location
int iii = 0;
int jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work;
jjj -= work;
while( iii >= X_METRIC_LENGTH || jjj <= 0 ) {
iii--;
jjj++;
work--;
}
if( !detectClusteredStartLocations(workToBeAdded, work ) ) {
workToBeAdded.add(work); // keep this thread going once it has started
}
if( work >= el - 3 ) {
// step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value
double maxElement = Double.NEGATIVE_INFINITY;
for( el = work; el < numDiags + 1; el++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
calculatedValues.add(bestMetric);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix
break;
}
if( --jjj <= 0 ) { // don't walk off the edge of the matrix
break;
}
}
// find a local maximum to start a new band in the work queue
double localMaxElement = Double.NEGATIVE_INFINITY;
int localMaxElementIndex = 0;
for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) {
final double bestMetric = calculatedValues.get(kkk);
if( bestMetric > localMaxElement ) {
localMaxElement = bestMetric;
localMaxElementIndex = kkk;
} else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum
if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) {
workToBeAdded.add( work + localMaxElementIndex );
}
break;
}
}
calculatedValues.clear();
// reset iii and jjj to the appropriate diagonal baseline location
iii = 0;
jjj = diag;
if( diag > Y_METRIC_LENGTH ) {
iii = diag - Y_METRIC_LENGTH;
jjj = Y_METRIC_LENGTH;
}
// move to the starting work location along the diagonal
iii += work-1;
jjj -= work-1;
// step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value
for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) {
updateCell(iii, jjj, haplotypeBases, readBases, readQuals,
insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray);
final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]);
if( bestMetric > maxElement ) {
maxElement = bestMetric;
} else if( maxElement - bestMetric > BANDING_TOLERANCE ) {
break;
}
}
}
}
workQueue.clear();
workQueue.addAll(workToBeAdded);
workToBeAdded.clear();
}
} else {
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
}
// private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other
private boolean detectClusteredStartLocations( final ArrayList<Integer> list, int loc ) {
for(int x : list) {
if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) {
return true;
}
}
return false;
}
}

View File

@ -25,8 +25,11 @@
package org.broadinstitute.sting.utils.codecs.hapmap;
import org.broad.tribble.AsciiFeatureCodec;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.annotation.Strand;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import java.io.IOException;
import java.util.Arrays;
@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec<RawHapMapFeature> {
}
return headerLine;
}
@Override
public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException {
final AsciiLineReader br = new AsciiLineReader(stream);
return new FeatureCodecHeader(readHeader(br), br.getPosition());
}
}

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.IOException;
import java.util.*;
@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec {
// empty set for passes filters
List<String> fFields = new LinkedList<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
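For example, assuming VCFConstants.FILTER_CODE_SEPARATOR is the VCF-standard ";":

// "q10"     -> fFields == ["q10"]
// "q10;s50" -> fFields == ["q10", "s50"]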

View File

@ -34,7 +34,12 @@ import java.io.PrintStream;
* to the provided output stream. For testing/debugging purposes.
*
* Log entries are of the following form (fields are tab-separated):
* LABEL VALUE KEY1 KEY2 ... KEY_N
* LABEL OPERATION VALUE KEY1 KEY2 ... KEY_N
*
* A header line is written before the log entries giving the dimensions of this NestedIntegerArray.
* It has the form:
*
* # LABEL SIZE_OF_FIRST_DIMENSION SIZE_OF_SECOND_DIMENSION ... SIZE_OF_NTH_DIMENSION
*
* @author David Roazen
*/
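For concreteness, a hypothetical two-dimensional array with dimensions 2 and 3, logged under the label RecalTable, would produce output like the following (fields tab-separated in the real output; the empty VALUE field on GET lines matches the extra tab appended in get() below):

# RecalTable  2  3
RecalTable  GET    1  2
RecalTable  PUT  42  0  1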
@ -43,6 +48,9 @@ public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {
private PrintStream log;
private String logEntryLabel;
public static final String HEADER_LINE_PREFIX = "# ";
public enum NestedIntegerArrayOperation { GET, PUT };
/**
*
* @param log output stream to which to log update operations
@ -57,6 +65,37 @@ public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {
}
this.log = log;
this.logEntryLabel = logEntryLabel != null ? logEntryLabel : "";
// Write the header line recording the dimensions of this NestedIntegerArray:
StringBuilder logHeaderLine = new StringBuilder();
logHeaderLine.append(HEADER_LINE_PREFIX);
logHeaderLine.append(this.logEntryLabel);
for ( int dimension : dimensions ) {
logHeaderLine.append("\t");
logHeaderLine.append(dimension);
}
this.log.println(logHeaderLine.toString());
}
@Override
public T get( final int... keys ) {
StringBuilder logEntry = new StringBuilder();
logEntry.append(logEntryLabel);
logEntry.append("\t");
logEntry.append(NestedIntegerArrayOperation.GET);
logEntry.append("\t"); // empty field for the datum value
for ( int key : keys ) {
logEntry.append("\t");
logEntry.append(key);
}
log.println(logEntry.toString());
return super.get(keys);
}
@Override
@ -67,6 +106,8 @@ public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {
logEntry.append(logEntryLabel);
logEntry.append("\t");
logEntry.append(NestedIntegerArrayOperation.PUT);
logEntry.append("\t");
logEntry.append(value);
for ( int key : keys ) {
logEntry.append("\t");

View File

@ -352,6 +352,9 @@ public class UserException extends ReviewedStingException {
}
public static class CannotExecuteQScript extends UserException {
public CannotExecuteQScript(String message) {
super(String.format("Unable to execute QScript: %s", message));
}
public CannotExecuteQScript(String message, Exception e) {
super(String.format("Unable to execute QScript: %s", message), e);
}

View File

@ -0,0 +1,107 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 10/16/12
*/
public class ExactPairHMM extends PairHMM {
@Override
public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY);
Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY);
}
// the initial condition
matchMetricArray[1][1] = 0.0; // Math.log10(1.0);
}
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]});
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0});
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1});
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2});
}
}
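In standard pair-HMM notation, updateCell above implements these log10-space recurrences (a direct transcription of the code, with M/X/Y the match, insertion and deletion matrices, p the base-match emission probability, and the d/e terms the gap-open and gap-continuation factors derived from the Phred-scaled GOP/GCP qualities):

M[i][j] = log10(p) + log10sum( M[i-1][j-1] + d0, X[i-1][j-1] + e0, Y[i-1][j-1] + e0 )
X[i][j] = log10sum( M[i-1][j] + d1, X[i-1][j] + e1 )
Y[i][j] = log10sum( M[i][j-1] + d2, Y[i][j-1] + e2 )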

View File

@ -0,0 +1,105 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
/**
* Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book.
* User: rpoplin
* Date: 3/1/12
*/
public class OriginalPairHMM extends ExactPairHMM {
@Override
public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues ) {
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
final int X_METRIC_LENGTH = readBases.length + 2;
final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
// ensure that all the qual scores have valid values
for( int iii = 0; iii < readQuals.length; iii++ ) {
readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) );
}
// simple rectangular version of update loop, slow
for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) {
for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) {
if( (iii == 1 && jjj == 1) ) { continue; }
updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP,
matchMetricArray, XMetricArray, YMetricArray);
}
}
// final probability is the log10 sum of the last element in all three state arrays
final int endI = X_METRIC_LENGTH - 1;
final int endJ = Y_METRIC_LENGTH - 1;
return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
}
private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases,
final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
// the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions
final int im1 = indI - 1;
final int jm1 = indJ - 1;
// update the match array
double pBaseReadLog10 = 0.0; // Math.log10(1.0);
if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state
final byte x = readBases[im1-1];
final byte y = haplotypeBases[jm1-1];
final byte qual = readQuals[im1-1];
pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
}
final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) );
final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP);
final double e0 = ( im1 == 0 ? QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) );
matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0);
// update the X (insertion) array
final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) );
final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1);
// update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype
final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) );
final double e2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) );
final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0
YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2);
}
}

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.utils.pairhmm;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
* Date: 10/16/12
*/
public abstract class PairHMM {
protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE;
protected static final byte DEFAULT_GOP = (byte) 45;
protected static final byte DEFAULT_GCP = (byte) 10;
public enum HMM_IMPLEMENTATION {
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
EXACT,
/* PairHMM as implemented for the UnifiedGenotyper. Uses log10 sum functions accurate to only 1E-4 */
ORIGINAL,
/* Optimized version of the PairHMM which caches per-read computations */
CACHING,
/* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */
LOGLESS_CACHING
}
protected double[][] matchMetricArray = null;
protected double[][] XMetricArray = null;
protected double[][] YMetricArray = null;
public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH );
@Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length",
"readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"})
@Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood
public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP,
final int hapStartIndex,
final boolean recacheReadValues );
}
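The enum only names the implementations; a hypothetical dispatch sketch (this factory is not part of the diff, and only the EXACT and ORIGINAL subclasses appear in it) might look like:

public static PairHMM create(final HMM_IMPLEMENTATION impl) {
    switch (impl) {
        case EXACT:    return new ExactPairHMM();
        case ORIGINAL: return new OriginalPairHMM();
        default:       throw new IllegalArgumentException("not implemented in this sketch: " + impl);
    }
}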

View File

@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
@ -59,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord {
private String mReadString = null;
private GATKSAMReadGroupRecord mReadGroup = null;
private byte[] reducedReadCounts = null;
private int softStart = -1;
private int softEnd = -1;
private final static int UNINITIALIZED = -1;
private int softStart = UNINITIALIZED;
private int softEnd = UNINITIALIZED;
// because some values can be null, we don't want to duplicate effort
private boolean retrievedReadGroup = false;
@ -385,15 +387,16 @@ public class GATKSAMRecord extends BAMRecord {
* @return the unclipped start of the read taking soft clips (but not hard clips) into account
*/
public int getSoftStart() {
if (softStart < 0) {
int start = this.getUnclippedStart();
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
start += cigarElement.getLength();
else
if ( softStart == UNINITIALIZED ) {
softStart = getAlignmentStart();
for (final CigarElement cig : getCigar().getCigarElements()) {
final CigarOperator op = cig.getOperator();
if (op == CigarOperator.SOFT_CLIP)
softStart -= cig.getLength();
else if (op != CigarOperator.HARD_CLIP)
break;
}
softStart = start;
}
return softStart;
}
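A worked example for the new implementation (CIGAR and positions invented): for a read with CIGAR 2H3S95M and getAlignmentStart() == 100, the loop ignores the leading hard clip, subtracts the 3 soft-clipped bases, and stops at the first aligned base, so getSoftStart() returns 97 (getUnclippedStart() would return 95, since it also backs out the 2 hard-clipped bases).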
@ -406,24 +409,26 @@ public class GATKSAMRecord extends BAMRecord {
* @return the unclipped end of the read taking soft clips (but not hard clips) into account
*/
public int getSoftEnd() {
if (softEnd < 0) {
int stop = this.getUnclippedStart();
if ( softEnd == UNINITIALIZED ) {
boolean foundAlignedBase = false;
softEnd = getAlignmentEnd();
final List<CigarElement> cigs = getCigar().getCigarElements();
for (int i = cigs.size() - 1; i >= 0; --i) {
final CigarElement cig = cigs.get(i);
final CigarOperator op = cig.getOperator();
if (ReadUtils.readIsEntirelyInsertion(this))
return stop;
int shift = 0;
CigarOperator lastOperator = null;
for (CigarElement cigarElement : this.getCigar().getCigarElements()) {
stop += shift;
lastOperator = cigarElement.getOperator();
if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP)
shift = cigarElement.getLength();
else
shift = 0;
if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read
softEnd += cig.getLength();
else if (op != CigarOperator.HARD_CLIP) {
foundAlignedBase = true;
break;
}
}
if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end
softEnd = getAlignmentEnd();
}
softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}
return softEnd;
}
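The mirror-image example (same invented read): for CIGAR 95M3S2H with getAlignmentEnd() == 194, walking the CIGAR from the right skips the trailing hard clip and adds the 3 soft-clipped bases, so getSoftEnd() returns 197; for an entirely clipped read such as 64H14S no aligned base is ever found, and the method falls back to getAlignmentEnd() as noted in the comment above.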

View File

@ -298,12 +298,16 @@ public abstract class Genotype implements Comparable<Genotype> {
* @return true if the sample has no PLs, or if all of its PLs are 0
*/
public boolean isNonInformative() {
if ( getPL() == null )
return true;
else {
for ( final int PL : getPL() ) {
if ( PL != 0 )
return false;
}
return true;
}
}
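For example, a genotype with getPL() == null is non-informative, as is one with PLs {0, 0, 0} (every genotype equally likely); a genotype with PLs {0, 30, 300} is informative and returns false.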
/**

View File

@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer;
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.qc.CountReads;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -62,9 +61,9 @@ public class TraverseReadsUnitTest extends BaseTest {
private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam");
private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta");
private List<SAMReaderID> bamList;
private Walker countReadWalker;
private ReadWalker countReadWalker;
private File output;
private TraverseReads traversalEngine = null;
private TraverseReadsNano traversalEngine = null;
private IndexedFastaSequenceFile ref = null;
private GenomeLocParser genomeLocParser = null;
@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest {
bamList.add(bam);
countReadWalker = new CountReads();
traversalEngine = new TraverseReads();
traversalEngine = new TraverseReadsNano(1);
traversalEngine.initialize(engine);
}
@ -125,7 +124,7 @@ public class TraverseReadsUnitTest extends BaseTest {
fail("Shard == null");
}
ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null);
ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null);
accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator);
dataProvider.close();
}

View File

@ -7,6 +7,7 @@ import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
// ********************************************************************************** //
@ -18,6 +19,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
// --------------------------------------------------------------------------------------------------------------
//
@ -28,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("b3abf320f7d02d0e3b2883833419130e"));
Arrays.asList("847605f4efafef89529fe0e496315edd"));
executeTest("test MultiSample Pilot1", spec);
}
@ -52,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc"));
Arrays.asList("afb8768f31ab57eb43f75c1115eadc99"));
executeTest("test SingleSample Pilot2", spec);
}
@ -60,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("26af30187316f742878c85f0ed091837"));
Arrays.asList("543f68e42034bf44cfb24da8c9204320"));
executeTest("test Multiple SNP alleles", spec);
}
@ -76,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("aa9cf96ab8f5aa844387e3aef1f27249"));
Arrays.asList("5ce03dd9ca2d9324c1d4a9d64389beb5"));
executeTest("test reverse trim", spec);
}
@ -84,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a"));
Arrays.asList("3c006b06b17bbe8e787d64eff6a63a19"));
executeTest("test mismatched PLs", spec);
}
@ -94,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2";
private final static String COMPRESSED_OUTPUT_MD5 = "fd236bd635d514e4214d364f45ec4d10";
@Test
public void testCompressedOutput() {
@ -115,7 +117,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "360d1274c1072a1ae9868e4e106c2650";
String md5 = "d408b4661b820ed86272415b8ea08780";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -147,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c"));
Arrays.asList("839ecd30d354a36b5dfa2b5e99859765"));
executeTest("test min_base_quality_score 26", spec);
}
@ -175,6 +177,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
executeTest("test using comp track", spec);
}
@Test
public void testNoCmdLineHeaderStdout() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0,
Collections.<String>emptyList());
executeTest("testNoCmdLineHeaderStdout", spec);
}
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311");
@ -187,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf");
testOutputParameters("--output_mode EMIT_ALL_SITES", "41c046d38ea328421df924e37e017645");
}
private void testOutputParameters(final String args, final String md5) {
@ -220,12 +230,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" );
testHeterozosity( 0.01, "986923de51c71635d47e3d06fe3794a1" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" );
testHeterozosity( 1.0 / 1850, "fb12b1553f813004a394a391a8540873" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -268,7 +278,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("950fb032cc9902ae48bd21f272d2fd52"));
Arrays.asList("98058fc913b61c22d44875da1f5ea89c"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -287,7 +297,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("b3df138254ed141b61a758df87757e0d"));
Arrays.asList("650c53774afacfc07a595675e8cdde17"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -302,7 +312,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("63fd9488daadd4baaef0a98f02916996"));
Arrays.asList("6a0c2a3a7bcc56ad01428c71408055aa"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -315,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("52b5a432092995c92fe71e1942689ba8"));
Arrays.asList("5f2721c3323de5390d2d47446139f32b"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -343,13 +353,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("7fc488fe16dea9f023bfcfdaa908a548"));
Arrays.asList("a4761d7f25e7a62f34494801c98a0da7"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("beee9457d7cea42006ac45400db5e873"));
Arrays.asList("c526c234947482d1cd2ffc5102083a08"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -371,7 +381,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 20:10,000,000-10,100,000",
1,
Arrays.asList("945a2f994eaced8efdf8de24b58f2680"));
Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56"));
executeTest(String.format("test UG with base indel quality scores"), spec);
}
@ -449,8 +459,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("bbf16e1873e525ee5975021cfb8988cf"));
Arrays.asList("da9c05f87bd6415e97f90c49cf68ed19"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "1d4a826b144723ff0766c36aa0239287");
}
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "68ef51d5c98480e0c0192e0eecb95bca");
}
private void testReducedCalling(final String model, final String md5) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1,
Arrays.asList(md5));
executeTest("test calling on a ReducedRead BAM with " + model, spec);
}
}

View File

@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("549321a2543608f214ab4893ab478be6")
Arrays.asList("46ff472fc7ef6734ad01170028d5924a")
);
executeTest("testRegenotype--" + testFile, spec);
@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("549321a2543608f214ab4893ab478be6")
Arrays.asList("46ff472fc7ef6734ad01170028d5924a")
);
executeTest("testRemoveMLEAndRegenotype--" + testFile, spec);

View File

@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct });
tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -130,7 +130,10 @@ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest {
return StateTest.getTests(StateTest.class);
}
@Test(enabled = true, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND)
// NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads
// aren't a core GATK feature any longer. It should be re-enabled if we come to care about this capability again
// in the future, or if we can run these tests in parallel
@Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND)
public void testStateTest(final StateTest test) throws InterruptedException {
// allows us to test blocking
final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates());

View File

@ -13,6 +13,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.util.QScriptUtils
import org.broadinstitute.sting.queue.function.ListWriterFunction
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.commandline
class DataProcessingPipeline extends QScript {
qscript =>
@ -41,34 +42,34 @@ class DataProcessingPipeline extends QScript {
@Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
@Argument(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false)
var projectName: String = "project"
@Input(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
@Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false)
var outputDir: String = ""
@Input(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
@Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false)
var intervalString: String = ""
@Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false)
var intervals: File = _
@Input(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
@Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false)
var cleaningModel: String = "USE_READS"
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false)
var useBWAse: Boolean = false
@Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
@Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
var useBWApe: Boolean = false
@Input(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
@Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
var useBWAsw: Boolean = false
@Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
@Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
var bwaThreads: Int = 1
@Input(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
@Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false)
var validation: Boolean = false
@ -76,15 +77,15 @@ class DataProcessingPipeline extends QScript {
* Hidden Parameters
****************************************************************************/
@Hidden
@Input(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
@Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
var nContigs: Int = -1
@Hidden
@Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
@Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
var defaultPlatform: String = ""
@Hidden
@Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
@Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false)
var testMode: Boolean = false

View File

@ -27,28 +27,28 @@ class PacbioProcessingPipeline extends QScript {
@Input(doc="dbsnp VCF file to use ", shortName="D", required=true)
var dbSNP: File = _
@Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
@Argument(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false)
var threads: Int = 0
@Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
@Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false)
var sample: String = "NA"
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
@Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
var BLASR_BAM: Boolean = false
@Hidden
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
@Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
var dbq: Int = 20
@Hidden
@Input(shortName="bwastring", required=false)
@Argument(shortName="bwastring", required=false)
var bwastring: String = ""
@Hidden
@Input(shortName = "test", fullName = "test_mode", required = false)
@Argument(shortName = "test", fullName = "test_mode", required = false)
var testMode: Boolean = false
val queueLogDir: String = ".qlog/"

View File

@ -24,7 +24,6 @@
package org.broadinstitute.sting.queue
import function.QFunction
import java.io.File
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.util._
@ -96,18 +95,18 @@ class QCommandLine extends CommandLineProgram with Logging {
new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL))
}
private lazy val qStatusMessengerPluginManager = {
new PluginManager[QStatusMessenger](classOf[QStatusMessenger])
private lazy val qCommandPlugin = {
new PluginManager[QCommandPlugin](classOf[QCommandPlugin])
}
QFunction.parsingEngine = new ParsingEngine(this)
/**
* Takes the QScripts passed in, runs their script() methods, retrieves their generated
* functions, and then builds and runs a QGraph based on the dependencies.
*/
def execute = {
val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes()
ClassFieldCache.parsingEngine = this.parser
val allCommandPlugins = qCommandPlugin.createAllTypes()
if (settings.qSettings.runName == null)
settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName)
@ -115,18 +114,33 @@ class QCommandLine extends CommandLineProgram with Logging {
settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp")
qGraph.initializeWithSettings(settings)
for (statusMessenger <- allStatusMessengers) {
loadArgumentsIntoObject(statusMessenger)
for (commandPlugin <- allCommandPlugins) {
loadArgumentsIntoObject(commandPlugin)
}
for (statusMessenger <- allStatusMessengers) {
statusMessenger.started()
for (commandPlugin <- allCommandPlugins) {
if (commandPlugin.statusMessenger != null)
commandPlugin.statusMessenger.started()
}
qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq
// TODO: Default command plugin argument?
val remoteFileConverter = (
for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null))
yield commandPlugin.remoteFileConverter
).headOption.getOrElse(null)
if (remoteFileConverter != null)
loadArgumentsIntoObject(remoteFileConverter)
val allQScripts = qScriptPluginManager.createAllTypes()
for (script <- allQScripts) {
logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript])))
loadArgumentsIntoObject(script)
// TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now.
//if (settings.run)
script.pullInputs()
script.qSettings = settings.qSettings
try {
script.script()
@@ -134,14 +148,15 @@ class QCommandLine extends CommandLineProgram with Logging {
case e: Exception =>
throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e)
}
if (remoteFileConverter != null) {
if (remoteFileConverter.convertToRemoteEnabled)
script.mkRemoteOutputs(remoteFileConverter)
}
script.functions.foreach(qGraph.add(_))
logger.info("Added " + script.functions.size + " functions")
}
if (settings.run) {
allQScripts.foreach(_.pullInputs())
}
// Execute the job graph
qGraph.run()
@@ -163,14 +178,19 @@ class QCommandLine extends CommandLineProgram with Logging {
if (!success) {
logger.info("Done with errors")
qGraph.logFailed()
for (statusMessenger <- allStatusMessengers)
statusMessenger.exit("Done with errors")
for (commandPlugin <- allCommandPlugins)
if (commandPlugin.statusMessenger != null)
commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts))
1
} else {
if (settings.run) {
allQScripts.foreach(_.pushOutputs())
for (statusMessenger <- allStatusMessengers)
statusMessenger.done()
for (commandPlugin <- allCommandPlugins)
if (commandPlugin.statusMessenger != null) {
val allInputs = allQScripts.map(_.remoteInputs)
val allOutputs = allQScripts.map(_.remoteOutputs)
commandPlugin.statusMessenger.done(allInputs, allOutputs)
}
}
0
}
@@ -190,7 +210,7 @@ class QCommandLine extends CommandLineProgram with Logging {
override def getArgumentSources = {
var plugins = Seq.empty[Class[_]]
plugins ++= qScriptPluginManager.getPlugins
plugins ++= qStatusMessengerPluginManager.getPlugins
plugins ++= qCommandPlugin.getPlugins
plugins.toArray
}
@@ -201,11 +221,10 @@ class QCommandLine extends CommandLineProgram with Logging {
override def getArgumentSourceName(source: Class[_]) = {
if (classOf[QScript].isAssignableFrom(source))
qScriptPluginManager.getName(source.asSubclass(classOf[QScript]))
else if (classOf[QStatusMessenger].isAssignableFrom(source))
qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger]))
else if (classOf[QCommandPlugin].isAssignableFrom(source))
qCommandPlugin.getName(source.asSubclass(classOf[QCommandPlugin]))
else
null
}
/**


@@ -0,0 +1,9 @@
package org.broadinstitute.sting.queue
import engine.QStatusMessenger
import util.RemoteFileConverter
trait QCommandPlugin {
def statusMessenger: QStatusMessenger = null
def remoteFileConverter: RemoteFileConverter = null
}
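A concrete plugin only overrides the members it actually supplies. A minimal hypothetical sketch (ConsoleStatusMessenger is not part of this commit; one possible version is sketched after the QStatusMessenger trait further down):
package org.broadinstitute.sting.queue
import engine.{ConsoleStatusMessenger, QStatusMessenger}
// Hypothetical sketch: a plugin contributing only a status messenger;
// remoteFileConverter keeps its default of null.
class ConsoleCommandPlugin extends QCommandPlugin {
  override val statusMessenger: QStatusMessenger = new ConsoleStatusMessenger
}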


@@ -28,8 +28,7 @@ import engine.JobRunInfo
import org.broadinstitute.sting.queue.function.QFunction
import annotation.target.field
import util._
import org.broadinstitute.sting.utils.classloader.JVMUtils
import java.lang.reflect.Field
import org.broadinstitute.sting.commandline.ArgumentSource
/**
* Defines a Queue pipeline as a collection of CommandLineFunctions.
@@ -109,32 +108,70 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon
functions.foreach( f => add(f) )
}
/**
* Convert all @Output files to remote output files.
* @param remoteFileConverter Converter for files to remote files.
*/
def mkRemoteOutputs(remoteFileConverter: RemoteFileConverter) {
for (field <- outputFields) {
val fieldFile = ClassFieldCache.getFieldFile(this, field)
if (fieldFile != null && !fieldFile.isInstanceOf[RemoteFile]) {
val fieldName = ClassFieldCache.fullName(field)
val remoteFile = remoteFileConverter.convertToRemote(fieldFile, fieldName)
ClassFieldCache.setFieldValue(this, field, remoteFile)
}
}
}
/**
* Pull all remote files to the local disk.
*/
def pullInputs() {
val inputs = getInputs
inputs.filter(_.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]).foreach(_.pullToLocal())
val inputs = ClassFieldCache.getFieldFiles(this, inputFields)
for (remoteFile <- filterRemoteFiles(inputs)) {
logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription))
remoteFile.pullToLocal()
}
}
/**
* Push all remote files from the local disk.
*/
def pushOutputs() {
val outputs = getOutputs
outputs.filter(_.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]).foreach(_.pushToRemote())
val outputs = ClassFieldCache.getFieldFiles(this, outputFields)
for (remoteFile <- filterRemoteFiles(outputs)) {
logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription))
remoteFile.pushToRemote()
}
}
private def getInputs: Seq[File] = {
getFieldValues(classOf[Input])
/**
* List out the remote inputs
* @return the RemoteFile inputs by argument source
*/
def remoteInputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(inputFields)
/**
* List out the remote outputs
* @return the RemoteFile outputs by argument source
*/
def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = remoteFieldMap(outputFields)
private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = {
fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap
}
private def getOutputs: Seq[File] = {
getFieldValues(classOf[Output])
}
private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] =
fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile])
private def getFieldValues(annotation: Class[_ <: java.lang.annotation.Annotation]): Seq[File] = {
val filtered: Seq[Field] = fields.filter(field => ReflectionUtils.hasAnnotation(field, annotation))
val files = filtered.filter(field => classOf[File].isAssignableFrom(field.getType)).map(field => ReflectionUtils.getValue(this, field).asInstanceOf[File])
val seqFiles = filtered.filter(field => classOf[Seq[File]].isAssignableFrom(field.getType)).map(field => ReflectionUtils.getValue(this, field).asInstanceOf[Seq[File]])
seqFiles.foldLeft(files)(_ ++ _).filter(_ != null)
}
private lazy val fields = collection.JavaConversions.asScalaBuffer(JVMUtils.getAllFields(this.getClass)).toSeq
/** The complete list of fields. */
def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass)
/** The @Input fields. */
def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.getClass)
/** The @Output fields. */
def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.getClass)
/** The @Argument fields. */
def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.getClass)
}
object QScript {


@@ -25,7 +25,7 @@
package org.broadinstitute.sting.queue
import java.io.File
import org.broadinstitute.sting.commandline.Argument
import org.broadinstitute.sting.commandline.{ClassType, Argument}
/**
* Default settings settable on the command line and passed to CommandLineFunctions.
@@ -41,6 +41,7 @@ class QSettings {
var jobQueue: String = _
@Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false)
@ClassType(classOf[Int])
var jobPriority: Option[Int] = None
@Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false)
@@ -53,15 +54,19 @@ class QSettings {
var jobEnvironmentNames: Seq[String] = Nil
@Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. If not set defaults to 2GB.", required=false)
@ClassType(classOf[Double])
var memoryLimit: Option[Double] = Some(2)
@Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false)
@ClassType(classOf[Double])
var memoryLimitThreshold: Option[Double] = None
@Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false)
@ClassType(classOf[Double])
var residentLimit: Option[Double] = None
@Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false)
@ClassType(classOf[Double])
var residentRequest: Option[Double] = None
@Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false)


@@ -47,6 +47,7 @@ import java.io.{OutputStreamWriter, File}
*/
class QGraph extends Logging {
var settings: QGraphSettings = _
var messengers: Seq[QStatusMessenger] = Nil
private def dryRun = !settings.run
private var numMissingValues = 0
@@ -95,7 +96,7 @@
* The settings aren't necessarily available until after this QGraph object has been constructed, so
* this function must be called once the QGraphSettings have been filled in.
*
* @param settings
* @param settings QGraphSettings
*/
def initializeWithSettings(settings: QGraphSettings) {
this.settings = settings
@@ -430,6 +431,7 @@ class QGraph extends Logging {
val edge = readyJobs.head
edge.runner = newRunner(edge.function)
edge.start()
messengers.foreach(_.started(jobShortName(edge.function)))
startedJobs += edge
readyJobs -= edge
logNextStatusCounts = true
@@ -465,8 +467,14 @@
updateStatus()
runningJobs.foreach(edge => edge.status match {
case RunnerStatus.DONE => doneJobs += edge
case RunnerStatus.FAILED => failedJobs += edge
case RunnerStatus.DONE => {
doneJobs += edge
messengers.foreach(_.done(jobShortName(edge.function)))
}
case RunnerStatus.FAILED => {
failedJobs += edge
messengers.foreach(_.exit(jobShortName(edge.function), edge.function.jobErrorLines.mkString("%n".format())))
}
case RunnerStatus.RUNNING => /* do nothing while still running */
})
@@ -493,7 +501,7 @@
// incremental
if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) {
logger.info("Writing incremental jobs reports...")
writeJobsReport(false)
writeJobsReport(plot = false)
}
readyJobs ++= getReadyJobs
@@ -516,9 +524,13 @@
private def nextRunningCheck(lastRunningCheck: Long) =
((30 * 1000L) - (System.currentTimeMillis - lastRunningCheck))
def formattedStatusCounts: String = {
"%d Pend, %d Run, %d Fail, %d Done".format(
statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done)
}
private def logStatusCounts() {
logger.info("%d Pend, %d Run, %d Fail, %d Done".format(
statusCounts.pending, statusCounts.running, statusCounts.failed, statusCounts.done))
logger.info(formattedStatusCounts)
}
/**
@@ -533,6 +545,16 @@
traverseFunctions(edge => recheckDone(edge))
}
// TODO: Yet another field to add (with overloads) to QFunction?
private def jobShortName(function: QFunction): String = {
var name = function.analysisName
if (function.isInstanceOf[CloneFunction]) {
val cloneFunction = function.asInstanceOf[CloneFunction]
name += " %d of %d".format(cloneFunction.cloneIndex, cloneFunction.cloneCount)
}
name
}
/**
* First pass that checks if an edge is done or, if it's an intermediate edge, whether it can be skipped.
* This function may modify the status of previous edges if it discovers that the edge passed in


@@ -1,10 +1,17 @@
package org.broadinstitute.sting.queue.engine
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.queue.util.RemoteFile
/**
* Plugin to send QStatus messages
*/
trait QStatusMessenger {
def started()
def done()
def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]], outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]])
def exit(message: String)
def started(job: String)
def done(job: String)
def exit(job: String, message: String)
}
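For illustration, a messenger satisfying both the run-level and the new per-job callbacks could be as small as the following sketch (ConsoleStatusMessenger is hypothetical and not part of this commit):
package org.broadinstitute.sting.queue.engine
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.queue.util.RemoteFile
// Hypothetical sketch: prints every status transition to stdout.
class ConsoleStatusMessenger extends QStatusMessenger {
  def started() { println("run started") }
  def done(inputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]],
           outputs: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) {
    println("run done: %d input maps, %d output maps".format(inputs.size, outputs.size))
  }
  def exit(message: String) { println("run failed: " + message) }
  def started(job: String) { println("job started: " + job) }
  def done(job: String) { println("job done: " + job) }
  def exit(job: String, message: String) { println("job failed: %s: %s".format(job, message)) }
}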


@@ -28,6 +28,7 @@ import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor
import org.broadinstitute.sting.queue.util.ClassFieldCache
/**
* Merges BAM files using net.sf.picard.sam.MergeSamFiles.
@@ -47,13 +48,13 @@ class BamGatherFunction extends GatherFunction with PicardBamFunction with Retry
// bam_compression and index_output_bam_on_the_fly from SAMFileWriterArgumentTypeDescriptor
// are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK
val compression = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME)
val compression = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME)
this.compressionLevel = originalGATK.getFieldValue(compression).asInstanceOf[Option[Int]]
val disableIndex = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME)
val disableIndex = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME)
this.createIndex = Some(!originalGATK.getFieldValue(disableIndex).asInstanceOf[Boolean])
val enableMD5 = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME)
val enableMD5 = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME)
this.createMD5 = Some(originalGATK.getFieldValue(enableMD5).asInstanceOf[Boolean])
super.freezeFieldValues()


@@ -27,6 +27,7 @@ package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
import org.broadinstitute.sting.queue.util.ClassFieldCache
/**
* Merges a vcf text file.
@@ -38,7 +39,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMe
private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
override def freezeFieldValues() {
this.jarFile = this.originalGATK.jarFile
this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) }
this.out = this.originalOutput
GATKIntervals.copyIntervalArguments(this.originalGATK, this)
@@ -46,10 +46,10 @@ class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMe
// NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor
// are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK
val noHeader = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.NO_HEADER_ARG_NAME)
val noHeader = ClassFieldCache.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.NO_HEADER_ARG_NAME)
this.no_cmdline_in_header = originalGATK.getFieldValue(noHeader).asInstanceOf[Boolean]
val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME)
val sitesOnly = ClassFieldCache.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME)
this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean]
// ensure that the gather function receives the same unsafe parameter as the scattered function


@@ -2,6 +2,7 @@ package org.broadinstitute.sting.queue.extensions.picard
import org.broadinstitute.sting.commandline.{Argument, Output, Input}
import java.io.File
import net.sf.picard.analysis.MetricAccumulationLevel
/**
* Created with IntelliJ IDEA.
@@ -10,9 +11,8 @@ import java.io.File
* Time: 5:59 PM
* To change this template use File | Settings | File Templates.
*/
class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction {
analysisName = "CalculateHsMetrics"
javaMainClass = "net.sf.picard.sam.CalculateHsMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@@ -28,33 +28,15 @@ class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCom
@Argument(doc="Reference file", shortName = "reference", fullName = "reference", required = true)
var reference: File = _
/*
@Argument(doc = "Maximum number of file handles to keep open when spilling read ends to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_file_handles", fullName ="max_file_handles_for_read_ends_maps", required=false)
var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1;
@Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false)
var SORTING_COLLECTION_SIZE_RATIO: Double = -1
*/
override def freezeFieldValues() {
super.freezeFieldValues()
// if (outputIndex == null && output != null)
// outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
}
val level = "SAMPLE"
val level = MetricAccumulationLevel.SAMPLE
override def inputBams = input
override def outputBam = output
//this.sortOrder = null
//this.createIndex = Some(true)
override def outputFile = output
override def commandLine = super.commandLine +
required("BAIT_INTERVALS=" + baits) +
required("TARGET_INTERVALS=" + targets) +
required("REFERENCE_SEQUENCE=" + reference) +
optional("METRIC_ACCUMULATION_LEVEL="+level)/*+
conditional(REMOVE_DUPLICATES, "REMOVE_DUPLICATES=true") +
conditional(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) +
conditional(SORTING_COLLECTION_SIZE_RATIO > 0, "SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) */
optional("METRIC_ACCUMULATION_LEVEL="+level)
}


@@ -10,9 +10,8 @@ import java.io.File
* Time: 10:37 AM
* To change this template use File | Settings | File Templates.
*/
class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
analysisName = "CalculateGcMetrics"
javaMainClass = "net.sf.picard.sam.CalculateGcMetrics"
class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction {
analysisName = "CollectGcBiasMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@@ -24,8 +23,9 @@ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaC
var reference: File = _
override def inputBams = input
override def outputBam = output
override def outputFile = output
override def commandLine = super.commandLine +
required("SUMMARY_OUTPUT=" + output) +
required("CHART_OUTPUT=" + output+".pdf") +
required("REFERENCE_SEQUENCE=" + reference) +
required("ASSUME_SORTED=true")


@@ -10,9 +10,8 @@ import java.io.File
* Time: 10:37 AM
* To change this template use File | Settings | File Templates.
*/
class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction{
analysisName = "CalculateMultipleMetrics"
javaMainClass = "net.sf.picard.sam.CalculateMultipleMetrics"
class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction{
analysisName = "CollectMultipleMetrics"
@Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
var input: Seq[File] = Nil
@@ -24,7 +23,7 @@ class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.Jav
var reference: File = _
override def inputBams = input
override def outputBam = output
override def outputFile = output
override def commandLine = super.commandLine +
required("REFERENCE_SEQUENCE=" + reference) +
required("ASSUME_SORTED=true") +


@@ -0,0 +1,53 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.picard
import java.io.File
import org.broadinstitute.sting.queue.function.JavaCommandLineFunction
import net.sf.samtools.SAMFileReader.ValidationStringency
import net.sf.samtools.SAMFileHeader.SortOrder
/**
* Wraps a Picard function that operates on BAM files but doesn't output a new BAM file (e.g. tools that write QC metrics files).
* See http://picard.sourceforge.net/ for more info.
*
* Since the various BAM utilities take slightly different arguments
* some values are optional.
*/
trait PicardMetricsFunction extends JavaCommandLineFunction {
var validationStringency = ValidationStringency.SILENT
var maxRecordsInRam: Option[Int] = None
var assumeSorted: Option[Boolean] = None
protected def inputBams: Seq[File]
protected def outputFile: File
abstract override def commandLine = super.commandLine +
repeat("INPUT=", inputBams, spaceSeparated=false) +
required("TMP_DIR=" + jobTempDir) +
optional("VALIDATION_STRINGENCY=", validationStringency, spaceSeparated=false) +
optional("OUTPUT=", outputFile, spaceSeparated=false) +
optional("MAX_RECORDS_IN_RAM=", maxRecordsInRam, spaceSeparated=false) +
optional("ASSUME_SORTED=", assumeSorted, spaceSeparated=false)
}
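Other metrics-only Picard tools can be wrapped the same way. A hypothetical sketch, not part of this commit (the Picard main class and the HISTOGRAM_FILE argument are assumptions):
package org.broadinstitute.sting.queue.extensions.picard
import java.io.File
import org.broadinstitute.sting.commandline.{Input, Output}
// Hypothetical sketch: only inputBams, outputFile and any tool-specific
// arguments need to be supplied on top of PicardMetricsFunction.
class CollectInsertSizeMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction {
  analysisName = "CollectInsertSizeMetrics"
  javaMainClass = "net.sf.picard.analysis.CollectInsertSizeMetrics" // assumed class name
  @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true)
  var input: Seq[File] = Nil
  @Output(doc="The output metrics file.", shortName = "output", fullName = "output_file", required = true)
  var output: File = _
  override def inputBams = input
  override def outputFile = output
  override def commandLine = super.commandLine +
    required("HISTOGRAM_FILE=" + output + ".pdf") // assumed argument
}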


@@ -28,7 +28,6 @@ import java.io.File
import java.lang.annotation.Annotation
import org.broadinstitute.sting.commandline._
import org.broadinstitute.sting.queue.{QException, QSettings}
import collection.JavaConversions._
import java.lang.IllegalStateException
import org.broadinstitute.sting.queue.util._
import org.broadinstitute.sting.utils.io.IOUtils
@@ -194,13 +193,13 @@ trait QFunction extends Logging with QJobReport {
def failOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".fail"))
/** The complete list of fields on this CommandLineFunction. */
def functionFields = QFunction.classFields(this.functionFieldClass).functionFields
def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.functionFieldClass)
/** The @Input fields on this CommandLineFunction. */
def inputFields = QFunction.classFields(this.functionFieldClass).inputFields
def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.functionFieldClass)
/** The @Output fields on this CommandLineFunction. */
def outputFields = QFunction.classFields(this.functionFieldClass).outputFields
def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.functionFieldClass)
/** The @Argument fields on this CommandLineFunction. */
def argumentFields = QFunction.classFields(this.functionFieldClass).argumentFields
def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.functionFieldClass)
/**
* Returns the class that should be used for looking up fields.
@@ -475,79 +474,12 @@ trait QFunction extends Logging with QJobReport {
* @param source Field to get the value for.
* @return value of the field.
*/
def getFieldValue(source: ArgumentSource) = ReflectionUtils.getValue(invokeObj(source), source.field)
def getFieldValue(source: ArgumentSource) = ClassFieldCache.getFieldValue(this, source)
/**
* Sets the value of a field.
* @param source Field to set the value for.
* @param value Value to set for the field.
*/
def setFieldValue(source: ArgumentSource, value: Any) = ReflectionUtils.setValue(invokeObj(source), source.field, value)
/**
* Walks the fields in this object, or any collections in that object,
* recursively to find the object holding the field to be retrieved or set.
* @param source Field to find the invoke object for.
* @return Object to invoke the field on.
*/
private def invokeObj(source: ArgumentSource) = source.parentFields.foldLeft[AnyRef](this)(ReflectionUtils.getValue(_, _))
}
object QFunction {
var parsingEngine: ParsingEngine = _
/**
* The list of fields defined on a class
* @param clazz The class to lookup fields.
*/
private class ClassFields(clazz: Class[_]) {
/** The complete list of fields on this CommandLineFunction. */
val functionFields: Seq[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toSeq
/** The @Input fields on this CommandLineFunction. */
val inputFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Input]))
/** The @Output fields on this CommandLineFunction. */
val outputFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Output]))
/** The @Argument fields on this CommandLineFunction. */
val argumentFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Argument]))
}
/**
* The mapping from class to fields.
*/
private var classFieldsMap = Map.empty[Class[_], ClassFields]
/**
* Returns the field on clazz.
* @param clazz Class to search.
* @param name Name of the field to return.
* @return Argument source for the field.
*/
def findField(clazz: Class[_], name: String) = {
classFields(clazz).functionFields.find(_.field.getName == name) match {
case Some(source) => source
case None => throw new QException("Could not find a field on class %s with name %s".format(clazz, name))
}
}
/**
* Returns the fields for a class.
* @param clazz Class to retrieve fields for.
* @return the fields for the class.
*/
private def classFields(clazz: Class[_]) = {
classFieldsMap.get(clazz) match {
case Some(classFields) => classFields
case None =>
val classFields = new ClassFields(clazz)
classFieldsMap += clazz -> classFields
classFields
}
}
/**
* Returns the Seq of fields for a QFunction class.
* @param clazz Class to retrieve fields for.
* @return the fields of the class.
*/
def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields
def setFieldValue(source: ArgumentSource, value: Any) = ClassFieldCache.setFieldValue(this, source, value)
}


@@ -25,18 +25,20 @@
package org.broadinstitute.sting.queue.function.scattergather
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction}
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.util.ClassFieldCache
/**
* Shadow clones another command line function.
*/
object CloneFunction {
private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction])
private lazy val cloneFunctionFields = ClassFieldCache.classFunctionFields(classOf[CloneFunction])
}
class CloneFunction extends CommandLineFunction {
var originalFunction: ScatterGatherableFunction = _
var cloneIndex: Int = _
var cloneCount: Int = _
private var overriddenFields = Map.empty[ArgumentSource, Any]
private var withScatterPartCount = 0
@@ -76,7 +78,7 @@ class CloneFunction extends CommandLineFunction {
def commandLine = withScatterPart(() => originalFunction.commandLine)
def getFieldValue(field: String): AnyRef = {
val source = QFunction.findField(originalFunction.getClass, field)
val source = ClassFieldCache.findField(originalFunction.getClass, field)
getFieldValue(source)
}
@@ -98,7 +100,7 @@ class CloneFunction extends CommandLineFunction {
}
def setFieldValue(field: String, value: Any) {
val source = QFunction.findField(originalFunction.getClass, field)
val source = ClassFieldCache.findField(originalFunction.getClass, field)
setFieldValue(source, value)
}


@@ -176,6 +176,7 @@ trait ScatterGatherableFunction extends CommandLineFunction {
cloneFunction.originalFunction = this
cloneFunction.analysisName = this.analysisName
cloneFunction.cloneIndex = i
cloneFunction.cloneCount = numClones
cloneFunction.commandDirectory = this.scatterGatherTempDir(dirFormat.format(i))
cloneFunction.jobOutputFile = if (IOUtils.isSpecialFile(this.jobOutputFile)) this.jobOutputFile else new File(this.jobOutputFile.getName)
if (this.jobErrorFile != null)


@@ -0,0 +1,194 @@
package org.broadinstitute.sting.queue.util
import org.broadinstitute.sting.commandline._
import scala.Some
import org.broadinstitute.sting.queue.QException
import collection.JavaConversions._
import java.io.File
/**
* Utilities and a static cache of argument fields for various classes populated by the parsingEngine.
* Because this class works with the ParsingEngine it can walk @ArgumentCollection hierarchies.
*/
object ClassFieldCache {
var parsingEngine: ParsingEngine = _
//
// Field caching
//
/**
* The list of fields defined on a class
* @param clazz The class to lookup fields.
*/
private class ClassFields(clazz: Class[_]) {
/** The complete list of fields on this CommandLineFunction. */
val functionFields: Seq[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toSeq
/** The @Input fields on this CommandLineFunction. */
val inputFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Input]))
/** The @Output fields on this CommandLineFunction. */
val outputFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Output]))
/** The @Argument fields on this CommandLineFunction. */
val argumentFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Argument]))
}
/**
* The mapping from class to fields.
*/
private var classFieldsMap = Map.empty[Class[_], ClassFields]
/**
* Returns the fields for a class.
* @param clazz Class to retrieve fields for.
* @return the fields for the class.
*/
private def classFields(clazz: Class[_]): ClassFields = {
classFieldsMap.get(clazz) match {
case Some(classFields) => classFields
case None =>
val classFields = new ClassFields(clazz)
classFieldsMap += clazz -> classFields
classFields
}
}
/**
* Returns the field on clazz.
* @param clazz Class to search.
* @param name Name of the field to return.
* @return Argument source for the field.
*/
def findField(clazz: Class[_], name: String): ArgumentSource = {
classFields(clazz).functionFields.find(_.field.getName == name) match {
case Some(source) => source
case None => throw new QException("Could not find a field on class %s with name %s".format(clazz, name))
}
}
/**
* Returns the Seq of fields for a QFunction class.
* @param clazz Class to retrieve fields for.
* @return the fields of the class.
*/
def classFunctionFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).functionFields
/**
* Returns the Seq of inputs for a QFunction class.
* @param clazz Class to retrieve inputs for.
* @return the inputs of the class.
*/
def classInputFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).inputFields
/**
* Returns the Seq of outputs for a QFunction class.
* @param clazz Class to retrieve outputs for.
* @return the outputs of the class.
*/
def classOutputFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).outputFields
/**
* Returns the Seq of arguments for a QFunction class.
* @param clazz Class to retrieve arguments for.
* @return the arguments of the class.
*/
def classArgumentFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).argumentFields
//
// get/set fields as AnyRef
//
/**
* Gets the value of a field.
* @param obj Top level object storing the source info.
* @param source Field to get the value for.
* @return value of the field.
*/
def getFieldValue(obj: AnyRef, source: ArgumentSource) = ReflectionUtils.getValue(invokeObj(obj, source), source.field)
/**
* Sets the value of a field.
* @param obj Top level object storing the source info.
* @param source Field to set the value for.
* @param value Value to set for the field.
*/
def setFieldValue(obj: AnyRef, source: ArgumentSource, value: Any) = ReflectionUtils.setValue(invokeObj(obj, source), source.field, value)
/**
* Walks the fields in this object, or any collections in that object,
* recursively to find the object holding the field to be retrieved or set.
* @param obj Top level object storing the source info.
* @param source Field to find the invoke object for.
* @return Object to invoke the field on.
*/
private def invokeObj(obj: AnyRef, source: ArgumentSource) = source.parentFields.foldLeft[AnyRef](obj)(ReflectionUtils.getValue(_, _))
//
// get/set fields as java.io.File
//
/**
* Gets the files from the fields. The fields must be a File, a FileExtension, or a Seq or Set of either.
* @param obj Top level object storing the source info.
* @param fields Fields to get files.
* @return the files for the fields.
*/
def getFieldFiles(obj: AnyRef, fields: Seq[ArgumentSource]): Seq[File] = {
var files: Seq[File] = Nil
for (field <- fields)
files ++= getFieldFiles(obj, field)
files.distinct
}
/**
* Gets the files from the field. The field must be a File, a FileExtension, or a Seq or Set of either.
* @param obj Top level object storing the source info.
* @param field Field to get files.
* @return the files for the field.
*/
def getFieldFiles(obj: AnyRef, field: ArgumentSource): Seq[File] = {
var files: Seq[File] = Nil
CollectionUtils.foreach(getFieldValue(obj, field), (fieldValue) => {
val file = fieldValueToFile(field, fieldValue)
if (file != null)
files :+= file
})
files.distinct
}
/**
* Gets the file from the field. The field must be a File or a FileExtension and not a Seq or Set.
* @param obj Top level object storing the source info.
* @param field Field to get the file.
* @return the file for the field.
*/
def getFieldFile(obj: AnyRef, field: ArgumentSource): File =
fieldValueToFile(field, getFieldValue(obj, field))
/**
* Converts the field value to a file. The field must be a File or a FileExtension.
* @param field Field to get the file.
* @param value Value of the File or FileExtension or null.
* @return Null if value is null, otherwise the File.
* @throws QException if the value is not a File or FileExtension.
*/
private def fieldValueToFile(field: ArgumentSource, value: Any): File = value match {
case file: File => file
case null => null
case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown))
}
//
// other utilities
//
/**
* Retrieves the fullName of the argument
* @param field ArgumentSource to check
* @return Full name of the argument source
*/
def fullName(field: ArgumentSource) = field.createArgumentDefinitions().get(0).fullName
}
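Call sites such as BamGatherFunction and VcfGatherFunction above look a field up by name and then read or write it through the cache. A minimal hypothetical usage sketch, assuming parsingEngine has already been set by QCommandLine and that the target class declares the named field:
package org.broadinstitute.sting.queue.util
import org.broadinstitute.sting.queue.function.QFunction
// Hypothetical usage sketch, not part of this commit; the field name
// "compression_level" is an assumption for illustration.
object ClassFieldCacheExample {
  def bumpCompression(function: QFunction) {
    val source = ClassFieldCache.findField(function.getClass, "compression_level")
    val before = ClassFieldCache.getFieldValue(function, source)
    ClassFieldCache.setFieldValue(function, source, Some(9))
    println("compression_level: %s -> Some(9)".format(before))
  }
}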


@@ -159,12 +159,11 @@ object ReflectionUtils {
private def getGenericTypes(field: Field): Option[Array[Class[_]]] = {
// TODO: Refactor: based on java code in org.broadinstitute.sting.commandline.ArgumentTypeDescriptor
// If this is a parameterized collection, find the contained type(s).
if (field.getGenericType.isInstanceOf[ParameterizedType]) {
if (hasAnnotation(field, classOf[ClassType])) {
Some(Array(getAnnotation(field, classOf[ClassType]).value))
} else if (field.getGenericType.isInstanceOf[ParameterizedType]) {
val parameterizedType = field.getGenericType.asInstanceOf[ParameterizedType]
Some(parameterizedType.getActualTypeArguments.map(_.asInstanceOf[Class[_]]))
} else if (hasAnnotation(field, classOf[ClassType])) {
Some(Array(getAnnotation(field, classOf[ClassType]).value))
}
else None
} else None
}
}
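The reorder matters for fields like the Option[Int] and Option[Double] settings above: a primitive element type of a Scala Option surfaces as Object through Java generics reflection, so the explicit @ClassType annotation has to take precedence over the ParameterizedType branch. A hedged illustration (the field is hypothetical):
package org.broadinstitute.sting.queue
import org.broadinstitute.sting.commandline.{Argument, ClassType}
// Hypothetical field, not part of this commit: without @ClassType the
// parser cannot recover Int from Option[Int] via reflection alone.
class ExampleSettings {
  @Argument(fullName="example_limit", shortName="exLimit", doc="Hypothetical limit.", required=false)
  @ClassType(classOf[Int])
  var exampleLimit: Option[Int] = None
}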

Some files were not shown because too many files have changed in this diff.