Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ami Levy Moonshine 2012-10-15 11:19:12 -04:00
commit 0d93effa4d
89 changed files with 4863 additions and 1586 deletions

View File

@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.incr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.decr(base, baseQual);
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // do not allow Ns
@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts {
}
}
public byte averageInsertionQualsOfMostCommonBase() {
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
public byte averageInsertionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfBase(base, sumInsertionQuals);
}
public byte averageDeletionQualsOfMostCommonBase() {
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
public byte averageDeletionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfBase(base, sumDeletionQuals);
}
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
return (byte) (sumQuals.get(base) / getCount(base));
}
}

View File

@ -41,26 +41,26 @@ import java.util.Map;
@Requires("other != null")
public void add(BaseCounts other) {
for (BaseIndex i : BaseIndex.values())
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) + other.counts.get(i));
}
@Requires("other != null")
public void sub(BaseCounts other) {
for (BaseIndex i : BaseIndex.values())
for (final BaseIndex i : BaseIndex.values())
counts.put(i, counts.get(i) - other.counts.get(i));
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) + 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base, byte qual) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) + 1);
sumQuals.put(i, sumQuals.get(i) + qual);
@ -69,14 +69,14 @@ import java.util.Map;
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) - 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base, byte qual) {
BaseIndex i = BaseIndex.byteToBase(base);
final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) - 1);
sumQuals.put(i, sumQuals.get(i) - qual);
@ -84,52 +84,48 @@ import java.util.Map;
}
@Ensures("result >= 0")
public int getCount(byte base) {
public int getCount(final byte base) {
return getCount(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public int getCount(BaseIndex base) {
public int getCount(final BaseIndex base) {
return counts.get(base);
}
@Ensures("result >= 0")
public long getSumQuals(byte base) {
public long getSumQuals(final byte base) {
return getSumQuals(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public long getSumQuals(BaseIndex base) {
public long getSumQuals(final BaseIndex base) {
return sumQuals.get(base);
}
@Ensures("result >= 0")
public byte averageQuals(byte base) {
public byte averageQuals(final byte base) {
return (byte) (getSumQuals(base) / getCount(base));
}
@Ensures("result >= 0")
public byte averageQuals(BaseIndex base) {
public byte averageQuals(final BaseIndex base) {
return (byte) (getSumQuals(base) / getCount(base));
}
public byte baseWithMostCounts() {
return baseIndexWithMostCounts().getByte();
@Ensures("result >= 0")
public int countOfBase(final BaseIndex base) {
return counts.get(base);
}
@Ensures("result >= 0")
public int countOfMostCommonBase() {
return counts.get(baseIndexWithMostCounts());
public long sumQualsOfBase(final BaseIndex base) {
return sumQuals.get(base);
}
@Ensures("result >= 0")
public long sumQualsOfMostCommonBase() {
return sumQuals.get(baseIndexWithMostCounts());
}
@Ensures("result >= 0")
public byte averageQualsOfMostCommonBase() {
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
public byte averageQualsOfBase(final BaseIndex base) {
return (byte) (sumQualsOfBase(base) / countOfBase(base));
}
@ -149,7 +145,7 @@ import java.util.Map;
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(byte base) {
public double baseCountProportion(final byte base) {
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
}
@ -160,7 +156,7 @@ import java.util.Map;
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(BaseIndex baseIndex) {
public double baseCountProportion(final BaseIndex baseIndex) {
int total = totalCount();
if (total == 0)
return 0.0;
@ -177,22 +173,28 @@ import java.util.Map;
return b.toString();
}
public byte baseWithMostCounts() {
return baseIndexWithMostCounts().getByte();
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostCounts() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex i : counts.keySet())
if (hasHigherCount(i, maxI))
maxI = i;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
}
return maxI;
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex index : counts.keySet())
if (index.isNucleotide() && hasHigherCount(index, mostCounts))
mostCounts = index;
return mostCounts;
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
maxI = entry.getKey();
}
return maxI;
}
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
@ -201,6 +203,30 @@ import java.util.Map;
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
}
public byte baseWithMostProbability() {
return baseIndexWithMostProbability().getByte();
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbability() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
maxI = entry.getKey();
}
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
}
@Ensures("result >=0")
public int totalCountWithoutIndels() {
int sum = 0;
@ -218,8 +244,8 @@ import java.util.Map;
*/
@Requires("index.isNucleotide()")
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportionWithoutIndels(BaseIndex index) {
int total = totalCountWithoutIndels();
public double baseCountProportionWithoutIndels(final BaseIndex index) {
final int total = totalCountWithoutIndels();
if (total == 0)
return 0.0;
return (double) counts.get(index) / totalCountWithoutIndels();

View File

@ -182,7 +182,7 @@ public class HeaderElement {
* @return whether or not the HeaderElement is variant due to excess insertions
*/
private boolean isVariantFromMismatches(double minVariantProportion) {
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
}

View File

@ -55,7 +55,7 @@ public class SlidingWindow {
private final int nContigs;
private boolean allowPolyploidReduction;
private boolean allowPolyploidReductionInGeneral;
/**
* The types of synthetic reads to use in the finalizeAndAdd method
@ -117,7 +117,7 @@ public class SlidingWindow {
this.hasIndelQualities = hasIndelQualities;
this.nContigs = nContigs;
this.allowPolyploidReduction = allowPolyploidReduction;
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
}
/**
@ -207,8 +207,9 @@ public class SlidingWindow {
finalizedReads = closeVariantRegions(regions, false);
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getSoftEnd() < getStartLocation(windowHeader)) {
final int windowHeaderStartLoc = getStartLocation(windowHeader);
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
if (read.getSoftEnd() < windowHeaderStartLoc) {
readsToRemove.add(read);
}
}
@ -291,7 +292,7 @@ public class SlidingWindow {
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
addToFilteredData(header, start, endOfFilteredData, isNegativeStrand);
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
if (endOfFilteredData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
@ -418,7 +419,9 @@ public class SlidingWindow {
* @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus
*/
private void addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
if (filteredDataConsensus == null)
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
@ -434,8 +437,15 @@ public class SlidingWindow {
if (!headerElement.hasFilteredData())
throw new ReviewedStingException("No filtered data in " + index);
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
result.add(finalizeFilteredDataConsensus());
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
}
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
}
return result;
}
/**
@ -472,15 +482,15 @@ public class SlidingWindow {
* @param rms the rms mapping quality in the header element
*/
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
BaseIndex base = baseCounts.baseIndexWithMostCounts();
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfMostCommonBase();
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfBase(base);
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
syntheticRead.add(base, count, qual, insQual, delQual, rms);
}
private List<GATKSAMRecord> compressVariantRegion(int start, int stop) {
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
// Try to compress into a polyploid consensus
@ -490,7 +500,8 @@ public class SlidingWindow {
boolean foundEvent = false;
Object[] header = windowHeader.toArray();
if ( allowPolyploidReduction ) { // foundEvent will remain false if we don't allow polyploid reduction
// foundEvent will remain false if we don't allow polyploid reduction
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
for (int i = start; i<=stop; i++) {
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
if (nHaplotypes > nContigs) {
@ -512,9 +523,6 @@ public class SlidingWindow {
}
}
int refStart = windowHeader.get(start).getLocation();
int refStop = windowHeader.get(stop).getLocation();
// Try to compress the variant region
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
if (canCompress && foundEvent) {
@ -524,6 +532,9 @@ public class SlidingWindow {
// Return all reads that overlap the variant region and remove them from the window header entirely
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
else {
final int refStart = windowHeader.get(start).getLocation();
final int refStop = windowHeader.get(stop).getLocation();
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) {
if (read.getSoftStart() <= refStop) {
@ -549,8 +560,8 @@ public class SlidingWindow {
* @return all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop);
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
@ -570,7 +581,7 @@ public class SlidingWindow {
if (stop < 0 && forceClose)
stop = windowHeader.size() - 1;
if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop));
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
lastStop = stop;
}
}

View File

@ -44,7 +44,7 @@ public class SyntheticRead {
private String contig;
private int contigIndex;
private String readName;
private Integer refStart;
private int refStart;
private boolean hasIndelQualities = false;
private boolean isNegativeStrand = false;
@ -60,7 +60,7 @@ public class SyntheticRead {
* @param refStart the alignment start (reference based)
* @param readTag the reduce reads tag for the synthetic read
*/
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
final int initialCapacity = 10000;
bases = new ArrayList<BaseIndex>(initialCapacity);
counts = new ArrayList<Byte>(initialCapacity);
@ -80,7 +80,7 @@ public class SyntheticRead {
this.isNegativeStrand = isNegativeRead;
}
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities, boolean isNegativeRead) {
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
this.bases = bases;
this.counts = counts;
this.quals = quals;
@ -115,11 +115,15 @@ public class SyntheticRead {
this.mappingQuality += mappingQuality;
}
public BaseIndex getBase(int readCoordinate) {
public BaseIndex getBase(final int readCoordinate) {
return bases.get(readCoordinate);
}
/**
public int getRefStart() {
return refStart;
}
/**
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
*
* Invalid reads are :

View File

@ -26,6 +26,8 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
*
*
*/
protected static class SumIterator {
public static class SumIterator {
private int[] currentState;
private final int[] finalState;
private final int restrictSumTo;
@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
// and we repeat until queue is empty
// queue of AC conformations to process
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(likelihoodDim);
// add AC=0 to the queue
final int[] zeroCounts = new int[nAlleles];
zeroCounts[0] = numChromosomes;
AlleleFrequencyCalculationModel.ExactACset zeroSet =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
ExactACset zeroSet =
new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
// adjust max likelihood seen if needed
maxLog10L = Math.max(maxLog10L, log10LofKs);
// clean up memory
indexesToACset.remove(ACset.ACcounts);
indexesToACset.remove(ACset.getACcounts());
if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
}
@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
int plIdx = 0;
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
while (iterator.hasNext()) {
AlleleFrequencyCalculationModel.ExactACset ACset =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
ExactACset ACset =
new ExactACset(1, new ExactACcounts(iterator.getCurrentVector()));
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]);
iterator.next();
}
}
@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
}
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
private double calculateACConformationAndUpdateQueue(final ExactACset set,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final double maxLog10L,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts,
ExactACset> indexesToACset,
final ReadBackedPileup pileup) {
// compute likelihood of set
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
final double log10LofK = set.log10Likelihoods[0];
final double log10LofK = set.getLog10Likelihoods()[0];
// log result in PL vector
int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes);
setLogPLs(idx, log10LofK);
// can we abort early because the log10Likelihoods are so small?
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L);
return log10LofK;
}
// iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
// add conformations for other cases
for ( int allele = 1; allele < nAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++;
// is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* @param numObservations Number of observations for each allele
* @param pileup Read backed pileup in case it's necessary
*/
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public abstract void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// Static methods
public static void updateACset(final int[] newSetCounts,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
final ExactACcounts index = new ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) {
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
ExactACset newSet = new ExactACset(1, index);
indexesToACset.put(index, newSet);
ACqueue.add(newSet);
if (VERBOSE)

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
@ -188,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final ReadBackedPileup pileup) {
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
double p1 = 0.0;
if (!hasReferenceSampleData) {
@ -218,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
}
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
}
ACset.log10Likelihoods[0] = p1;
ACset.getLog10Likelihoods()[0] = p1;
}
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
@ -12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
* @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList
*/
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
public void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel,
final List<Allele> alleleList,
final List<Integer> numObservations,
final ReadBackedPileup pileup) {
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length);
final int[] ac = new int[BaseUtils.BASES.length];
for (int k=0; k < BaseUtils.BASES.length; k++ )
@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
final byte qual = qualToUse(elt, true, true, mbq);
if ( qual == 0 )
continue;
final double acc[] = new double[ACset.ACcounts.counts.length];
final double acc[] = new double[ACset.getACcounts().getCounts().length];
for (int k=0; k < acc.length; k++ )
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]]
- LOG10_PLOIDY;
p1 += MathUtils.log10sumLog10(acc);
}
@ -264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
}
ACset.log10Likelihoods[0] = p1;
ACset.getLog10Likelihoods()[0] = p1;
/* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
*/

View File

@ -0,0 +1,277 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.TTCCLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.*;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 10/2/12
* Time: 10:25 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Standalone performance harness for the AFCalc allele-frequency calculation models.
 *
 * Runs timed sweeps (by AC/PL conformation, by singleton position, by number of
 * non-informative samples) over synthetic VariantContexts built with
 * AFCalcTestBuilder and writes per-run timings and evaluation counts as
 * GATKReport tables.
 *
 * Usage: ANALYZE outputFile  |  SINGLE nSamples ac
 */
public class AFCalcPerformanceTest {
    final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);

    /**
     * Base class for one named sweep.  Each concrete analysis owns a single
     * GATKReport table and appends one row per timed AFCalc invocation.
     */
    private static abstract class Analysis {
        final GATKReport report;

        public Analysis(final String name, final List<String> columns) {
            report = GATKReport.newSimpleReport(name, columns);
        }

        /**
         * Run the sweep for testBuilder, appending coreColumns plus the
         * analysis-specific measurement columns for each data point.
         *
         * @param testBuilder factory for the models/VCs under test
         * @param coreColumns shared leading column values for every row
         */
        public abstract void run(final AFCalcTestBuilder testBuilder,
                                 final List<Object> coreColumns);

        public String getName() {
            return getTable().getTableName();
        }

        public GATKReportTable getTable() {
            // newSimpleReport creates exactly one table
            return report.getTables().iterator().next();
        }
    }

    /** Sweeps allele-count (AC) conformations for a fixed non-type PL value. */
    private static class AnalyzeByACAndPL extends Analysis {
        public AnalyzeByACAndPL(final List<String> columns) {
            super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();
            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();
                for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
                    final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
                    final long runtime = timer.getElapsedTimeNano();

                    // summarize the conformation: number of segregating alt alleles
                    // and the total AC carried by the non-primary alt alleles
                    int otherAC = 0;
                    int nAltSeg = 0;
                    for ( int i = 0; i < ACs.length; i++ ) {
                        nAltSeg += ACs[i] > 0 ? 1 : 0;
                        if ( i > 0 ) otherAC += ACs[i];
                    }

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
                    report.addRowList(columns);
                }
            }
        }

        /**
         * Enumerate the AC vectors to test: a log10-linear range of counts for a
         * single alt allele, or all feasible pairs of such counts for two.
         *
         * @param nAltAlleles number of alternate alleles (1 or 2)
         * @param nChrom      total number of chromosomes (caps the AC sum)
         */
        private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
            if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");

            final List<int[]> ACs = new LinkedList<int[]>();
            final List<Integer> ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1);
            for ( int i : ACsToTry ) {
                if ( i < nChrom ) {
                    if ( nAltAlleles == 1 ) {
                        ACs.add(new int[]{i});
                    } else if ( nAltAlleles == 2 ) {
                        for ( int j : ACsToTry ) {
                            if ( j < nChrom - i ) // total alt AC cannot exceed nChrom
                                ACs.add(new int[]{i, j});
                        }
                    } else {
                        throw new IllegalStateException("cannot get here");
                    }
                }
            }
            return ACs;
        }
    }

    /** Times the calculation as a single het singleton is moved through the sample list. */
    private static class AnalyzeBySingletonPosition extends Analysis {
        public AnalyzeBySingletonPosition(final List<String> columns) {
            super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();
            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1; // exactly one alt chromosome => a single het carrier
                final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL);

                for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) {
                    // rotate the genotype list so the singleton carrier sits at 'position'
                    final VariantContextBuilder vcb = new VariantContextBuilder(vc);
                    final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
                    Collections.rotate(genotypes, position);
                    vcb.genotypes(genotypes);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position));
                    report.addRowList(columns);
                }
            }
        }
    }

    /** Times the calculation as the number of non-informative samples grows. */
    private static class AnalyzeByNonInformative extends Analysis {
        public AnalyzeByNonInformative(final List<String> columns) {
            super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();
            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1;
                for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) {
                    final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative));
                    report.addRowList(columns);
                }
            }
        }
    }

    /** Per-model limits on problem size, so slow models skip configurations they cannot handle. */
    private static class ModelParams {
        final AFCalcFactory.Calculation modelType;
        final int maxBiNSamples, maxTriNSamples;

        private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) {
            this.modelType = modelType;
            this.maxBiNSamples = maxBiNSamples;
            this.maxTriNSamples = maxTriNSamples;
        }

        /** True when this model can reasonably run nSamples samples with nAltAlleles alt alleles. */
        public boolean meetsConstraints(final int nAltAlleles, final int nSamples) {
            if ( nAltAlleles == 1 )
                return nSamples <= maxBiNSamples;
            else if ( nAltAlleles == 2 )
                return nSamples <= maxTriNSamples;
            else
                throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles);
        }
    }

    /** Top-level operating mode, selected by args[0]. */
    public enum Operation {
        ANALYZE,  // full sweep written as GATKReport tables
        SINGLE    // one big profiling run logged to the console
    }

    public static void main(final String[] args) throws Exception {
        final TTCCLayout layout = new TTCCLayout();
        layout.setThreadPrinting(false);
        layout.setCategoryPrefixing(false);
        layout.setContextPrinting(false);
        logger.addAppender(new ConsoleAppender(layout));

        final Operation op = Operation.valueOf(args[0]);
        switch ( op ) {
            case ANALYZE: analyze(args); break;
            case SINGLE: profileBig(args); break;
            // valueOf above already rejects unknown names; if a new constant is
            // ever added without a case here, a bad argument is an
            // IllegalArgumentException, not an (unrelated) IllegalAccessException
            default: throw new IllegalArgumentException("unknown operation " + op);
        }
    }

    /** Profile a single large EXACT_INDEPENDENT calculation: args = SINGLE nSamples ac */
    private static void profileBig(final String[] args) throws Exception {
        final int nSamples = Integer.valueOf(args[1]);
        final int ac = Integer.valueOf(args[2]);

        final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(nSamples, 1,
                AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                AFCalcTestBuilder.PriorType.human);

        final VariantContext vc = testBuilder.makeACTest(new int[]{ac}, 0, 100);
        final SimpleTimer timer = new SimpleTimer().start();
        final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vc, testBuilder.makePriors());
        final long runtime = timer.getElapsedTimeNano();
        logger.info("result " + resultTracker.getLog10PosteriorOfAFGT0());
        logger.info("runtime " + runtime);
    }

    /** Full sweep over models x sample counts x allele counts x priors: args = ANALYZE outputFile */
    private static void analyze(final String[] args) throws Exception {
        final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
                "exact.model", "prior.type", "runtime", "n.evaluations");

        final PrintStream out = new PrintStream(new FileOutputStream(args[1]));
        try {
            final List<ModelParams> modelParams = Arrays.asList(
                    new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
                    // new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
                    new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100),
                    new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));

            final boolean ONLY_HUMAN_PRIORS = false;
            final List<AFCalcTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
                    ? Arrays.asList(AFCalcTestBuilder.PriorType.values())
                    : Arrays.asList(AFCalcTestBuilder.PriorType.human);

            final List<Analysis> analyzes = new ArrayList<Analysis>();
            analyzes.add(new AnalyzeByACAndPL(coreColumns));
            analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
            //analyzes.add(new AnalyzeByNonInformative(coreColumns));

            for ( int iteration = 0; iteration < 1; iteration++ ) {
                for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
                    for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
                        for ( final ModelParams modelToRun : modelParams) {
                            // skip configurations this model is known to be too slow for
                            if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) {
                                for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) {
                                    final AFCalcTestBuilder testBuilder
                                            = new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType);

                                    for ( final Analysis analysis : analyzes ) {
                                        logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName())));
                                        // typed as List<Object> up front; avoids the unchecked (List<Object>) cast
                                        final List<Object> values = Arrays.<Object>asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType);
                                        analysis.run(testBuilder, values);
                                    }
                                }
                            }
                        }
                    }
                }
            }

            final GATKReport report = new GATKReport();
            for ( final Analysis analysis : analyzes )
                report.addTable(analysis.getTable());
            report.print(out);
        } finally {
            out.close(); // always release the output file, even if a sweep throws
        }
    }
}

View File

@ -0,0 +1,170 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Test-fixture factory for AFCalc tests: builds synthetic VariantContexts with
 * a requested number of samples, alternate alleles, allele-count conformation
 * and prior type, plus the matching AFCalc model instance under test.
 */
public class AFCalcTestBuilder {
    // fixed allele palette: reference A plus up to 6 alternates, used in order by getAlleles()
    final static Allele A = Allele.create("A", true);
    final static Allele C = Allele.create("C");
    final static Allele G = Allele.create("G");
    final static Allele T = Allele.create("T");
    final static Allele AA = Allele.create("AA");
    final static Allele AT = Allele.create("AT");
    final static Allele AG = Allele.create("AG");

    // global counter so every generated genotype gets a unique sample name
    static int sampleNameCounter = 0;

    final int nSamples;                         // number of diploid samples per VariantContext
    final int numAltAlleles;                    // number of alternate alleles to include
    final AFCalcFactory.Calculation modelType;  // which exact AF model makeModel() builds
    final PriorType priorType;                  // shape of the AF prior (flat or human)

    public AFCalcTestBuilder(final int nSamples, final int numAltAlleles,
                             final AFCalcFactory.Calculation modelType, final PriorType priorType) {
        this.nSamples = nSamples;
        this.numAltAlleles = numAltAlleles;
        this.modelType = modelType;
        this.priorType = priorType;
    }

    @Override
    public String toString() {
        return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType);
    }

    /** Supported allele-frequency prior shapes. */
    public enum PriorType {
        flat,
        human
    }

    public int getnSamples() {
        return nSamples;
    }

    /** Create the AFCalc instance under test for this configuration. */
    public AFCalc makeModel() {
        // NOTE(review): the 4, 4, 2 arguments are presumably max alt alleles
        // (SNPs/indels) and ploidy -- confirm against AFCalcFactory.createAFCalc
        return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2);
    }

    /**
     * Build the log10 prior vector over AC = 0 .. 2*nSamples.
     *
     * @return flat (normalized uniform) priors, or human-style priors computed
     *         by UnifiedGenotyperEngine with theta = 0.001
     */
    public double[] makePriors() {
        final int nPriorValues = 2*nSamples+1;

        switch ( priorType ) {
            case flat:
                return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
            case human:
                final double[] humanPriors = new double[nPriorValues];
                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
                return humanPriors;
            default:
                throw new RuntimeException("Unexpected type " + priorType);
        }
    }

    /** Convenience overload of makeACTest taking boxed AC values. */
    public VariantContext makeACTest(final List<Integer> ACs, final int nNonInformative, final int nonTypePL) {
        return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL);
    }

    /**
     * Build a VariantContext whose genotypes realize the requested per-allele
     * AC values, choosing het/hom-var counts in Hardy-Weinberg-like proportions.
     *
     * @param ACs             desired alt allele count for each alt allele
     * @param nNonInformative number of no-call, all-zero-PL samples to include
     * @param nonTypePL       PL assigned to every genotype other than the true one
     * @throws IllegalStateException if the het/hom-var decomposition cannot reproduce ACs
     */
    public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) {
        final int nChrom = nSamples * 2;

        final int[] nhet = new int[numAltAlleles];
        final int[] nhomvar = new int[numAltAlleles];

        for ( int i = 0; i < ACs.length; i++ ) {
            final double p = ACs[i] / (1.0 * nChrom); // allele frequency of alt i
            // hom-var count follows p^2 among informative samples; the remainder of AC is carried by hets
            nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p);
            nhet[i] = ACs[i] - 2 * nhomvar[i];

            if ( nhet[i] < 0 )
                throw new IllegalStateException("Bug! nhet[i] < 0");
        }

        // sanity check: the genotype mix must reproduce the requested total AC
        final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
        if ( calcAC != MathUtils.sum(ACs) )
            throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));

        return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL);
    }

    /**
     * Build a VariantContext from explicit per-allele het and hom-var genotype
     * counts, padded with hom-ref genotypes up to nSamples samples.
     */
    public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) {
        List<Genotype> samples = new ArrayList<Genotype>(nSamples);

        for ( int altI = 0; altI < nhet.length; altI++ ) {
            for ( int i = 0; i < nhet[altI]; i++ )
                samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
            for ( int i = 0; i < nhomvar[altI]; i++ )
                samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
        }

        final Genotype nonInformative = makeNonInformative();
        samples.addAll(Collections.nCopies(nNonInformative, nonInformative));

        // fill the remainder (if any) with hom-ref genotypes
        final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0);
        samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0)));

        // truncate any overshoot; after this call samples.size() == nSamples,
        // so the size check below can never fire (kept as a belt-and-braces guard)
        samples = samples.subList(0, nSamples);

        if ( samples.size() > nSamples )
            throw new IllegalStateException("too many samples");

        VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
        vcb.genotypes(samples);

        return vcb.make();
    }

    /** The reference allele plus the first numAltAlleles alternates from the fixed palette. */
    public List<Allele> getAlleles() {
        return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1);
    }

    /**
     * The two alleles of a diploid genotype of the given type, using alt allele
     * index altI (1-based; 0 is the reference).
     */
    public List<Allele> getAlleles(final GenotypeType type, final int altI) {
        switch (type) {
            case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
            case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
            case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
            default: throw new IllegalArgumentException("Unexpected type " + type);
        }
    }

    /** Make a genotype (with a fresh unique sample name) from explicit alleles and PL vector. */
    public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
        gb.alleles(expectedGT);
        gb.PL(pls);
        return gb.make();
    }

    // number of diploid genotype likelihoods for numAltAlleles alts plus the reference
    private int numPLs() {
        return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
    }

    /**
     * A no-call genotype with an all-zero PL vector.
     *
     * NOTE(review): the PL array is sized with numLikelihoods(numAltAlleles, 2),
     * not numPLs() (which uses numAltAlleles+1), so this vector is shorter than
     * the one informative genotypes carry. All-zero PLs presumably read as
     * non-informative regardless of length -- confirm the mismatch is intentional.
     */
    public Genotype makeNonInformative() {
        final int[] nonInformativePLs = new int[GenotypeLikelihoods.numLikelihoods(numAltAlleles, 2)];
        return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs);
    }

    /**
     * Make a genotype of the given type: every PL is nonTypePL except the PL of
     * the type's own genotype, which is 0 (i.e. the called genotype is most likely).
     *
     * @param type      HOM_REF, HET or HOM_VAR
     * @param nonTypePL PL value assigned to all other genotypes
     * @param altI      1-based alt allele index used by HET/HOM_VAR
     */
    public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
        gb.alleles(getAlleles(type, altI));

        final int[] pls = new int[numPLs()];
        Arrays.fill(pls, nonTypePL);

        // zero out the PL index of the called genotype
        int index = 0;
        switch ( type ) {
            case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
            case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
            case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
        }
        pls[index] = 0;
        gb.PL(pls);
        return gb.make();
    }
}

View File

@ -23,56 +23,55 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public class GeneralPloidyExactAFCalc extends ExactAFCalc {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
private final int ploidy;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
ploidy = UAC.samplePloidy;
this.UAC = UAC;
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
this.ploidy = ploidy;
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
@Override
protected VariantContext reduceScope(VariantContext vc) {
final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
// don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
if ( vc.getAlternateAlleles().size() > maxAltAlleles) {
logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
final List<Allele> alleles = new ArrayList<Allele>(maxAltAlleles + 1);
alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy));
GLs = subsetAlleles(vc, alleles, false, ploidy);
VariantContextBuilder builder = new VariantContextBuilder(vc);
builder.alleles(alleles);
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
return builder.make();
} else {
return vc;
}
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
return alleles;
}
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
return resultFromTracker(vc, log10AlleleFrequencyPriors);
}
/**
* Simple wrapper class to hold values of combined pool likelihoods.
@ -94,8 +93,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
public void add(ExactACset set) {
alleleCountSetList.add(set);
conformationMap.put(set.ACcounts, set);
final double likelihood = set.log10Likelihoods[0];
conformationMap.put(set.getACcounts(), set);
final double likelihood = set.getLog10Likelihoods()[0];
if (likelihood > maxLikelihood )
maxLikelihood = likelihood;
@ -108,11 +107,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
}
public double getLikelihoodOfConformation(int[] ac) {
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
}
public double getGLOfACZero() {
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
}
public int getLength() {
@ -136,7 +135,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
@ -171,15 +170,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param numAlleles Number of alternate alleles
* @param ploidyPerPool Number of samples per pool
* @param log10AlleleFrequencyPriors Frequency priors
* @param result object to fill with output values
* @param resultTracker object to fill with output values
*/
protected static void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final AFCalcResultTracker resultTracker) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
int combinedPloidy = 0;
@ -190,21 +189,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// first element: zero ploidy, e.g. trivial degenerate distribution
final int[] zeroCounts = new int[numAlleles];
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
set.log10Likelihoods[0] = 0.0;
set.getLog10Likelihoods()[0] = 0.0;
combinedPoolLikelihoods.add(set);
for (int p=1; p<genotypeLikelihoods.size(); p++) {
result.reset();
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, result);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
if ( genotypeLikelihoods.size() <= 1 ) {
// no meaningful GLs at all, just set the tracker to non poly values
resultTracker.reset(); // just mimic-ing call below
resultTracker.setLog10LikelihoodOfAFzero(0.0);
} else {
for (int p=1; p<genotypeLikelihoods.size(); p++) {
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, resultTracker);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
}
}
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final AFCalcResultTracker resultTracker) {
@ -221,23 +226,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated
MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
StateTracker stateTracker = new StateTracker();
while ( !ACqueue.isEmpty() ) {
resultTracker.incNEvaluations();
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLikelihoodSeen, ACqueue, indexesToACset);
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
// adjust max likelihood seen if needed
if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
maxLikelihoodSeen.update(log10LofKs, ACset.ACcounts);
if ( log10LofKs > stateTracker.getMaxLog10L())
stateTracker.update(log10LofKs, ACset.getACcounts());
// clean up memory
indexesToACset.remove(ACset.ACcounts);
indexesToACset.remove(ACset.getACcounts());
if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
}
return newPool;
@ -253,8 +259,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector
* @param result AFResult object
* @param maxLikelihoodSeen max likelihood observed so far
* @param resultTracker AFResult object
* @param stateTracker max likelihood observed so far
* @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue
* @return max log likelihood
@ -266,15 +272,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final AlleleFrequencyCalculationResult result,
final MaxLikelihoodSeen maxLikelihoodSeen,
final AFCalcResultTracker resultTracker,
final StateTracker stateTracker,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute likeihood in "set" of new set based on original likelihoods
final int numAlleles = set.ACcounts.counts.length;
final int numAlleles = set.getACcounts().getCounts().length;
final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
// add to new pool
@ -282,24 +288,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
newPool.add(set);
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
//if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLikelihoodSeen.maxLog10L);
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
return log10LofK;
}
// iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
final int ACwiggle = set.ACcounts.counts[0];
final int ACwiggle = set.getACcounts().getCounts()[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
// add conformations for other cases
for ( int allele = 1; allele < numAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++;
// is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@ -329,11 +335,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param ploidy2 Ploidy of second pool
* @param numAlleles Number of alleles
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object
* @param resultTracker Af calculation result object
*/
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final AFCalcResultTracker resultTracker) {
/*
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
@ -387,7 +393,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool
* @param result AFResult object
* @param resultTracker AFResult object
* @return log-likehood of requested conformation
*/
private static double computeLofK(final ExactACset set,
@ -395,7 +401,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2,
final AlleleFrequencyCalculationResult result) {
final AFCalcResultTracker resultTracker) {
final int newPloidy = ploidy1 + ploidy2;
@ -404,17 +410,17 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
if (newPloidy != totalAltK)
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
totalAltK -= set.ACcounts.counts[0];
totalAltK -= set.getACcounts().getCounts()[0];
// totalAltK has sum of alt alleles of conformation now
// special case for k = 0 over all k
if ( totalAltK == 0 ) { // all-ref case
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.log10Likelihoods[0] = log10Lof0;
set.getLog10Likelihoods()[0] = log10Lof0;
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return log10Lof0;
} else {
@ -423,12 +429,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
int[] currentCount = set.ACcounts.getCounts();
int[] currentCount = set.getACcounts().getCounts();
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
// for current conformation, get all possible ways to break vector K into two components G1 and G2
final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
while (innerIterator.hasNext()) {
// check if breaking current conformation into g1 and g2 is feasible.
final int[] acCount2 = innerIterator.getCurrentVector();
@ -444,27 +450,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
final double sum = firstGL + gl2 + num1 + num2;
set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
}
}
innerIterator.next();
}
set.log10Likelihoods[0] += denom;
set.getLog10Likelihoods()[0] += denom;
}
double log10LofK = set.log10Likelihoods[0];
double log10LofK = set.getLog10Likelihoods()[0];
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
result.updateMLEifNeeded(log10LofK, altCounts);
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
result.updateMAPifNeeded(log10LofK, altCounts);
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
return log10LofK;
}
@ -496,12 +502,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object
* @param resultTracker Af calculation result object
* @return Combined likelihood vector
*/
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final AFCalcResultTracker resultTracker) {
final int newPloidy = ploidy1 + ploidy2;
@ -526,8 +532,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
final double log10Lof0 = x[0]+y[0];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
double maxElement = log10Lof0;
int maxElementIdx = 0;
@ -569,8 +575,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
}
alleleCounts[0] = k;
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
}
@ -622,7 +628,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
if ( numOriginalAltAlleles == numNewAltAlleles) {
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
// and subsetting
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
newLikelihoods = originalLikelihoods;
} else {
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);

View File

@ -283,7 +283,7 @@ public class GenotypingEngine {
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
@ -308,9 +308,20 @@ public class GenotypingEngine {
}
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
}
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) {
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
// also, need to update the allele -> haplotype mapping
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
}
call = vcCallTrim;
alleleHashMap = alleleHashMapTrim;
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
}
}

View File

@ -237,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UnifiedArgumentCollection simpleUAC = UAC.clone();
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());

View File

@ -40,7 +40,6 @@ import java.util.*;
public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0);
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
private final byte constantGCP;
private final boolean DEBUG;
private final PairHMM pairHMM;
@ -184,7 +183,7 @@ public class LikelihoodCalculationEngine {
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
}
}
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
}
}
}
@ -323,11 +322,13 @@ public class LikelihoodCalculationEngine {
return bestHaplotypes;
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
//final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
@ -352,7 +353,7 @@ public class LikelihoodCalculationEngine {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
for( final Allele a : call.getFirst().getAlleles() )
likelihoodMap.add(read,a,0.0);
likelihoodMap.add(read, a, 0.0);
}
}

View File

@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
String name = String.format("Test-%s", params.bases);
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
}
}

View File

@ -1,9 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
import org.testng.annotations.Test;
/**
* Created by IntelliJ IDEA.
@ -60,31 +60,31 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","a636ae291a27843107294f3e7940b98a");
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","6d60d9f3dfe8e1580214be0d170b0fff");
}
@Test(enabled = true)
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","738fa68a3fc838b4bbad5c257f3e96fe");
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","30abf3c1868a61145edbc08fe35c8150");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9bcf1f2c204a251ee2b0b6f17ed59a61");
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","ef99bc0513d3267f43b84cb88a324376");
}
@Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","c73678eeaad574af9ed45045074828fa");
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","8ca07270717641385fe5d2e07e530782");
}
@Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() {
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","c32e10070e10d30d33e5b882c1f89413");
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d");
}
@Test(enabled = true)
public void testMT_SNP_GGA_sp10() {
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "d1b48f6f3a175fcba9aec6d427005a45");
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "f8ea18ec6a717a77fdf8c5f2482d8d8d");
}
}

View File

@ -0,0 +1,603 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class AFCalcUnitTest extends BaseTest {
// Shared alleles used across all tests: A is the reference, C and G are alternates.
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
// Monotonically increasing counter so every Genotype built by makePL() gets a unique sample name.
static int sampleNameCounter = 0;
// Bi-allelic (ref A, alt C) fixture genotypes; initialized in before().
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
// Tri-allelic (ref A, alts C and G) fixture genotypes; initialized in before().
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true); // flat priors
// Switches to include/exclude whole families of generated test cases.
final private static boolean INCLUDE_BIALLELIC = true;
final private static boolean INCLUDE_TRIALLELIC = true;
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
// When true, only the explicitly "debug" enabled tests run (used while bisecting failures).
final private static boolean DEBUG_ONLY = false;
@BeforeSuite
public void before() {
    // Bi-allelic fixtures: PL vectors are (AA, AC, CC) with 0 marking the called genotype.
    AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
    AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
    BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
    // All-zero PLs = completely uninformative sample (no genotype preferred).
    NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);

    // Tri-allelic fixtures: PL order is (AA, AC, CC, AG, CG, GG) -- the standard
    // diploid PL ordering for alleles A, C, G.
    AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
    AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
    BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
    AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
    BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
    CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
    NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
}
/**
 * Builds a Genotype with the given called alleles and PL vector, under a
 * freshly generated unique sample name ("sample0", "sample1", ...).
 *
 * @param expectedGT the alleles to assign to the genotype
 * @param pls        the phred-scaled likelihoods, in standard PL ordering
 * @return a new Genotype carrying the alleles and PLs
 */
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
    return new GenotypeBuilder("sample" + sampleNameCounter++)
            .alleles(expectedGT)
            .PL(pls)
            .make();
}
/**
 * One AF-calculation test case: a set of genotypes (GLs), the model under test,
 * the priors to apply, and the allele counts we expect the MLE to recover.
 * Instances register themselves with TestDataProvider so data providers can
 * collect them via GetGLsTest.getTests().
 */
private class GetGLsTest extends TestDataProvider {
    GenotypesContext GLs;           // genotypes fed to the model
    int numAltAlleles;              // number of ALT alleles in play (1 = bi-allelic)
    final AFCalc calc;              // the model under test
    final int[] expectedACs;        // expected allele count per allele, ref included at index 0
    final double[] priors;          // log10 AF priors passed to the model
    final String priorName;         // label ("flat"/"human"/...) used only in toString()
    private GetGLsTest(final AFCalc calc, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
        super(GetGLsTest.class);
        // Defensive copy: callers may mutate their list after construction (see the
        // Collections.rotate() use in makeGLsWithNonInformative).
        GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
        this.numAltAlleles = numAltAlleles;
        this.calc = calc;
        this.priors = priors;
        this.priorName = priorName;
        // Expected ACs are simply how often each allele appears among the called genotypes.
        expectedACs = new int[numAltAlleles+1];
        for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
            expectedACs[alleleI] = 0;
            final Allele allele = getAlleles().get(alleleI);
            for ( Genotype g : arg ) {
                expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
            }
        }
    }
    /** Runs the model under test on this case's VC and priors. */
    public AFCalcResult execute() {
        return getCalc().getLog10PNonRef(getVC(), getPriors());
    }
    /** Runs the EXACT_REFERENCE model on the same inputs, to compare against. */
    public AFCalcResult executeRef() {
        final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles());
        return ref.getLog10PNonRef(getVC(), getPriors());
    }
    public double[] getPriors() {
        return priors;
    }
    public AFCalc getCalc() {
        return calc;
    }
    /** Wraps the genotypes in a minimal single-base VariantContext at 1:1. */
    public VariantContext getVC() {
        VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
        builder.genotypes(GLs);
        return builder.make();
    }
    /** Fixed allele palette A(ref),C,G,T truncated to ref + numAltAlleles. */
    public List<Allele> getAlleles() {
        return Arrays.asList(Allele.create("A", true),
                Allele.create("C"),
                Allele.create("G"),
                Allele.create("T")).subList(0, numAltAlleles+1);
    }
    /** Expected AC for the alleleI-th ALT allele (0-based over ALTs only). */
    public int getExpectedAltAC(final int alleleI) {
        return expectedACs[alleleI+1];
    }
    public String toString() {
        return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
                priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
    }
}
/**
 * Data provider of well-formed GL test cases: every permutation (with repetition)
 * of the bi- and tri-allelic fixture genotypes, for 1-4 samples, under flat and
 * human-derived priors, across the three exact AF models.
 */
@DataProvider(name = "wellFormedGLs")
public Object[][] createSimpleGLsData() {
    final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
    final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);

    for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
        // Factory args after the model list are presumably (nSamples, maxAltAlleles, ploidy-related)
        // settings -- TODO confirm against AFCalcFactory.createAFCalcs.
        List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
                Arrays.asList(
                        AFCalcFactory.Calculation.EXACT_REFERENCE,
                        AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                        AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
                ), 4, 2, 2, 2);

        // Diploid: 2N+1 possible allele counts, hence 2N+1 prior values.
        final int nPriorValues = 2*nSamples+1;
        final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
        final double[] humanPriors = new double[nPriorValues];
        UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);

        for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
            for ( AFCalc model : calcs ) {
                final String priorName = priors == humanPriors ? "human" : "flat";

                // bi-allelic
                if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
                    for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 1, genotypes, priors, priorName);

                // tri-allelic -- human priors disabled until the referenced bug is fixed
                if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
                    for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 2, genotypes, priors, priorName);
            }
        }
    }

    return GetGLsTest.getTests(GetGLsTest.class);
}
/**
 * Data provider of a pathological GL configuration (duplicated hom-var samples
 * alongside a het) for the EXACT_INDEPENDENT model only.
 */
@DataProvider(name = "badGLs")
public Object[][] createBadGLs() {
    final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
    final int nSamples = genotypes.size();
    final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);

    // Flat priors over the 2N+1 possible diploid allele counts.
    final double[] flatPriors = MathUtils.normalizeFromLog10(new double[2 * nSamples + 1], true);
    // Registers itself with TestDataProvider as a side effect of construction.
    new GetGLsTest(indCalc, 2, genotypes, flatPriors, "flat");

    return GetGLsTest.getTests(GetGLsTest.class);
}
/** Runs the standard result checks on every bi-allelic (ref + one alt) configuration. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testBiallelicGLs(GetGLsTest cfg) {
    // Skip anything that is not exactly ref + 1 alt; those go to testTriallelicGLs.
    if ( cfg.getAlleles().size() != 2 )
        return;
    testResultSimple(cfg);
}
/** Runs the standard result checks on every multi-allelic (more than one alt) configuration. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testTriallelicGLs(GetGLsTest cfg) {
    // Skip bi-allelic cases; those are covered by testBiallelicGLs.
    if ( cfg.getAlleles().size() <= 2 )
        return;
    testResultSimple(cfg);
}
/** Runs the standard result checks on the pathological "badGLs" configurations. */
@Test(enabled = true, dataProvider = "badGLs")
public void testBadGLs(GetGLsTest cfg) {
    testResultSimple(cfg);
}
/**
 * Immutable holder pairing a set of informative (called) genotypes with the
 * non-informative genotype to pad them with, plus the alt allele count of the site.
 */
private static class NonInformativeData {
    final Genotype nonInformative;  // all-zero-PL genotype used as padding
    final List<Genotype> called;    // the informative genotypes
    final int nAltAlleles;          // number of ALT alleles at the site

    private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
        this.called = called;
        this.nonInformative = nonInformative;
        this.nAltAlleles = nAltAlleles;
    }
}
/**
 * Data provider pairing each "only informative samples" case with the same case
 * padded by 1/10/100 non-informative samples, across all models. The padded case
 * is rotated through every position to verify results are order-independent.
 */
@DataProvider(name = "GLsWithNonInformative")
public Object[][] makeGLsWithNonInformative() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));

    for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
        for ( final NonInformativeData testData : nonInformativeTests ) {
            final List<Genotype> samples = new ArrayList<Genotype>();
            samples.addAll(testData.called);
            samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));

            final int nSamples = samples.size();
            // Factory args after the model list are presumably sizing/ploidy settings --
            // TODO confirm against AFCalcFactory.createAFCalcs.
            List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
                    Arrays.asList(
                            AFCalcFactory.Calculation.EXACT_REFERENCE,
                            AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                            AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
                    ), 4, 2, 2, 2);

            final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors

            for ( AFCalc model : calcs ) {
                final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");

                for ( int rotation = 0; rotation < nSamples; rotation++ ) {
                    // Rotate in place; safe because GetGLsTest copies the list on construction.
                    Collections.rotate(samples, 1);
                    final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
                    tests.add(new Object[]{onlyInformative, withNonInformative});
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Asserts that padding a case with non-informative samples does not change the
 * AF calculation result relative to the informative-only case.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
    final AFCalcResult informativeOnlyResult = onlyInformative.execute();
    final AFCalcResult paddedResult = withNonInformative.execute();

    testResultSimple(withNonInformative);
    compareAFCalcResults(paddedResult, informativeOnlyResult, onlyInformative.getCalc(), true);
}
/**
 * Core assertions for a single test case: the model's result must agree with the
 * EXACT_REFERENCE model (posteriors only), report alleles drawn from the input
 * allele list, and recover the expected MLE allele counts.
 */
private void testResultSimple(final GetGLsTest cfg) {
    final AFCalcResult refResultTracker = cfg.executeRef();
    final AFCalcResult resultTracker = cfg.execute();
    compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);

    Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
    Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");

    for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
        int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
        int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI];
        // +1 skips the reference allele at index 0.
        final Allele allele = cfg.getAlleles().get(altAlleleI+1);

        Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
    }
}
/**
 * Compares two AFCalcResults field by field within a tolerance.
 *
 * @param actual                       result from the model under test
 * @param expected                     reference result to compare against
 * @param calc                         the model under test (used to pick the tolerance)
 * @param onlyPosteriorsShouldBeEqual  when true, skip the prior/likelihood checks and
 *                                     compare only posteriors, MLE ACs, and alleles
 */
private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) {
    // note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here
    final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results
    if ( ! onlyPosteriorsShouldBeEqual ) {
        Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
        Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0");
        Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0");
        Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0");
    }
    Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0");
    Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0");
    Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs");
    Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping");

    for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
        if ( ! a.isReference() ) {
            Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
            // TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
            //            if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
            //                // TODO -- delete when general ploidy works properly with multi-allelics
            //                Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
        }
    }
}
/**
 * Verifies the models remain numerically stable with enormous PL values:
 * three confident hom-var samples must yield an MLE AC of 6.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
public void testLargeGLs(final ExactAFCalc calc) {
    final Genotype hugeHomVar = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
    final GetGLsTest config = new GetGLsTest(calc, 1, Arrays.asList(hugeHomVar, hugeHomVar, hugeHomVar), FLAT_3SAMPLE_PRIORS, "flat");
    final int mleAlleleCount = config.execute().getAlleleCountsOfMLE()[0];
    Assert.assertEquals(mleAlleleCount, 6);
}
/**
 * Verifies behavior when samples have very different confidence levels at a
 * tri-allelic site: one strong A/C het and one weaker A/G het should each
 * contribute exactly one copy to the MLE allele counts.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
public void testMismatchedGLs(final ExactAFCalc calc) {
    final Genotype strongHetAC = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000);
    final Genotype weakHetAG = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100);
    final GetGLsTest config = new GetGLsTest(calc, 2, Arrays.asList(strongHetAC, weakHetAG), FLAT_3SAMPLE_PRIORS, "flat");
    final int[] mleACs = config.execute().getAlleleCountsOfMLE();
    Assert.assertEquals(mleACs[0], 1);
    Assert.assertEquals(mleACs[1], 1);
}
// --------------------------------------------------------------------------------
//
// Code to test that the pNonRef value is meaningful
//
// --------------------------------------------------------------------------------

/**
 * One pNonRef expectation: a genotype, the probability of a non-ref site we expect
 * the model to report, a tolerance, whether the case may be PL-scaled, and which
 * models are known to get this case wrong (and so skip it).
 */
private static class PNonRefData {
    final Genotype g;
    final double pNonRef, tolerance;
    final boolean canScale;   // whether scale() may produce derived cases from this one
    final List<AFCalcFactory.Calculation> badModels;
    final VariantContext vc;

    private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) {
        this(vc, g, pNonRef, tolerance, canScale, Collections.<AFCalcFactory.Calculation>emptyList());
    }

    private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List<AFCalcFactory.Calculation> badModels) {
        this.g = g;
        this.pNonRef = pNonRef;
        this.tolerance = tolerance;
        this.canScale = canScale;
        this.badModels = badModels;
        this.vc = vc;
    }

    /**
     * Derives a higher-confidence variant of this case: PLs grow with
     * log10(scaleFactor) while the expected pNonRef moves toward 0 or 1 by a
     * naive 1/scaleFactor estimate (see the provider's note on scaling naivety).
     * Returns this unchanged when canScale is false.
     */
    public PNonRefData scale(final int scaleFactor) {
        if ( canScale ) {
            final int[] PLs = new int[g.getPL().length];
            for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1);
            final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make();
            final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor);
            return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true);
        } else {
            return this;
        }
    }
}
/**
 * Data provider of pNonRef expectations: hand-computed posteriors for bi- and
 * tri-allelic single-genotype sites, optionally PL-scaled up to 1e5, combined
 * with 0-2 non-informative padding samples, for the reference and independent models.
 */
@DataProvider(name = "PNonRef")
public Object[][] makePNonRefTest() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<Allele> AA = Arrays.asList(A, A);
    final List<Allele> AC = Arrays.asList(A, C);
    final List<Allele> CC = Arrays.asList(C, C);
    final List<Allele> AG = Arrays.asList(A, G);
    final List<Allele> GG = Arrays.asList(G, G);
    final List<Allele> CG = Arrays.asList(C, G);

    final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
    final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
    final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;

    final List<AFCalcFactory.Calculation> constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED);

    final double TOLERANCE = 0.5;

    // Expected pNonRef values below are presumably hand-derived from the PLs and flat
    // priors -- TODO confirm derivation if they ever need updating.
    final List<PNonRefData> initialPNonRefData = Arrays.asList(
            // bi-allelic sites
            new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
            new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel),
            new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel),
            new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel),
            new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
            new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),

            // tri-allelic sites -- cannot scale because of the naivety of our scaling estimator
            new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit inaccurate
            new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false),
            new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false)
    );

    for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) {
        for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) {
            for ( final PNonRefData rootData : initialPNonRefData ) {
                for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) {
                    if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) {
                        final PNonRefData data = rootData.scale(plScale);
                        tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative});
                    }
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Asserts that the posterior P(AF > 0) reported by the model matches the
 * hand-computed pNonRef expectation within the case's tolerance.
 *
 * NOTE(review): this method was declared {@code private}; TestNG only runs
 * public test methods, so the test was silently skipped. Made public.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRef")
public void testPNonRef(final VariantContext vcRoot,
                        AFCalcFactory.Calculation modelType,
                        AFCalcTestBuilder.PriorType priorType,
                        final List<Genotype> genotypes,
                        final double expectedPNonRef,
                        final double tolerance,
                        final int nNonInformative) {
    final AFCalcTestBuilder testBuilder
            = new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType);

    final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot);
    vcb.genotypes(genotypes);

    final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());

    Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance,
            "Actual pNonRef not within tolerance " + tolerance + " of expected");
}
// --------------------------------------------------------------------------------
//
// Test priors
//
// --------------------------------------------------------------------------------

/**
 * Data provider yielding one AFCalc instance (2 samples, up to 4 alt alleles)
 * for every Calculation type that supports those parameters.
 */
@DataProvider(name = "Models")
public Object[][] makeModels() {
    List<Object[]> cases = new ArrayList<Object[]>();

    for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) {
        // Skip models that cannot handle 2 samples / 4 alt alleles.
        if ( ! calc.usableForParams(2, 4) )
            continue;
        cases.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)});
    }

    return cases.toArray(new Object[][]{});
}
/**
 * Sweeps the non-ref prior for a single het sample and checks that the posterior
 * P(AF > 0) tracks the analytically computed value, that strong posteriors imply
 * a polymorphic call, and that the MLE AC stays 1 regardless of the prior.
 *
 * NOTE(review): the annotation used bitwise {@code &} where every sibling test
 * uses {@code &&}; normalized for consistency (both are constant expressions
 * with identical value here).
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
public void testBiallelicPriors(final AFCalc model) {

    for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
        final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);

        for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) {
            final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
            // Priors vector is (ref, het, hom-var) in log10, normalized.
            final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true);
            GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
            final AFCalcResult resultTracker = cfg.execute();
            final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];

            // Analytic posterior: combine likelihoods and priors for ref vs het
            // (het prior split by the log10(0.5) genotype term).
            final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
            final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
            final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
            final double log10NonRefPost = Math.log10(nonRefPost);

            if ( ! Double.isInfinite(log10NonRefPost) )
                Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);

            if ( nonRefPost >= 0.9 )
                Assert.assertTrue(resultTracker.isPolymorphic(C, -1));

            final int expectedMLEAC = 1; // the MLE is independent of the prior
            Assert.assertEquals(actualAC, expectedMLEAC,
                    "actual AC with priors " + log10NonRefPrior + " not expected "
                            + expectedMLEAC + " priors " + Utils.join(",", priors));
        }
    }
}
// NOTE(review): a stray "@Test(enabled = true && ! DEBUG_ONLY, dataProvider = \"Models\")"
// annotation sat directly above this banner; Java attaches annotations through comments, so
// it applied to makePolyTestProvider() below, making TestNG treat the data provider itself
// as a test fed a single AFCalc argument the method cannot accept. Removed.

// --------------------------------------------------------------------------------
//
// Test that polymorphic sites (bi and tri) are properly called
//
// --------------------------------------------------------------------------------

/**
 * Data provider of calling scenarios: for each model, sweeps the AC of one or two
 * alt alleles and the sample count, and derives whether each allele should be
 * called polymorphic given the per-sample error probability implied by nonTypePLs.
 * Yields (testBuilder, ACs, nonTypePLs, expectedPolyFlags).
 */
@DataProvider(name = "polyTestProvider")
public Object[][] makePolyTestProvider() {
    List<Object[]> tests = new ArrayList<Object[]>();

    // list of all high-quality models in the system
    final List<AFCalcFactory.Calculation> models = Arrays.asList(
            AFCalcFactory.Calculation.EXACT,
            AFCalcFactory.Calculation.EXACT_REFERENCE,
            AFCalcFactory.Calculation.EXACT_INDEPENDENT);

    // note that we cannot use small PLs here or the thresholds are hard to set
    for ( final int nonTypePLs : Arrays.asList(100, 1000) ) {
        for ( final AFCalcFactory.Calculation model : models ) {
            for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
                for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
                    if ( nSamples < allele1AC ) continue;

                    // Expected-polymorphic heuristic: the allele's count must exceed the
                    // number of samples expected to carry it by error alone, and the total
                    // PL evidence must clear a confidence floor.
                    final double pPerSample = Math.pow(10, nonTypePLs / -10.0);
                    final double errorFreq = pPerSample * nSamples;
                    final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30;

                    // bi-allelic tests
                    {
                        final AFCalcTestBuilder testBuilder
                                = new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human);
                        final List<Integer> ACs = Arrays.asList(allele1AC);
                        tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)});
                    }

                    // multi-allelic tests -- kept small to bound runtime
                    for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) {
                        if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1)
                            continue;

                        final AFCalcTestBuilder testBuilder
                                = new AFCalcTestBuilder(nSamples, 2, model, AFCalcTestBuilder.PriorType.human);

                        final List<Integer> ACs = Arrays.asList(allele1AC, allele2AC);
                        final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90;
                        tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)});
                    }
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Driver for the general polymorphism-calling tests: binds the
 * "polyTestProvider" data and forwards each configuration to
 * {@link #testCalling}, which performs all assertions.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider")
public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
    testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
}
/**
 * Data provider for {@link #testCallingLotsOfAlleles}: quad-allelic sites with
 * every permutation (with repetition) of the candidate allele counts, run
 * against the EXACT_INDEPENDENT model only.
 */
@DataProvider(name = "polyTestProviderLotsOfAlleles")
public Object[][] makepolyTestProviderLotsOfAlleles() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    // only the independent-alleles model is exercised here
    final List<AFCalcFactory.Calculation> models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT);
    final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20);
    final int nonTypePLs = 1000;
    final int nAlleles = 4;

    for ( final AFCalcFactory.Calculation model : models ) {
        for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) {
            // an allele is expected to be polymorphic exactly when its requested AC is non-zero
            final List<Boolean> expectedPoly = new ArrayList<Boolean>(ACs.size());
            for ( final int ac : ACs )
                expectedPoly.add(ac > 0);

            final double totalAC = MathUtils.sum(ACs);
            for ( final int nSamples : Arrays.asList(1, 10, 100) ) {
                // not enough samples to carry the requested alleles
                if ( nSamples < totalAC )
                    continue;

                final AFCalcTestBuilder builder
                        = new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human);
                cases.add(new Object[]{builder, ACs, nonTypePLs, expectedPoly});
            }
        }
    }

    return cases.toArray(new Object[][]{});
}
/**
 * Driver for the many-alleles calling tests: binds the
 * "polyTestProviderLotsOfAlleles" data and forwards each configuration to
 * {@link #testCalling}, which performs all assertions.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles")
public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
    testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
}
/**
 * Shared workhorse for the calling tests: builds the model, priors, and
 * VariantContext from the supplied builder, runs the exact calculation, and
 * checks the MLE allele counts and per-allele polymorphic status against the
 * provided expectations.
 */
private void testCalling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly) {
    final AFCalc calc = testBuilder.makeModel();
    final double[] priors = testBuilder.makePriors();
    final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
    final AFCalcResult result = calc.getLog10PNonRef(vc, priors);

    // if any alt allele is expected to be polymorphic, the site as a whole
    // must show a confidently non-ref posterior
    if ( expectedPoly.contains(Boolean.TRUE) )
        Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1);

    // allele 0 is the reference; alts start at index 1
    final List<Allele> genotypingAlleles = result.getAllelesUsedInGenotyping();
    for ( int i = 0; i < genotypingAlleles.size() - 1; i++ ) {
        final Allele alt = genotypingAlleles.get(i + 1);
        // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
        Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
        Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i),
                "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
    }
}
}

View File

@ -0,0 +1,124 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Unit tests for {@link ConstrainedDiploidExactAFCalc}'s computation of the
 * maximum alt allele counts (max ACs) it needs to visit for a VariantContext.
 *
 * NOTE(review): TestNG only reliably runs public {@code @Test} methods; the
 * genotype-based test below was declared private and may have been silently
 * skipped, so it has been made public.
 */
public class ConstrainedAFCalculationModelUnitTest extends BaseTest {
    // fixed alleles shared by all tests; A is the reference allele
    static final Allele A = Allele.create("A", true);
    static final Allele C = Allele.create("C");
    static final Allele G = Allele.create("G");

    /** Builds a diploid genotype with the given PL vector via the shared test helper. */
    protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
        return AFCalcUnitTest.makePL(expectedGT, pls);
    }

    /**
     * Provides (nSamples, requested ACs, nNonInformative, model) tuples covering
     * bi- and tri-allelic sites over a range of informative/non-informative splits.
     */
    @DataProvider(name = "MaxACsToVisit")
    public Object[][] makeMaxACsToVisit() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        final int nSamples = 10;
        for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) {
            // only informative samples contribute chromosomes that can carry alt alleles
            final int nChrom = (nSamples - nNonInformative) * 2;
            for ( int i = 0; i < nChrom; i++ ) {
                // bi-allelic
                tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});

                // tri-allelic; the two alt ACs together stay within the chromosome count
                for ( int j = 0; j < (nChrom - i); j++)
                    tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Builds a VC with the requested ACs and checks that the constrained calc's
     * computed max ACs agree with the ACs actually present in the genotypes.
     */
    @Test(enabled = true, dataProvider = "MaxACsToVisit")
    public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) {
        final int nAlts = requestedACs.size();
        final AFCalcTestBuilder testBuilder
                = new AFCalcTestBuilder(nSamples, nAlts, modelType,
                AFCalcTestBuilder.PriorType.human);

        final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
        final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);

        testExpectedACs(vc, maxACsToVisit);
    }

    /** Asserts each computed max AC equals the called chromosome count of the matching alt allele. */
    private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
        // this is necessary because cannot ensure that the tester gives us back the
        // requested ACs due to rounding errors
        final List<Integer> ACs = new ArrayList<Integer>();
        for ( final Allele a : vc.getAlternateAlleles() )
            ACs.add(vc.getCalledChrCount(a));

        for ( int i = 0; i < maxACsToVisit.length; i++ ) {
            Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
        }
    }

    /**
     * Provides single-genotype VCs exercising hom-ref/het/hom-var and
     * non-informative (all-zero PL) configurations at bi- and tri-allelic sites.
     */
    @DataProvider(name = "MaxACsGenotypes")
    public Object[][] makeMaxACsForGenotype() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        final List<Allele> AA = Arrays.asList(A, A);
        final List<Allele> AC = Arrays.asList(A, C);
        final List<Allele> CC = Arrays.asList(C, C);
        final List<Allele> AG = Arrays.asList(A, G);
        final List<Allele> GG = Arrays.asList(G, G);
        final List<Allele> CG = Arrays.asList(C, G);

        final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
        final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();

        tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
        tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
        tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});

        // make sure non-informative => 0
        tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
        tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});

        // multi-allelics
        tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
        tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
        tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});

        // deal with non-informatives third alleles
        tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
        tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
        tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
        tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
        tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
        tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
        tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
        tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks computeMaxACs against a VC containing a single genotype.
     * Was private, which TestNG may silently skip; made public so it runs.
     */
    @Test(enabled = true, dataProvider = "MaxACsGenotypes")
    public void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
        final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
        final AFCalcTestBuilder testBuilder
                = new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED,
                AFCalcTestBuilder.PriorType.human);

        final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);

        testExpectedACs(vc, maxACsToVisit);
    }
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
@ -137,15 +138,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);

View File

@ -0,0 +1,210 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
// SEE private/R/pls.R if you want the truth output for these tests
/**
 * Unit tests for {@link IndependentAllelesDiploidExactAFCalc}: combining
 * multi-allelic PL vectors into per-allele bi-allelic PLs (with and without
 * dropped alleles), building allele-conditional contexts, and applying the
 * theta^N multi-allelic priors.
 *
 * NOTE(review): several {@code @Test} methods here were declared private;
 * TestNG only reliably runs public test methods, so they have been made
 * public to ensure they actually execute.
 */
public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
    /**
     * Provides (altIndex, nAlts, input genotype, expected combined genotype)
     * tuples for bi- and tri-allelic PL combination without dropped alleles.
     */
    @DataProvider(name = "TestCombineGLs")
    public Object[][] makeTestCombineGLs() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        // bi-allelic inputs pass through unchanged
        tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)});
        tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)});
        tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)});

        // AA AB BB AC BC CC => AA AB+BC CC
        tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)});
        tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)});
        tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
        tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
        tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)});
        tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)});
        tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
        tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
        tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)});
        tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)});
        tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)});
        tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)});
        tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)});
        tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)});
        tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
        tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
        tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)});
        tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)});

        return tests.toArray(new Object[][]{});
    }

    /**
     * Provides combination tuples where one allele may be dropped from the
     * genotype before combining; the expected PLs depend on the dropped set.
     */
    @DataProvider(name = "TestCombineGLsWithDrops")
    public Object[][] makeTestCombineGLsWithDrops() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        final Set<Integer> noDrops = Collections.emptySet();
        final Set<Integer> drop1 = Collections.singleton(1);
        final Set<Integer> drop2 = Collections.singleton(2);

        //           AA AB BB AC BC CC
        // drop1 (B): AA AC CC
        // drop2 (C): AA AB BB
        tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops});
        tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops});
        tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2});
        tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1});

        tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops});
        tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops});
        tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2});
        tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1});

        tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops});
        tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops});
        tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2});
        tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1});

        return tests.toArray(new Object[][]{});
    }

    /** Builds a no-call diploid genotype carrying the given PL vector. */
    private Genotype makePL(final int ... PLs) {
        return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
    }

    /**
     * Checks PL combination with no dropped alleles by delegating to
     * {@link #testCombineGLsWithDrops} with an empty drop set.
     * Was private, which TestNG may silently skip; made public so it runs.
     */
    @Test(enabled = true, dataProvider = "TestCombineGLs")
    public void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
        testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.<Integer>emptySet());
    }

    /**
     * Checks that combineGLs collapses the multi-allelic PLs of {@code testg}
     * (less the dropped alleles) into the expected bi-allelic PLs.
     * Was private, which TestNG may silently skip; made public so it runs.
     */
    @Test(enabled = true, dataProvider = "TestCombineGLsWithDrops")
    public void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set<Integer> allelesToDrop) {
        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
        final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts);

        Assert.assertEquals(combined.getPL(), expected.getPL(),
                "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
    }

    // fixed alleles shared by the context/prior tests; A is the reference allele
    static final Allele A = Allele.create("A", true);
    static final Allele C = Allele.create("C");
    static final Allele G = Allele.create("G");

    /**
     * Provides (multi-allelic VC, expected list of bi-allelic conditional VCs)
     * pairs, covering both allele orderings at a tri-allelic site.
     */
    @DataProvider(name = "TestMakeAlleleConditionalContexts")
    public Object[][] makeTestMakeAlleleConditionalContexts() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
        final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
        final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
        final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
        final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));

        final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
        final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
        final Genotype gACcombined = makePL(0, 2, 5);
        final Genotype gAGcombined = makePL(0, 4, 9);
        final Genotype gACdropped = makePL(0, 1, 2);
        final Genotype gAGdropped = makePL(0, 3, 5);

        // biallelic
        tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});

        // tri-allelic
        tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())});
        tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())});

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks that makeAlleleConditionalContexts splits a multi-allelic VC into
     * the expected per-allele bi-allelic VCs with matching alleles and PLs.
     * Still disabled (enabled = false), as in the original; made public so it
     * will actually run once re-enabled.
     */
    @Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts")
    public void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
        final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);

        Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size());

        for ( int i = 0; i < biAllelicVCs.size(); i++ ) {
            final VariantContext actual = biAllelicVCs.get(i);
            final VariantContext expected = expectedVCs.get(i);
            Assert.assertEquals(actual.getAlleles(), expected.getAlleles());

            for ( int j = 0; j < actual.getNSamples(); j++ )
                Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL());
        }
    }

    /**
     * Provides lists of per-allele log10 likelihoods (for ploidy 1..4) together
     * with the reference probability implied by the sampled log10 p(ref).
     */
    @DataProvider(name = "ThetaNTests")
    public Object[][] makeThetaNTests() {
        final List<Object[]> tests = new ArrayList<Object[]>();

        final List<Double> log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0);

        for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) {
            for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) {
                for ( List<Double> permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) {
                    tests.add(new Object[]{permutations, Math.pow(10, log10pRef)});
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Checks applyMultiAllelicPriors: the i-th prior should be the bi-allelic
     * non-ref prior raised to the (i+1)-th power (theta^N behavior), and the
     * results must come back sorted by decreasing posterior.
     */
    @Test(dataProvider = "ThetaNTests")
    public void testThetaNTests(final List<Double> log10LAlleles, final double pRef) {
        // biallelic prior: [p(ref), p(non-ref)] in log10 space
        final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef});
        final double log10pNonRef = Math.log10(1-pRef);

        final List<AFCalcResult> originalPriors = new LinkedList<AFCalcResult>();
        final List<Double> pNonRefN = new LinkedList<Double>();
        for ( int i = 0; i < log10LAlleles.size(); i++ ) {
            final double log10LAllele1 = log10LAlleles.get(i);
            final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
            final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
            originalPriors.add(result1);
            // expected theta^N prior for the (i+1)-th allele
            pNonRefN.add(log10pNonRef*(i+1));
        }

        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2);
        final List<AFCalcResult> thetaNPriors = calc.applyMultiAllelicPriors(originalPriors);

        double prevPosterior = 0.0;
        for ( int i = 0; i < log10LAlleles.size(); i++ ) {
            final AFCalcResult thetaN = thetaNPriors.get(i);

            // find the original result carrying the same alleles as the i-th theta^N result
            AFCalcResult orig = null;
            for ( final AFCalcResult x : originalPriors )
                if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping()))
                    orig = x;

            Assert.assertNotNull(orig, "couldn't find original AFCalc");

            Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6);
            Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6);

            Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0());
            prevPosterior = orig.getLog10PosteriorOfAFGT0();
        }
    }
}

View File

@ -21,17 +21,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "5b751474ad0aef4cdb53f094e605f97c");
HCTest(CEUTRIO_BAM, "", "a305107a5ec889152aa2efbe90b249d7");
}
@Test
public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "60efcd2d2722087e900f6365985d18bf");
HCTest(NA12878_BAM, "", "0c2217ec81f19790a6d1f98ebf8cf70d");
}
@Test
public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "71bec55320a2f07af0d54be9d7735322");
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "0396c7352ab8ab98b03dca36299a0ddf");
}
private void HCTestComplexVariants(String bam, String args, String md5) {
@ -42,7 +42,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(CEUTRIO_BAM, "", "f5a809e3fbd9998f79b75bb2973209e1");
HCTestComplexVariants(CEUTRIO_BAM, "", "2cfb7d830d5a7eb7bc754b5f688a27a5");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
@ -53,7 +53,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "8043b0451a4384e678a93600b34afce7");
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "d511848a46083c0d0b2495f65f162c2e");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
@ -64,21 +64,36 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "ea6539e05faf10ffaf76f2d16907c47a");
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("8d092b25f40456e618eef91fdce8adf0"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("7e112ea4623617f1f7f8f562f54aa2aa"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c29e61810c056b52a47baae0696931ea"));
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c642dcd93771f6f084d55de31f180d1b"));
executeTest("HCTestStructuralIndels: ", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
}

View File

@ -114,6 +114,9 @@ public class CommandLineGATK extends CommandLineExecutable {
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device";
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
private static void checkForMaskedUserErrors(final Throwable t) {
final String message = t.getMessage();
if ( message == null )
@ -133,9 +136,9 @@ public class CommandLineGATK extends CommandLineExecutable {
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
// disk is full
if ( message.contains("No space left on device") )
if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") )
if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
// masked out of memory error

View File

@ -140,6 +140,9 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
public boolean disableRandomization = false;
// --------------------------------------------------------------------------------------------------------------
//
// Downsampling Arguments

View File

@ -1,13 +1,12 @@
package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
/**
* Created with IntelliJ IDEA.
* User: rpoplin
@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection {
@Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 3;
/**
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter.
*/
@Advanced
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
}

View File

@ -124,6 +124,12 @@ public class BAMScheduler implements Iterator<FilePointer> {
*/
private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer();
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
// multiple contigs
filePointer.setIsMonolithic(true);
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
@ -88,6 +89,17 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
*/
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
/**
* How many FilePointers have we pulled from the filePointers iterator?
*/
private int totalFilePointersConsumed = 0;
/**
* Have we encountered a monolithic FilePointer?
*/
private boolean encounteredMonolithicFilePointer = false;
{
createNextContigFilePointer();
advance();
@ -167,6 +179,20 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
logger.info("Loading BAM index data for next contig");
while ( filePointers.hasNext() ) {
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
// it is the ONLY FilePointer we ever encounter
if ( encounteredMonolithicFilePointer ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
}
if ( filePointers.peek().isMonolithic() ) {
if ( totalFilePointersConsumed > 0 ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
}
encounteredMonolithicFilePointer = true;
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
}
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
if ( nextContigFilePointers.isEmpty() ||
@ -175,6 +201,7 @@ public class ExperimentalReadShardBalancer extends ShardBalancer {
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
nextContigFilePointers.add(filePointers.next());
totalFilePointersConsumed++;
}
else {
break; // next FilePointer is on a different contig or has different mapped/unmapped status,

View File

@ -50,10 +50,28 @@ public class FilePointer {
*/
protected final boolean isRegionUnmapped;
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*/
private boolean isMonolithic = false;
/**
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
*/
private Integer contigIndex = null;
public FilePointer( List<GenomeLoc> locations ) {
this.locations.addAll(locations);
this.isRegionUnmapped = checkUnmappedStatus();
validateLocations();
validateAllLocations();
if ( locations.size() > 0 ) {
contigIndex = locations.get(0).getContigIndex();
}
}
public FilePointer( final GenomeLoc... locations ) {
@ -80,8 +98,9 @@ public class FilePointer {
return foundUnmapped;
}
private void validateLocations() {
if ( isRegionUnmapped ) {
private void validateAllLocations() {
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
if ( isRegionUnmapped || isMonolithic ) {
return;
}
@ -89,13 +108,22 @@ public class FilePointer {
for ( GenomeLoc location : locations ) {
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("File pointers must contain intervals from at most one contig");
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
previousContigIndex = location.getContigIndex();
}
}
private void validateLocation( GenomeLoc location ) {
if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
}
if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
}
/**
* Returns an immutable view of this FilePointer's file spans
*
@ -123,6 +151,29 @@ public class FilePointer {
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
}
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
*/
public boolean isMonolithic() {
return isMonolithic;
}
/**
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
* FP may contain intervals from more than one contig.
*
* @param isMonolithic set this FP's monolithic status to this value
*/
public void setIsMonolithic( boolean isMonolithic ) {
this.isMonolithic = isMonolithic;
}
@Override
public boolean equals(final Object other) {
if(!(other instanceof FilePointer))
@ -151,15 +202,12 @@ public class FilePointer {
}
public void addLocation(final GenomeLoc location) {
this.locations.add(location);
checkUnmappedStatus();
validateLocations();
}
validateLocation(location);
public void addLocations( final List<GenomeLoc> locations ) {
this.locations.addAll(locations);
checkUnmappedStatus();
validateLocations();
this.locations.add(location);
if ( contigIndex == null ) {
contigIndex = location.getContigIndex();
}
}
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {

View File

@ -215,19 +215,29 @@ public class ReadShard extends Shard {
int start = Integer.MAX_VALUE;
int stop = Integer.MIN_VALUE;
String contig = null;
boolean foundMapped = false;
for ( final SAMRecord read : reads ) {
if ( contig != null && ! read.getReferenceName().equals(contig) )
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
contig = read.getReferenceName();
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
// with unmapped mates.
if ( ! read.getReadUnmappedFlag() ) {
foundMapped = true;
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
}
}
assert contig != null;
if ( contig.equals("*") ) // all reads are unmapped
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
return GenomeLoc.UNMAPPED;
else
return parser.createGenomeLoc(contig, start, stop);

View File

@ -1008,6 +1008,12 @@ public class SAMDataSource {
} catch ( SAMFormatException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
}
// Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
// Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
// just in case we want to change this behavior later.
catch ( RuntimeException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
}
reader.setSAMRecordFactory(factory);
reader.enableFileSource(true);
reader.setValidationStringency(validationStringency);

View File

@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.List;
@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool<RMDTrack,LocationAwareS
} else {
return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator());
}
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found");
} catch (IOException e) {
throw new ReviewedStingException("Unable to create iterator for rod named " + fileDescriptor.getName(),e);
}

View File

@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.filters;
import net.sf.samtools.SAMRecord;
/**
* Filter out reads with low mapping qualities.
* Filter out reads whose mate maps to a different contig.
*
* @author ebanks
* @version 0.1

View File

@ -186,7 +186,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
// that way we don't assume it's a specific type
final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file);
if ( fd == null )
throw new ReviewedStingException("Unexpectedly couldn't find valid codec for temporary output file " + file);
throw new UserException.LocalParallelizationProblem(file);
final FeatureCodec<VariantContext> codec = fd.getCodec();
final AbstractFeatureReader<VariantContext> source =

View File

@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -49,8 +50,12 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno
if ( perReadAlleleLikelihoodMap.size() == 0 )
return null;
for ( Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perReadAlleleLikelihoodMap.entrySet() )
depth += sample.getValue().getNumberOfStoredElements();
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = el.getKey();
depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
}
}
}
else
return null;

View File

@ -13,6 +13,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
@ -72,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
for ( PileupElement p : pileup ) {
if ( alleleCounts.containsKey(p.getBase()) )
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
}
// we need to add counts in the correct order
@ -91,12 +92,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
alleleCounts.put(allele, 0);
}
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
final GATKSAMRecord read = el.getKey();
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
if (a.isNoCall())
continue; // read is non-informative
if (!vc.getAlleles().contains(a))
continue; // sanity check - shouldn't be needed
alleleCounts.put(a,alleleCounts.get(a)+1);
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
}
final int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference());

View File

@ -38,6 +38,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -71,7 +72,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
}
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
// either SNP with no alignment context, or indels: per-read likelihood map needed
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount());
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
return pValueForBestTable(table, null);
}
else
@ -235,14 +236,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
* allele2 # #
* @return a 2x2 contingency table
*/
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap,
final Allele ref, final Allele alt) {
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
final Allele ref = vc.getReference();
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
int[][] table = new int[2][2];
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
if ( el.getKey().isReducedRead() ) // ignore reduced reads
continue;
final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true);
final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true);
@ -254,7 +254,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column]++;
final GATKSAMRecord read = el.getKey();
table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
}
}
@ -275,7 +276,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
for (PileupElement p : sample.getValue().getBasePileup()) {
if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads
if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions
continue;
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
@ -290,7 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
int row = matchesRef ? 0 : 1;
int column = isFW ? 0 : 1;
table[row][column]++;
table[row][column] += p.getRepresentativeCount();
}
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.MannWhitneyU;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
@ -30,6 +29,7 @@ import java.util.Map;
*/
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
private boolean useDithering = true;
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
if (refQuals.isEmpty() && altQuals.isEmpty())
return null;
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
for (final Double qual : altQuals) {
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
}
@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
}
/**
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
* engine randomization is turned off, and if so does not dither.
* @param walker
* @param toolkit
* @param headerLines
*/
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
useDithering = ! toolkit.getArguments().disableRandomization;
}
}

View File

@ -179,7 +179,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
int numReadGroups = 0;
for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() )
numReadGroups += header.getReadGroups().size();
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups);
recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG);
recalibrationEngine = initializeRecalibrationEngine();
recalibrationEngine.initialize(requestedCovariates, recalibrationTables);
@ -200,15 +200,15 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
}
}
private boolean readHasBeenSkipped(GATKSAMRecord read) {
private boolean readHasBeenSkipped( final GATKSAMRecord read ) {
return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE);
}
private boolean isLowQualityBase(GATKSAMRecord read, int offset) {
return read.getBaseQualities()[offset] < minimumQToUse;
private boolean isLowQualityBase( final PileupElement p ) {
return p.getQual() < minimumQToUse;
}
private boolean readNotSeen(GATKSAMRecord read) {
private boolean readNotSeen( final GATKSAMRecord read ) {
return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE);
}
@ -230,7 +230,7 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
final int offset = p.getOffset();
// This read has been marked to be skipped or base is low quality (we don't recalibrate low quality bases)
if (readHasBeenSkipped(read) || isLowQualityBase(read, offset))
if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) )
continue;
if (readNotSeen(read)) {

View File

@ -182,6 +182,10 @@ public class RecalibrationArgumentCollection {
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
public String FORCE_PLATFORM = null;
@Hidden
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
public File existingRecalibrationReport = null;
public GATKReportTable generateReportTable(final String covariateNames) {

View File

@ -1,231 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * The model representing how we calculate a genotype given the priors and a pile
 * of bases and quality scores.
 *
 * Concrete subclasses implement the allele-frequency likelihood calculation
 * (getLog10PNonRef) and allele subsetting (subsetAlleles); this base class holds
 * the shared configuration plus the helper data structures used by the exact model.
 */
public abstract class AlleleFrequencyCalculationModel implements Cloneable {

    /** The available allele-frequency calculation models. */
    public enum Model {
        /** The default model with the best performance in all cases */
        EXACT
    }

    // NOTE(review): presumably the number of chromosomes in the cohort (2 * samples for
    // diploids) -- confirm against the callers that construct this model
    protected int N;
    // cap on the number of alternate alleles to genotype (copied from UAC.MAX_ALTERNATE_ALLELES)
    protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE;
    // if true, indel sites use a tighter alternate-allele cap (copied from UAC)
    protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
    protected Logger logger;
    // optional destination for verbose per-site output
    protected PrintStream verboseWriter;

    // sentinel meaning "this likelihood/posterior has not been calculated yet"
    protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;

    /**
     * @param UAC           source of MAX_ALTERNATE_ALLELES and CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS
     * @param N             see the field of the same name
     * @param logger        logger used by subclasses for warnings/diagnostics
     * @param verboseWriter optional stream for verbose output
     */
    protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) {
        this.N = N;
        this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES;
        this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
    }

    /**
     * Wrapper class that compares two likelihoods associated with two alleles.
     * Note that compareTo sorts in DESCENDING order of sum (largest sums first).
     */
    protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
        public double sum = 0.0;
        public Allele allele;

        public LikelihoodSum(Allele allele) { this.allele = allele; }

        public int compareTo(LikelihoodSum other) {
            final double diff = sum - other.sum;
            return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0;
        }
    }

    /**
     * Unpack GenotypesContext into an ArrayList of double[] GL vectors.
     * The first entry is a dummy zero vector, so real samples start at index 1.
     * Genotypes whose GL sum is at or above SUM_GL_THRESH_NOCALL are skipped.
     *
     * @param GLs Input genotype context
     * @return ArrayList of doubles corresponding to GL vectors
     */
    protected static ArrayList<double[]> getGLs(GenotypesContext GLs) {
        ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
        genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy first entry; samples are 1-indexed
        for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
            if ( sample.hasLikelihoods() ) {
                double[] gls = sample.getLikelihoods().getAsVector();
                if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
                    genotypeLikelihoods.add(gls);
            }
        }
        return genotypeLikelihoods;
    }

    /**
     * Must be overridden by concrete subclasses.
     * @param vc variant context with alleles and genotype likelihoods
     * @param log10AlleleFrequencyPriors priors
     * @param result (pre-allocated) object to store likelihoods results
     * @return the alleles used for genotyping
     */
    protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
                                                    final double[] log10AlleleFrequencyPriors,
                                                    final AlleleFrequencyCalculationResult result);

    /**
     * Must be overridden by concrete subclasses.
     * @param vc variant context with alleles and genotype likelihoods
     * @param allelesToUse alleles to subset
     * @param assignGenotypes whether to assign genotypes based on the subset likelihoods
     * @param ploidy ploidy to use for the subsetting
     * @return GenotypesContext object
     */
    protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
                                                      final List<Allele> allelesToUse,
                                                      final boolean assignGenotypes,
                                                      final int ploidy);

    // -------------------------------------------------------------------------------------
    //
    // protected classes used to store exact model matrix columns
    //
    // -------------------------------------------------------------------------------------

    protected static final int HOM_REF_INDEX = 0;  // AA likelihoods are always first

    // a wrapper around the int array so that we can make it hashable
    protected static final class ExactACcounts {

        protected final int[] counts;
        // lazily computed hash cache; -1 means "not yet computed"
        private int hashcode = -1;

        public ExactACcounts(final int[] counts) {
            this.counts = counts;
        }

        public int[] getCounts() {
            return counts;
        }

        @Override
        public boolean equals(Object obj) {
            return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts);
        }

        @Override
        public int hashCode() {
            if ( hashcode == -1 )
                hashcode = Arrays.hashCode(counts);
            return hashcode;
        }

        // e.g. counts {1,2,0} renders as "1/2/0"
        @Override
        public String toString() {
            StringBuffer sb = new StringBuffer();
            sb.append(counts[0]);
            for ( int i = 1; i < counts.length; i++ ) {
                sb.append("/");
                sb.append(counts[i]);
            }
            return sb.toString();
        }
    }

    // This class represents a column in the Exact AC calculation matrix
    protected static final class ExactACset {

        // the counts of the various alternate alleles which this column represents
        final ExactACcounts ACcounts;

        // the column of the matrix; initialized to log10(0) = -infinity
        final double[] log10Likelihoods;

        // cached sum of the non-reference allele counts; -1 means "not yet computed"
        int sum = -1;

        public ExactACset(final int size, final ExactACcounts ACcounts) {
            this.ACcounts = ACcounts;
            log10Likelihoods = new double[size];
            Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
        }

        // sum of all the non-reference alleles
        public int getACsum() {
            if ( sum == -1 ) {
                sum = 0;
                for ( int count : ACcounts.getCounts() )
                    sum += count;
            }
            return sum;
        }

        // NOTE(review): equals is overridden without a matching hashCode, so instances
        // must not be used as hash keys (ACcounts is the hashable wrapper for that purpose)
        public boolean equals(Object obj) {
            return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts);
        }
    }

    // Tracks the maximum log10 likelihood seen so far and the AC conformation it came from
    protected static final class MaxLikelihoodSeen {
        double maxLog10L = Double.NEGATIVE_INFINITY;
        ExactACcounts ACs = null;

        public MaxLikelihoodSeen() {}

        public void update(final double maxLog10L, final ExactACcounts ACs) {
            this.maxLog10L = maxLog10L;
            this.ACs = ACs;
        }

        // returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
        public boolean isLowerAC(final ExactACcounts otherACs) {
            final int[] myACcounts = this.ACs.getCounts();
            final int[] otherACcounts = otherACs.getCounts();
            for ( int i = 0; i < myACcounts.length; i++ ) {
                if ( myACcounts[i] > otherACcounts[i] )
                    return false;
            }
            return true;
        }
    }
}

View File

@ -1,150 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
 * Created by IntelliJ IDEA.
 * User: ebanks
 * Date: Dec 14, 2011
 *
 * Useful helper class to communicate the results of the allele frequency calculation.
 * Holds the running MLE/MAP (and their allele counts) plus a bounded cache of the
 * log10 posteriors seen so far; call reset() before reusing for a new site.
 */
public class AlleleFrequencyCalculationResult {

    // These variables are intended to contain the MLE and MAP (and their corresponding
    // allele counts) of the site over all alternate alleles
    private double log10MLE;
    private double log10MAP;
    private final int[] alleleCountsOfMLE;
    private final int[] alleleCountsOfMAP;

    // The posteriors seen, not including that of AF=0
    private static final int POSTERIORS_CACHE_SIZE = 5000;
    private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
    private int currentPosteriorsCacheIndex = 0;
    // lazily computed log10 sum of the cached posteriors; null until first requested
    private Double log10PosteriorMatrixSum = null;

    // These variables are intended to contain the likelihood/posterior probability
    // for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
    private double log10LikelihoodOfAFzero;
    private double log10PosteriorOfAFzero;

    /**
     * @param maxAltAlleles maximum number of alternate alleles; sizes the MLE/MAP count arrays
     */
    public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
        alleleCountsOfMLE = new int[maxAltAlleles];
        alleleCountsOfMAP = new int[maxAltAlleles];
        reset();
    }

    public double getLog10MLE() {
        return log10MLE;
    }

    public double getLog10MAP() {
        return log10MAP;
    }

    /**
     * Lazily computes (and caches) the log10 sum of all posteriors seen so far,
     * excluding the AF=0 posterior which is tracked separately.
     */
    public double getLog10PosteriorsMatrixSumWithoutAFzero() {
        if ( log10PosteriorMatrixSum == null ) {
            log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
        }
        return log10PosteriorMatrixSum;
    }

    public int[] getAlleleCountsOfMLE() {
        return alleleCountsOfMLE;
    }

    public int[] getAlleleCountsOfMAP() {
        return alleleCountsOfMAP;
    }

    public double getLog10LikelihoodOfAFzero() {
        return log10LikelihoodOfAFzero;
    }

    public double getLog10PosteriorOfAFzero() {
        return log10PosteriorOfAFzero;
    }

    /**
     * Restores this object to its freshly-constructed state so it can be reused.
     */
    public void reset() {
        log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
        for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
            alleleCountsOfMLE[i] = 0;
            alleleCountsOfMAP[i] = 0;
        }
        currentPosteriorsCacheIndex = 0;
        log10PosteriorMatrixSum = null;
    }

    /**
     * Records the given likelihood/allele-count pair as the new MLE if it beats the current MLE.
     */
    public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
        if ( log10LofK > log10MLE ) {
            log10MLE = log10LofK;
            for ( int i = 0; i < alleleCountsForK.length; i++ )
                alleleCountsOfMLE[i] = alleleCountsForK[i];
        }
    }

    /**
     * Adds the posterior to the cache, and records it as the new MAP if it beats the current MAP.
     */
    public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
        addToPosteriorsCache(log10LofK);
        if ( log10LofK > log10MAP ) {
            log10MAP = log10LofK;
            for ( int i = 0; i < alleleCountsForK.length; i++ )
                alleleCountsOfMAP[i] = alleleCountsForK[i];
        }
    }

    private void addToPosteriorsCache(final double log10LofK) {
        // add to the cache
        log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;

        // if we've filled up the cache, then condense by summing up all of the values
        // and placing the sum back into the first cell
        if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
            final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
            log10PosteriorMatrixValues[0] = temporarySum;
            currentPosteriorsCacheIndex = 1;
        }
    }

    /**
     * Records the AF=0 likelihood; also promotes it to the MLE (with zeroed allele counts)
     * if it exceeds the current MLE.
     */
    public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
        this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
        if ( log10LikelihoodOfAFzero > log10MLE ) {
            log10MLE = log10LikelihoodOfAFzero;
            Arrays.fill(alleleCountsOfMLE, 0);
        }
    }

    /**
     * Records the AF=0 posterior; also promotes it to the MAP (with zeroed allele counts)
     * if it exceeds the current MAP.
     */
    public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
        this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
        if ( log10PosteriorOfAFzero > log10MAP ) {
            log10MAP = log10PosteriorOfAFzero;
            Arrays.fill(alleleCountsOfMAP, 0);
        }
    }
}

View File

@ -1,481 +0,0 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

    // private final static boolean DEBUG = false;

    // Pruning threshold for the conformation search: once a conformation's log10 likelihood falls
    // this far below the best conformation seen so far, higher-AC conformations stop being explored.
    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6

    protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
        super(UAC, N, logger, verboseWriter);
    }

    /**
     * Runs the multi-allelic exact allele-frequency calculation for this variant context,
     * writing likelihoods/posteriors into {@code result}.
     *
     * If the context carries more alternate alleles than this model is willing to genotype,
     * the most likely alternates are selected first (see {@link #chooseMostLikelyAlternateAlleles})
     * and the genotype likelihoods are re-subset to the reduced allele list before the exact
     * calculation runs.
     *
     * @param vc                         variant context holding the per-sample genotype likelihoods
     * @param log10AlleleFrequencyPriors log10 priors indexed by total alternate allele count
     * @param result                     output container, mutated in place
     * @return the alleles actually used in genotyping (reference first; possibly a subset of vc's alleles)
     */
    public List<Allele> getLog10PNonRef(final VariantContext vc,
                                        final double[] log10AlleleFrequencyPriors,
                                        final AlleleFrequencyCalculationResult result) {
        GenotypesContext GLs = vc.getGenotypes();
        List<Allele> alleles = vc.getAlleles();

        // indel contexts may be capped at 2 alternate alleles (see CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS)
        final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE;

        // don't try to genotype too many alternate alleles
        if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
            logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

            alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
            alleles.add(vc.getReference());
            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));

            // recompute the genotype likelihoods against the reduced allele list
            GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false);
        }

        linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);

        return alleles;
    }

    // index of the hom-ref genotype in the canonical PL ordering
    private static final int PL_INDEX_OF_HOM_REF = 0;

    /**
     * Picks the {@code numAllelesToChoose} alternate alleles with the greatest summed likelihood
     * support across samples.
     *
     * For each sample whose best genotype is not hom-ref, the improvement of that genotype's
     * likelihood over hom-ref is credited to the allele(s) making up the genotype.  Alleles are
     * ranked by their accumulated totals; the winners are returned in the relative order they
     * had in {@code vc}, NOT in likelihood order.
     */
    private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
        for ( int i = 0; i < numOriginalAltAlleles; i++ )
            likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

        // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
        final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
        for ( final double[] likelihoods : GLs ) {
            final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
            if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
                GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
                // allele index 0 is the reference, so subtract one to index into likelihoodSums
                if ( alleles.alleleIndex1 != 0 )
                    likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];

                // don't double-count it
                if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
                    likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
            }
        }

        // sort them by probability mass and choose the best ones
        // (Arrays.asList is an array-backed view, so this sort reorders likelihoodSums itself;
        //  LikelihoodSum.compareTo presumably orders highest-mass first -- TODO confirm)
        Collections.sort(Arrays.asList(likelihoodSums));
        final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( int i = 0; i < numAllelesToChoose; i++ )
            bestAlleles.add(likelihoodSums[i].allele);

        // re-emit the chosen alleles in the order they appeared in vc
        final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( bestAlleles.contains(allele) )
                orderedBestAlleles.add(allele);
        }

        return orderedBestAlleles;
    }

    // -------------------------------------------------------------------------------------
    //
    // Multi-allelic implementation.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Exact model over allele-count (AC) conformations, processed breadth-first starting from
     * AC=0.  Each conformation's likelihood is seeded by pushes from its lower-AC dependencies;
     * conformations whose likelihood falls MAX_LOG10_ERROR_TO_STOP_EARLY below the best one seen
     * are pruned (no higher-AC successors enqueued from them).
     *
     * @param GLs                        per-sample genotype likelihoods
     * @param numAlternateAlleles        number of alternate alleles in play
     * @param log10AlleleFrequencyPriors log10 priors indexed by total allele count
     * @param result                     output container, mutated in place
     */
    public static void linearExactMultiAllelic(final GenotypesContext GLs,
                                               final int numAlternateAlleles,
                                               final double[] log10AlleleFrequencyPriors,
                                               final AlleleFrequencyCalculationResult result) {
        // getGLs appears to return a list with an unused entry at index 0, hence size()-1 samples
        // -- NOTE(review): confirm against getGLs in the superclass
        final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples;

        // queue of AC conformations to process
        final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();

        // mapping of ExactACset indexes to the objects
        final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);

        // add AC=0 to the queue
        int[] zeroCounts = new int[numAlternateAlleles];
        ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
        ACqueue.add(zeroSet);
        indexesToACset.put(zeroSet.ACcounts, zeroSet);

        // keep processing while we have AC conformations that need to be calculated
        MaxLikelihoodSeen maxLikelihoodSeen = new MaxLikelihoodSeen();
        while ( !ACqueue.isEmpty() ) {
            // compute log10Likelihoods
            final ExactACset set = ACqueue.remove();

            final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);

            // adjust max likelihood seen if needed
            if ( log10LofKs > maxLikelihoodSeen.maxLog10L )
                maxLikelihoodSeen.update(log10LofKs, set.ACcounts);

            // clean up memory: this conformation's dependents have all been enqueued by now
            indexesToACset.remove(set.ACcounts);
            //if ( DEBUG )
            //    System.out.printf(" *** removing used set=%s%n", set.ACcounts);
        }
    }

    // Pairs a successor AC conformation (as raw counts) with the PL index through which the
    // current conformation contributes to it.
    private static final class DependentSet {
        public final int[] ACcounts;
        public final int PLindex;

        public DependentSet(final int[] ACcounts, final int PLindex) {
            this.ACcounts = ACcounts;
            this.PLindex = PLindex;
        }
    }

    /**
     * Computes the log10 likelihood of the given AC conformation and, unless the early-abort
     * pruning kicks in, enqueues all of its k+1 and k+2 successor conformations.
     *
     * @return the log10 likelihood of this conformation (last cell of its DP column)
     */
    private static double calculateAlleleCountConformation(final ExactACset set,
                                                           final ArrayList<double[]> genotypeLikelihoods,
                                                           final MaxLikelihoodSeen maxLikelihoodSeen,
                                                           final int numChr,
                                                           final LinkedList<ExactACset> ACqueue,
                                                           final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                           final double[] log10AlleleFrequencyPriors,
                                                           final AlleleFrequencyCalculationResult result) {
        //if ( DEBUG )
        //    System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);

        // compute the log10Likelihoods
        computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result);

        final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // can we abort early because the log10Likelihoods are so small?
        if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) {
            //if ( DEBUG )
            //    System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            return log10LofK;
        }

        // iterate over higher frequencies if possible
        final int ACwiggle = numChr - set.getACsum();
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        final int numAltAlleles = set.ACcounts.getCounts().length;

        // add conformations for the k+1 case
        for ( int allele = 0; allele < numAltAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            // to get to this conformation, a sample would need to be AB (remember that ref=0)
            final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
            updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
        if ( ACwiggle > 1 ) {
            final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
            final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

            for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
                for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                    final int[] ACcountsClone = set.ACcounts.getCounts().clone();
                    ACcountsClone[allele_i]++;
                    ACcountsClone[allele_j]++;

                    // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
                    final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
                    if ( allele_i == allele_j )
                        sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
                    else
                        differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
                }
            }

            // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
            for ( DependentSet dependent : differentAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
            for ( DependentSet dependent : sameAlleles )
                updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        return log10LofK;
    }

    // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
    // also pushes its value to the given callingSetIndex.
    private static void updateACset(final int[] newSetCounts,
                                    final int numChr,
                                    final ExactACset dependentSet,
                                    final int PLsetIndex,
                                    final Queue<ExactACset> ACqueue,
                                    final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                    final ArrayList<double[]> genotypeLikelihoods) {
        final ExactACcounts index = new ExactACcounts(newSetCounts);
        if ( !indexesToACset.containsKey(index) ) {
            // DP column has one cell per sample count (numChr/2 samples) plus the zero row
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
        }

        // push data from the dependency to the new set
        //if ( DEBUG )
        //    System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
        pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
    }

    /**
     * Finalizes the DP column for this conformation: folds in the hom-ref (AA) contribution,
     * which depends on previous cells of the same column, applies the per-sample denominator,
     * and reports MLE/MAP candidates to {@code result}.
     *
     * The non-AA contributions were already accumulated by {@link #pushData} from lower-AC sets.
     */
    private static void computeLofK(final ExactACset set,
                                    final ArrayList<double[]> genotypeLikelihoods,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
        set.log10Likelihoods[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

        // special case for k = 0 over all k: every sample is hom-ref
        if ( totalK == 0 ) {
            for ( int j = 1; j < set.log10Likelihoods.length; j++ )
                set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];

            final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
            result.setLog10LikelihoodOfAFzero(log10Lof0);
            result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
            return;
        }

        // if we got here, then k > 0 for at least one k.
        // the non-AA possible conformations were already dealt with by pushes from dependent sets;
        // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
        for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {

            // the AA term only exists when there are at least 2 ref chromosomes among the 2j (i.e. totalK < 2j-1)
            if ( totalK < 2*j-1 ) {
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
                set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
            }

            // normalize by the 2j * (2j-1) denominator of the exact-model recurrence
            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
            set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
        }

        double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

        // update the MLE if necessary
        result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);

        // apply the priors over each alternate allele
        for ( final int ACcount : set.ACcounts.getCounts() ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }
        result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
    }

    /**
     * Accumulates into {@code targetSet}'s DP column the contribution that flows from
     * {@code dependentSet} (a lower-AC conformation) through the genotype at {@code PLsetIndex}.
     */
    private static void pushData(final ExactACset targetSet,
                                 final ExactACset dependentSet,
                                 final int PLsetIndex,
                                 final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();

        for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) {
            if ( totalK <= 2*j ) { // skip impossible conformations
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue =
                        determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex];
                targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue);
            }
        }
    }

    /**
     * Returns the log10 combinatorial coefficient for the genotype at {@code PLindex} in the
     * exact-model recurrence (closed forms listed below).
     */
    private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
        // the closed form representation generalized for multiple alleles is as follows:
        // AA: (2j - totalK) * (2j - totalK - 1)
        // AB: 2k_b * (2j - totalK)
        // AC: 2k_c * (2j - totalK)
        // BB: k_b * (k_b - 1)
        // BC: 2 * k_b * k_c
        // CC: k_c * (k_c - 1)

        // find the 2 alleles that are represented by this PL index
        GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);

        // *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
        // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does.   ***

        // the AX het case
        if ( alleles.alleleIndex1 == 0 )
            return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];

        final int k_i = ACcounts[alleles.alleleIndex1-1];

        // the hom var case (e.g. BB, CC, DD)
        final double coeff;
        if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
            coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
        }
        // the het non-ref case (e.g. BC, BD, CD)
        else {
            final int k_j = ACcounts[alleles.alleleIndex2-1];
            coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
        }
        return coeff;
    }

    /**
     * Subsets this vc's genotypes down to allelesToUse.
     *
     * NOTE(review): the {@code ploidy} argument is ignored -- the delegate performs diploid
     * subsetting only.  Confirm callers never pass ploidy != 2 through this model.
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
    }

    // -------------------------------------------------------------------------------------
    //
    // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity.
    //
    // -------------------------------------------------------------------------------------

    /**
     * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
     * for the exact model calculation
     */
    /*
    private final static class ExactACCache {
        double[] kMinus2, kMinus1, kMinus0;

        private final static double[] create(int n) {
            return new double[n];
        }

        public ExactACCache(int n) {
            kMinus2 = create(n);
            kMinus1 = create(n);
            kMinus0 = create(n);
        }

        final public void rotate() {
            double[] tmp = kMinus2;
            kMinus2 = kMinus1;
            kMinus1 = kMinus0;
            kMinus0 = tmp;
        }

        final public double[] getkMinus2() {
            return kMinus2;
        }

        final public double[] getkMinus1() {
            return kMinus1;
        }

        final public double[] getkMinus0() {
            return kMinus0;
        }
    }

    public int linearExact(GenotypesContext GLs,
                           double[] log10AlleleFrequencyPriors,
                           double[][] log10AlleleFrequencyLikelihoods,
                           double[][] log10AlleleFrequencyPosteriors) {
        final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples;
        final ExactACCache logY = new ExactACCache(numSamples+1);
        logY.getkMinus0()[0] = 0.0; // the zero case

        double maxLog10L = Double.NEGATIVE_INFINITY;
        boolean done = false;
        int lastK = -1;

        for (int k=0; k <= numChr && ! done; k++ ) {
            final double[] kMinus0 = logY.getkMinus0();

            if ( k == 0 ) { // special case for k = 0
                for ( int j=1; j <= numSamples; j++ ) {
                    kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
                }
            } else { // k > 0
                final double[] kMinus1 = logY.getkMinus1();
                final double[] kMinus2 = logY.getkMinus2();

                for ( int j=1; j <= numSamples; j++ ) {
                    final double[] gl = genotypeLikelihoods.get(j);
                    final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];

                    double aa = Double.NEGATIVE_INFINITY;
                    double ab = Double.NEGATIVE_INFINITY;
                    if (k < 2*j-1)
                        aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];

                    if (k < 2*j)
                        ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];

                    double log10Max;
                    if (k > 1) {
                        final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
                        log10Max = approximateLog10SumLog10(aa, ab, bb);
                    } else {
                        // we know we aren't considering the BB case, so we can use an optimized log10 function
                        log10Max = approximateLog10SumLog10(aa, ab);
                    }

                    // finally, update the L(j,k) value
                    kMinus0[j] = log10Max - logDenominator;
                }
            }

            // update the posteriors vector
            final double log10LofK = kMinus0[numSamples];
            log10AlleleFrequencyLikelihoods[0][k] = log10LofK;
            log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k];

            // can we abort early?
            lastK = k;
            maxLog10L = Math.max(maxLog10L, log10LofK);
            if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
                //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
                done = true;
            }

            logY.rotate();
        }

        return lastK;
    }

    final static double approximateLog10SumLog10(double a, double b, double c) {
        return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
    }
    */
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -41,7 +42,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.EXACT;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@ -75,10 +76,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
@Hidden
@Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
// indel-related arguments
/**
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
@ -160,7 +157,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
*/
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
@Hidden
@Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false)
@ -186,7 +183,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
@ -212,7 +208,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
uac.alleles = alleles;
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
uac.MAX_ALTERNATE_ALLELES_FOR_INDELS = MAX_ALTERNATE_ALLELES_FOR_INDELS;
uac.GLmodel = GLmodel;
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
uac.referenceSampleRod = referenceSampleRod;
@ -224,6 +220,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
uac.minReferenceDepth = minReferenceDepth;
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
uac.exactCallsLog = exactCallsLog;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
@ -239,8 +236,10 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
this.GenotypingMode = SCAC.GenotypingMode;
this.heterozygosity = SCAC.heterozygosity;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.MAX_ALTERNATE_ALLELES_FOR_INDELS = SCAC.MAX_ALTERNATE_ALLELES_FOR_INDELS;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.exactCallsLog = SCAC.exactCallsLog;
}
}

View File

@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
@ -249,7 +249,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
}
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
if (UAC.AFmodel != AFCalc.Model.POOL)
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
}

View File

@ -34,6 +34,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager;
@ -78,11 +81,7 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>> glcm = new ThreadLocal<Map<String, GenotypeLikelihoodsCalculationModel>>();
// the model used for calculating p(non-ref)
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// the allele frequency likelihoods and posteriors (allocated once as an optimization)
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
private ThreadLocal<AFCalc> afcm = new ThreadLocal<AFCalc>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
@ -105,8 +104,6 @@ public class UnifiedGenotyperEngine {
private final GenomeLocParser genomeLocParser;
private final boolean BAQEnabledOnCMDLine;
protected static final double SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL;
// ---------------------------------------------------------------------------------------------------------
//
// Public interface functions
@ -355,11 +352,8 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[2]);
afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 ) {
@ -370,8 +364,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = true;
@ -380,50 +373,43 @@ public class UnifiedGenotyperEngine {
final List<Allele> myAlleles = new ArrayList<Allele>(vc.getAlleles().size());
final List<Integer> alleleCountsofMLE = new ArrayList<Integer>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
final Allele alternateAllele = vc.getAlternateAllele(i);
final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele);
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) {
final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i);
if ( alternateAllele.isReference() )
continue;
final int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1];
// we are non-ref if the probability of being non-ref > the emit confidence.
// the emit confidence is phred-scaled, say 30 => 10^-3.
// the posterior AF > 0 is log10: -5 => 10^-5
// we are non-ref if 10^-5 < 10^-3 => -5 < -3
final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0);
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
if ( isNonRef ) {
myAlleles.add(alternateAllele);
alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]);
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
bestGuessIsRef = false;
}
// if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele
else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
myAlleles.add(alternateAllele);
alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]);
alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele));
}
}
// calculate p(f>0):
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0];
final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());
double phredScaledConfidence;
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
final double phredScaledConfidence =
Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
? -10 * AFresult.getLog10PosteriorOfAFEq0()
: -10 * AFresult.getLog10PosteriorOfAFGT0());
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0);
}
// start constructing the resulting VC
@ -439,7 +425,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -470,27 +456,25 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> allAllelesToUse = builder.make().getAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model));
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model));
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0();
double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -513,10 +497,10 @@ public class UnifiedGenotyperEngine {
// if we are subsetting alleles (either because there were too many or because some were not polymorphic)
// then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext )
if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
vcCall = VariantContextUtils.reverseTrimAlleles(vcCall);
if ( annotationEngine != null && !limitedContext ) {
if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
// Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
final ReadBackedPileup pileup = rawContext.getBasePileup();
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
@ -524,13 +508,7 @@ public class UnifiedGenotyperEngine {
vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
}
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
@ -633,8 +611,6 @@ public class UnifiedGenotyperEngine {
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE()));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP()));
verboseWriter.println(AFline.toString());
}
@ -700,7 +676,7 @@ public class UnifiedGenotyperEngine {
return models;
}
protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
double sum = 0.0;
@ -754,34 +730,6 @@ public class UnifiedGenotyperEngine {
return glcm;
}
private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) {
List<Class<? extends AlleleFrequencyCalculationModel>> afClasses = new PluginManager<AlleleFrequencyCalculationModel>(AlleleFrequencyCalculationModel.class).getPlugins();
// user-specified name
String afModelName = UAC.AFmodel.name();
if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY)
afModelName = GPSTRING + afModelName;
for (int i = 0; i < afClasses.size(); i++) {
Class<? extends AlleleFrequencyCalculationModel> afClass = afClasses.get(i);
String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase();
if (afModelName.equalsIgnoreCase(key)) {
try {
Object args[] = new Object[]{UAC,N,logger,verboseWriter};
Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class);
return (AlleleFrequencyCalculationModel)c.newInstance(args);
}
catch (Exception e) {
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
}
}
}
throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel);
}
public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding<VariantContext> allelesBinding) {
if ( tracker == null || ref == null || logger == null )
return null;

View File

@ -0,0 +1,224 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.List;
/**
* Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods
*
*/
public abstract class AFCalc implements Cloneable {
    private final static Logger defaultLogger = Logger.getLogger(AFCalc.class);

    /** number of samples this calculator was configured for (may be 0) */
    protected final int nSamples;
    /** max. number of alternate alleles we will genotype at SNP sites */
    protected final int maxAlternateAllelesToGenotype;
    /** max. number of alternate alleles we will genotype at indel sites */
    protected final int maxAlternateAllelesForIndels;

    protected Logger logger = defaultLogger;

    // times each getLog10PNonRef call so runtimes can be written to the process log
    private final SimpleTimer callTimer = new SimpleTimer();

    // when non-null, per-call diagnostic records are appended here (see enableProcessLog).
    // NOTE(review): this stream is opened in initializeOutputFile and never closed;
    // acceptable for a process-lifetime log, but confirm no caller expects earlier finalization.
    private PrintStream callReport = null;

    // scratch object reused across calls; reset at the start of every getLog10PNonRef
    private final AFCalcResultTracker resultTracker;

    /**
     * Create a new AFCalc
     *
     * @param nSamples the number of samples we will be analyzing (must be >= 0)
     * @param maxAltAlleles the max. number of alt alleles to genotype for SNPs (must be >= 1)
     * @param maxAltAllelesForIndels the max. number of alt alleles to genotype for indels (must be >= 1)
     * @param ploidy the sample ploidy (must be >= 1)
     */
    protected AFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
        // zero samples is explicitly allowed by the check, so the message says "at least zero"
        if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be at least zero " + nSamples);
        if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
        if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
        if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy);

        this.nSamples = nSamples;
        this.maxAlternateAllelesToGenotype = maxAltAlleles;
        this.maxAlternateAllelesForIndels = maxAltAllelesForIndels;
        // the tracker must be able to hold results for the larger of the two allele limits
        this.resultTracker = new AFCalcResultTracker(Math.max(maxAltAlleles, maxAltAllelesForIndels));
    }

    /**
     * Turn on the exact-calls process log, writing one tab-separated record per call
     *
     * @param exactCallsLog the file the call records will be written to
     */
    public void enableProcessLog(final File exactCallsLog) {
        initializeOutputFile(exactCallsLog);
    }

    /**
     * Replace the default logger with a caller-supplied one
     *
     * @param logger the logger to use from now on
     */
    public void setLogger(final Logger logger) {
        this.logger = logger;
    }

    /**
     * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc
     *
     * @param vc the VariantContext holding the alleles and sample information
     * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i)
     * @return result (for programming convenience)
     */
    public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
        if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null");
        if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null");
        // (a former null check on resultTracker was removed: the field is final and always
        // assigned in the constructor, so that IllegalArgumentException could never fire)

        // reset the result, so we can store our new result there
        resultTracker.reset();

        // let the concrete model simplify the VC first (e.g. drop excess alt alleles)
        final VariantContext vcWorking = reduceScope(vc);

        callTimer.start();
        final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors);
        final long nanoTime = callTimer.getElapsedTimeNano();

        if ( callReport != null )
            printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, resultTracker.getLog10PosteriorOfAFzero());

        return result;
    }

    /**
     * Build an AFCalcResult from the state accumulated in the shared resultTracker
     *
     * @param vcWorking the (possibly reduced) VariantContext the calculation ran on
     * @param log10AlleleFrequencyPriors the priors used for the calculation
     * @return the tracker's state converted to an immutable AFCalcResult
     * @deprecated prefer having computeLog10PNonRef construct its result directly
     */
    @Deprecated
    protected AFCalcResult resultFromTracker(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) {
        resultTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles());
        return resultTracker.toAFCalcResult(log10AlleleFrequencyPriors);
    }

    // ---------------------------------------------------------------------------
    //
    // Abstract methods that should be implemented by concrete implementations
    // to actually calculate the AF
    //
    // ---------------------------------------------------------------------------

    /**
     * Look at VC and perhaps return a new one of reduced complexity, if that's necessary
     *
     * Used before the call to computeLog10PNonRef to simply the calculation job at hand,
     * if vc exceeds bounds. For example, if VC has 100 alt alleles this function
     * may decide to only genotype the best 2 of them.
     *
     * @param vc the initial VC provided by the caller to this AFcalculation
     * @return a potentially simpler VC that's more tractable to genotype
     */
    @Requires("vc != null")
    @Ensures("result != null")
    protected abstract VariantContext reduceScope(final VariantContext vc);

    /**
     * Actually carry out the log10PNonRef calculation on vc, storing results in results
     *
     * @param vc variant context with alleles and genotype likelihoods
     * @param log10AlleleFrequencyPriors priors
     * @return a AFCalcResult object describing the results of this calculation
     */
    // TODO -- add consistent requires among args
    protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc,
                                                        final double[] log10AlleleFrequencyPriors);

    /**
     * Must be overridden by concrete subclasses
     *
     * @param vc variant context with alleles and genotype likelihoods
     * @param allelesToUse alleles to subset
     * @param assignGenotypes whether new genotypes should be assigned (exact semantics defined by the subclass)
     * @param ploidy the sample ploidy to subset with
     * @return GenotypesContext object
     */
    public abstract GenotypesContext subsetAlleles(final VariantContext vc,
                                                   final List<Allele> allelesToUse,
                                                   final boolean assignGenotypes,
                                                   final int ploidy);

    // ---------------------------------------------------------------------------
    //
    // accessors
    //
    // ---------------------------------------------------------------------------

    /**
     * @return the larger of the SNP and indel alt-allele limits this calc was built with
     */
    public int getMaxAltAlleles() {
        return Math.max(maxAlternateAllelesToGenotype, maxAlternateAllelesForIndels);
    }

    // ---------------------------------------------------------------------------
    //
    // Print information about the call to the calls log
    //
    // ---------------------------------------------------------------------------

    /**
     * Open the process-log output file and write the header line
     *
     * @param outputFile the destination file; silently ignored when null
     * @throws UserException.CouldNotCreateOutputFile if the file cannot be created
     */
    private void initializeOutputFile(final File outputFile) {
        try {
            if (outputFile != null) {
                callReport = new PrintStream( new FileOutputStream(outputFile) );
                callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value")));
            }
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotCreateOutputFile(outputFile, e);
        }
    }

    /**
     * Write all diagnostic records for a single getLog10PNonRef call to the process log
     */
    private void printCallInfo(final VariantContext vc,
                               final double[] log10AlleleFrequencyPriors,
                               final long runtimeNano,
                               final double log10PosteriorOfAFzero) {
        printCallElement(vc, "type", "ignore", vc.getType());

        int allelei = 0;
        for ( final Allele a : vc.getAlleles() )
            printCallElement(vc, "allele", allelei++, a.getDisplayString());

        for ( final Genotype g : vc.getGenotypes() )
            printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString());

        for ( int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++ )
            printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]);

        printCallElement(vc, "runtime.nano", "ignore", runtimeNano);
        printCallElement(vc, "log10PosteriorOfAFzero", "ignore", log10PosteriorOfAFzero);

        callReport.flush();
    }

    /**
     * Write one tab-separated (loc, variable, key, value) record to the process log
     */
    private void printCallElement(final VariantContext vc,
                                  final Object variable,
                                  final Object key,
                                  final Object value) {
        final String loc = String.format("%s:%d", vc.getChr(), vc.getStart());
        callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
    }

    /**
     * @return the mutable tracker object shared across calls to this calculator
     */
    public AFCalcResultTracker getResultTracker() {
        return resultTracker;
    }
}

View File

@ -0,0 +1,226 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.lang.reflect.Constructor;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
/**
* Factory to make AFCalculations
*/
public class AFCalcFactory {
    /**
     * Enumeration of usable AF calculation, their constraints (i.e. ploidy).
     *
     * Note that the order these occur in the enum is the order of preference, so
     * the first value is taken over the second when multiple calculations satisfy
     * the needs of the request (i.e., considering ploidy).
     */
    public enum Calculation {
        /** The default implementation */
        EXACT(ReferenceDiploidExactAFCalc.class, 2, -1),

        /** reference implementation of multi-allelic EXACT model */
        EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),

        /** expt. implementation */
        @Deprecated
        EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1),

        /** expt. implementation -- for testing only */
        EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),

        /** original biallelic exact model, for testing only */
        EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),

        /** implementation that supports any sample ploidy */
        EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1);

        /**
         * Must be a name because we look this up dynamically
         */
        public final String className;
        /** max. alt alleles this model supports, or -1 for unlimited */
        public final int maxAltAlleles;
        /** the only ploidy this model supports, or -1 for any ploidy */
        public final int requiredPloidy;

        private Calculation(final String className, final int requiredPloidy, final int maxAltAlleles) {
            this.className = className;
            this.requiredPloidy = requiredPloidy;
            this.maxAltAlleles = maxAltAlleles;
        }

        private Calculation(final Class<?> clazz, final int requiredPloidy, final int maxAltAlleles) {
            this(clazz.getSimpleName(), requiredPloidy, maxAltAlleles);
        }

        /**
         * Can this calculation handle the requested ploidy and number of alt alleles?
         */
        public boolean usableForParams(final int requestedPloidy, final int requestedMaxAltAlleles) {
            return (requiredPloidy == -1 || requiredPloidy == requestedPloidy)
                    && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles);
        }
    }

    // all AFCalc implementations discovered on the classpath, keyed by plugin name
    private static final Map<String, Class<? extends AFCalc>> afClasses;
    static {
        afClasses = new PluginManager<AFCalc>(AFCalc.class).getPluginsByName();
    }

    // utility class; no instances
    private AFCalcFactory() {
    }

    /**
     * Find a registered AFCalc implementation whose simple name contains name
     *
     * Note this is a substring match, so a name may match more than one class;
     * the first match in iteration order wins.
     *
     * @param name the (partial) simple class name to look for
     * @return the matching class, or null if none matches
     */
    private static Class<? extends AFCalc> getClassByName(final String name) {
        for ( final Class<? extends AFCalc> clazz : afClasses.values() ) {
            if ( clazz.getSimpleName().contains(name) ) {
                return clazz;
            }
        }

        return null;
    }

    /**
     * Create a new AFCalc based on the parameters in the UAC
     *
     * If the requested model cannot handle the UAC's ploidy / alt-allele limits, the first
     * usable model (in Calculation declaration order) is substituted and written back to
     * UAC.AFmodel.
     *
     * @param UAC the UnifiedArgumentCollection containing the command-line parameters for the caller
     * @param nSamples the number of samples we will be using
     * @param logger an optional (can be null) logger to override the default in the model
     * @return an initialized AFCalc
     */
    public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC,
                                      final int nSamples,
                                      final Logger logger) {
        final int maxAltAlleles = Math.max(UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS);

        if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) {
            // logger is documented as optional, so every use must be null-guarded
            if ( logger != null )
                logger.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");

            final List<Calculation> supportingCalculations = new LinkedList<Calculation>();
            for ( final Calculation calc : Calculation.values() ) {
                if ( calc.usableForParams(UAC.samplePloidy, maxAltAlleles) )
                    supportingCalculations.add(calc);
            }

            if ( supportingCalculations.isEmpty() )
                throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.samplePloidy + " and max alt alleles " + maxAltAlleles);

            if ( supportingCalculations.size() > 1 && logger != null )
                logger.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily");

            // bug fix: the replacement model was previously only assigned when exactly one
            // candidate existed, so with multiple candidates the unusable model was kept
            UAC.AFmodel = supportingCalculations.get(0);

            if ( logger != null )
                logger.info("Selecting model " + UAC.AFmodel);
        }

        final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.MAX_ALTERNATE_ALLELES_FOR_INDELS, UAC.samplePloidy);

        if ( logger != null ) calc.setLogger(logger);
        if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);

        return calc;
    }

    /**
     * Create a new AFCalc, choosing the best implementation based on the given parameters, assuming
     * that we will only be requesting bi-allelic variants to diploid genotypes
     *
     * @param nSamples the number of samples we'll be using
     *
     * @return an initialized AFCalc
     */
    public static AFCalc createAFCalc(final int nSamples) {
        return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2, 2);
    }

    /**
     * Create a new AFCalc that supports maxAltAlleles for all variants and diploid genotypes
     *
     * @param calc the calculation we'd like to use
     * @param nSamples the number of samples we'll be using
     * @param maxAltAlleles the max. alt alleles for both SNPs and indels
     *
     * @return an initialized AFCalc
     */
    public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) {
        return createAFCalc(calc, nSamples, maxAltAlleles, maxAltAlleles, 2);
    }

    /**
     * Create a new AFCalc, choosing the best implementation based on the given parameters
     *
     * @param nSamples the number of samples we'll be using
     * @param maxAltAlleles the max. alt alleles to consider for SNPs
     * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
     * @param ploidy the sample ploidy.  Must be consistent with the calc
     *
     * @return an initialized AFCalc
     */
    public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
        final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
        return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAlt), nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    }

    /**
     * Choose the best calculation for nSamples and ploidy
     *
     * Iterates the Calculation values in declaration (preference) order and returns
     * the first one that can handle the parameters.
     *
     * @param nSamples the number of samples (only used for the error message)
     * @param ploidy the requested sample ploidy
     * @param maxAltAlleles the requested max. number of alt alleles
     * @return the first usable Calculation
     * @throws IllegalStateException if no registered calculation can handle the parameters
     */
    private static Calculation chooseBestCalculation(final int nSamples, final int ploidy, final int maxAltAlleles) {
        for ( final Calculation calc : Calculation.values() ) {
            if ( calc.usableForParams(ploidy, maxAltAlleles) ) {
                return calc;
            }
        }

        throw new IllegalStateException("no calculation found that supports nSamples " + nSamples + " ploidy " + ploidy + " and maxAltAlleles " + maxAltAlleles);
    }

    /**
     * Create a new AFCalc
     *
     * @param calc the calculation to use
     * @param nSamples the number of samples we'll be using
     * @param maxAltAlleles the max. alt alleles to consider for SNPs
     * @param maxAltAllelesForIndels the max. alt alleles to consider for non-SNPs
     * @param ploidy the sample ploidy.  Must be consistent with the calc
     *
     * @return an initialized AFCalc
     */
    public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
        if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null");
        // zero samples is explicitly allowed by the check, so the message says "at least"
        if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be at least zero " + nSamples);
        if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
        if ( maxAltAllelesForIndels < 1 ) throw new IllegalArgumentException("maxAltAllelesForIndels must be greater than zero " + maxAltAllelesForIndels);
        if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy);

        final int maxAlt = Math.max(maxAltAlleles, maxAltAllelesForIndels);
        if ( ! calc.usableForParams(ploidy, maxAlt) )
            throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy);

        final Class<? extends AFCalc> afClass = getClassByName(calc.className);
        if ( afClass == null )
            throw new IllegalArgumentException("Unexpected AFCalc " + calc);

        try {
            // all AFCalc implementations are expected to expose this 4-int constructor
            final Object[] args = new Object[]{nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy};
            final Constructor<? extends AFCalc> c = afClass.getDeclaredConstructor(int.class, int.class, int.class, int.class);
            return c.newInstance(args);
        } catch (Exception e) {
            // preserve the reflective cause so the real failure isn't lost
            throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e);
        }
    }

    /**
     * Create one AFCalc per requested Calculation, all sharing the same size parameters
     *
     * @return a freshly allocated list of initialized AFCalcs, one per entry in calcs
     */
    protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
        final List<AFCalc> AFCalcs = new LinkedList<AFCalc>();

        for ( final Calculation calc : calcs )
            AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy));

        return AFCalcs;
    }
}

View File

@ -0,0 +1,334 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Describes the results of the AFCalc
*
* Only the bare essentials are represented here, as all AFCalc models must return meaningful results for
* all of these fields.
*
* Note that all of the values -- i.e. priors -- are checked now that they are meaningful, which means
* that users of this code can rely on the values coming out of these functions.
*/
public class AFCalcResult {
private final static int AF0 = 0;
private final static int AF1p = 1;
private final static int LOG_10_ARRAY_SIZES = 2;
private final double[] log10LikelihoodsOfAC;
private final double[] log10PriorsOfAC;
private final double[] log10PosteriorsOfAC;
private final Map<Allele, Double> log10pNonRefByAllele;
/**
* The AC values for all ALT alleles at the MLE
*/
private final int[] alleleCountsOfMLE;
int nEvaluations = 0;
/**
* The list of alleles actually used in computing the AF
*/
private List<Allele> allelesUsedInGenotyping = null;
/**
* Create a results object capability of storing results for calls with up to maxAltAlleles
*/
public AFCalcResult(final int[] alleleCountsOfMLE,
final int nEvaluations,
final List<Allele> allelesUsedInGenotyping,
final double[] log10LikelihoodsOfAC,
final double[] log10PriorsOfAC,
final Map<Allele, Double> log10pNonRefByAllele) {
if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping);
if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null");
if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size());
if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations);
if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2");
if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2");
if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null");
if ( log10pNonRefByAllele.size() != allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping);
if ( ! goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC));
if ( ! goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC));
this.alleleCountsOfMLE = alleleCountsOfMLE;
this.nEvaluations = nEvaluations;
this.allelesUsedInGenotyping = allelesUsedInGenotyping;
this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES);
this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES);
this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC);
this.log10pNonRefByAllele = new HashMap<Allele, Double>(log10pNonRefByAllele);
}
/**
* Return a new AFCalcResult with a new prior probability
*
* @param log10PriorsOfAC
* @return
*/
public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) {
return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
}
/**
* Returns a vector with maxAltAlleles values containing AC values at the MLE
*
* The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
* starting from index 0 (i.e., the first alt allele is at 0). The vector is always
* maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
* are meaningful.
*
* @return a vector with allele counts, not all of which may be meaningful
*/
@Ensures("result != null")
public int[] getAlleleCountsOfMLE() {
return alleleCountsOfMLE;
}
/**
* Returns the AC of allele a la #getAlleleCountsOfMLE
*
* @param allele the allele whose AC we want to know. Error if its not in allelesUsedInGenotyping
* @throws IllegalStateException if allele isn't in allelesUsedInGenotyping
* @return the AC of allele
*/
public int getAlleleCountAtMLE(final Allele allele) {
return getAlleleCountsOfMLE()[altAlleleIndex(allele)];
}
/**
* Returns the number of cycles used to evaluate the pNonRef for this AF calculation
*
* @return the number of evaluations required to produce the answer for this AF calculation
*/
public int getnEvaluations() {
return nEvaluations;
}
/**
* Get the list of alleles actually used in genotyping.
*
* Due to computational / implementation constraints this may be smaller than
* the actual list of alleles requested
*
* @return a non-empty list of alleles used during genotyping
*/
@Ensures({"result != null", "! result.isEmpty()"})
public List<Allele> getAllelesUsedInGenotyping() {
return allelesUsedInGenotyping;
}
/**
* Get the log10 normalized -- across all ACs -- posterior probability of AC == 0 for all alleles
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10PosteriorOfAFEq0() {
return log10PosteriorsOfAC[AF0];
}
/**
* Get the log10 normalized -- across all ACs -- posterior probability of AC > 0 for any alleles
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10PosteriorOfAFGT0() {
return log10PosteriorsOfAC[AF1p];
}
/**
* Get the log10 unnormalized -- across all ACs -- likelihood of AC == 0 for all alleles
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFEq0() {
return log10LikelihoodsOfAC[AF0];
}
/**
* Get the log10 unnormalized -- across all ACs -- likelihood of AC > 0 for any alleles
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10LikelihoodOfAFGT0() {
return log10LikelihoodsOfAC[AF1p];
}
/**
* Get the log10 unnormalized -- across all ACs -- prior probability of AC == 0 for all alleles
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10PriorOfAFEq0() {
return log10PriorsOfAC[AF0];
}
/**
* Get the log10 unnormalized -- across all ACs -- prior probability of AC > 0
*
* @return
*/
@Ensures({"goodLog10Probability(result)"})
public double getLog10PriorOfAFGT0() {
return log10PriorsOfAC[AF1p];
}
/**
* Are we sufficiently confidence in being non-ref that the site is considered polymorphic?
*
* We are non-ref if the probability of being non-ref > the emit confidence (often an argument).
* Suppose posterior AF > 0 is log10: -5 => 10^-5
* And that log10minPNonRef is -3.
* We are considered polymorphic since 10^-5 < 10^-3 => -5 < -3
*
* @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic
*
* @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0
*/
public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) {
return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef;
}
/**
* Returns the log10 probability that allele is segregating
*
* Unlike the sites-level annotation, this calculation is specific to allele, and can be
* used to separately determine how much evidence there is that allele is independently
* segregating as opposed to the site being polymorphic with any allele. In the bi-allelic
* case these are obviously the same but for multiple alt alleles there can be lots of
* evidence for one allele but not so much for any other allele
*
* @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping
* @return the log10 probability that allele is segregating at this site
*/
@Ensures("goodLog10Probability(result)")
public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) {
final Double log10pNonRef = log10pNonRefByAllele.get(allele);
if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele);
return log10pNonRef;
}
/**
* Returns the log10 normalized posteriors given the log10 likelihoods and priors
*
* @param log10LikelihoodsOfAC
* @param log10PriorsOfAC
*
* @return freshly allocated log10 normalized posteriors vector
*/
@Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length")
@Ensures("goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)")
private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) {
final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length];
for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ )
log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i];
// necessary because the posteriors may be so skewed that the log-space normalized value isn't
// good, so we have to try both log-space normalization as well as the real-space normalization if the
// result isn't good
final double[] logNormalized = MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, true);
if ( goodLog10ProbVector(logNormalized, logNormalized.length, true) )
return logNormalized;
else
return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false);
}
/**
* Check that the log10 prob vector vector is well formed
*
* @param vector
* @param expectedSize
* @param shouldSumToOne
*
* @return true if vector is well-formed, false otherwise
*/
private static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) {
if ( vector.length != expectedSize ) return false;
for ( final double pr : vector ) {
if ( ! goodLog10Probability(pr) )
return false;
}
if ( shouldSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(vector), 1.0, 1e-4) != 0 )
return false;
return true; // everything is good
}
/**
 * Computes the offset into linear vectors indexed by alt allele for allele
 *
 * Things like our MLE allele count vector are indexed by alt allele index, with
 * the first alt allele being 0, the second 1, etc. This function computes the index
 * associated with allele.
 *
 * @param allele the allele whose alt index we'd like to know
 * @throws IllegalArgumentException if allele isn't in allelesUsedInGenotyping
 * @return an index value greater than 0 suitable for indexing into the MLE and other alt allele indexed arrays
 */
@Requires("allele != null")
@Ensures({"result >= 0", "result < allelesUsedInGenotyping.size() - 1"})
private int altAlleleIndex(final Allele allele) {
    if ( allele.isReference() )
        throw new IllegalArgumentException("Cannot get the alt allele index for reference allele " + allele);

    final int index = allelesUsedInGenotyping.indexOf(allele);
    if ( index == -1 )
        throw new IllegalArgumentException("could not find allele " + allele + " in " + allelesUsedInGenotyping);

    // position 0 of allelesUsedInGenotyping holds the reference, so alt allele i sits at list index i + 1
    return index - 1;
}
/**
 * Checks that the result is a well-formed log10 probability: a finite,
 * non-NaN value that is at most 0.0.
 *
 * @param result a supposedly well-formed log10 probability value
 * @return true if result is really well formed
 */
private static boolean goodLog10Probability(final double result) {
    if ( Double.isNaN(result) || Double.isInfinite(result) )
        return false;
    return result <= 0.0;
}
}

View File

@ -0,0 +1,256 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Useful helper class to communicate the results of the allele frequency calculation
 *
 * Accumulates, as the calculation model walks over AC conformations: the best likelihood
 * (MLE) and best posterior (MAP) seen so far together with their allele-count vectors,
 * the likelihood/posterior of the monomorphic (AF=0) configuration, and a running log10
 * sum of all AF>0 likelihoods.
 *
 * Originally by ebanks (Dec 14, 2011).
 *
 * TODO -- WHAT IS THE CONTRACT ON MAP AC AND P NON REF?
 */
class AFCalcResultTracker {
    protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;

    // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
    protected double log10MLE;
    protected double log10MAP;
    private final int[] alleleCountsOfMLE;
    private final int[] alleleCountsOfMAP;

    // The posteriors seen, not including that of AF=0.  Values accumulate in a fixed-size
    // cache that is condensed (log10-summed into cell 0) whenever it fills up.
    private static final int LIKELIHOODS_CACHE_SIZE = 5000;
    private final double[] log10LikelihoodsMatrixValues = new double[LIKELIHOODS_CACHE_SIZE];
    private int currentLikelihoodsCacheIndex = 0;
    // lazily computed sum of the cache; null means "not yet summed"
    protected Double log10LikelihoodsMatrixSum = null;

    // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
    private double log10LikelihoodOfAFzero;
    private double log10PosteriorOfAFzero;

    // upper bounds on the AC conformations considered, as supplied by the calculation model (may remain null)
    private int[] AClimits;

    // number of evaluation cycles the model used for this calculation
    int nEvaluations = 0;

    /**
     * The list of alleles actually used in computing the AF
     */
    private List<Allele> allelesUsedInGenotyping = null;

    /**
     * Create a results object capability of storing results for calls with up to maxAltAlleles
     *
     * @param maxAltAlleles an integer >= 1
     */
    public AFCalcResultTracker(final int maxAltAlleles) {
        // note: the message now matches the actual check (the original claimed ">= 0")
        if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);
        alleleCountsOfMLE = new int[maxAltAlleles];
        alleleCountsOfMAP = new int[maxAltAlleles];
        reset();
    }

    /**
     * Returns a vector with maxAltAlleles values containing AC values at the MLE
     *
     * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
     * starting from index 0 (i.e., the first alt allele is at 0). The vector is always
     * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
     * are meaningful.
     *
     * @return a vector with allele counts, not all of which may be meaningful
     */
    @Ensures("result != null")
    public int[] getAlleleCountsOfMLE() {
        return alleleCountsOfMLE;
    }

    /**
     * Returns a vector with maxAltAlleles values containing AC values at the MAP
     *
     * @see #getAlleleCountsOfMLE() for the encoding of results in this vector
     *
     * @return a non-null vector of ints
     */
    @Ensures("result != null")
    public int[] getAlleleCountsOfMAP() {
        return alleleCountsOfMAP;
    }

    /**
     * Returns the likelihoods summed across all AC values for AC > 0
     *
     * Computed lazily from the likelihoods cache and memoized in log10LikelihoodsMatrixSum.
     *
     * @return the log10 sum of the cached AF>0 likelihoods
     */
    public double getLog10LikelihoodOfAFNotZero() {
        if ( log10LikelihoodsMatrixSum == null ) {
            if ( currentLikelihoodsCacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
                log10LikelihoodsMatrixSum = MathUtils.LOG10_P_OF_ZERO;
            else
                log10LikelihoodsMatrixSum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
        }
        return log10LikelihoodsMatrixSum;
    }

    /**
     * As {@link #getLog10LikelihoodOfAFNotZero()}, optionally capping the value at 0.0
     * (a log10 probability should never exceed 0).
     *
     * @param capAt0 when true, never return a value above 0.0
     */
    public double getLog10LikelihoodOfAFNotZero(final boolean capAt0) {
        return Math.min(getLog10LikelihoodOfAFNotZero(), capAt0 ? 0.0 : Double.POSITIVE_INFINITY);
    }

    /**
     * The log10 likelihood of the AF=0 (monomorphic) configuration, as last set by the model.
     *
     * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
     */
    public double getLog10LikelihoodOfAFzero() {
        return log10LikelihoodOfAFzero;
    }

    /**
     * The log10 posterior of the AF=0 (monomorphic) configuration, as last set by the model.
     *
     * TODO -- eric what is this supposed to return? my unit tests don't do what I think they should
     */
    public double getLog10PosteriorOfAFzero() {
        return log10PosteriorOfAFzero;
    }

    /**
     * Package this tracker's state into an immutable AFCalcResult.
     *
     * @param log10PriorsByAC the log10 priors indexed by total AC (index 0 = AF zero)
     * @return a freshly constructed AFCalcResult summarizing this calculation
     */
    protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
        // only the first (nAlleles - 1) slots of the MLE vector are meaningful
        final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
        final double[] log10Likelihoods = new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero(true)};
        final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);

        // TODO -- replace with more meaningful computation
        // TODO -- refactor this calculation into the ref calculation
        final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
        for ( int i = 0; i < subACOfMLE.length; i++ ) {
            final Allele allele = allelesUsedInGenotyping.get(i+1);
            final double log10PNonRef = getAlleleCountsOfMAP()[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
            log10pNonRefByAllele.put(allele, log10PNonRef);
        }

        return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
    }

    // --------------------------------------------------------------------------------
    //
    // Protected mutational methods only for use within the calculation models themselves
    //
    // --------------------------------------------------------------------------------

    /**
     * Reset the data in this results object, so that it can be used in a subsequent AF calculation
     *
     * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
     */
    protected void reset() {
        log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = VALUE_NOT_CALCULATED;
        for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) {
            alleleCountsOfMLE[i] = 0;
            alleleCountsOfMAP[i] = 0;
        }
        currentLikelihoodsCacheIndex = 0;
        log10LikelihoodsMatrixSum = null;
        allelesUsedInGenotyping = null;
        nEvaluations = 0;
        // POSITIVE_INFINITY marks "unset" cells; only cells below currentLikelihoodsCacheIndex are ever read
        Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
    }

    /**
     * Tell this result we used one more evaluation cycle
     */
    protected void incNEvaluations() {
        nEvaluations++;
    }

    /**
     * Record the likelihood of conformation K, updating the MLE if it's the best seen so far.
     *
     * @param log10LofK        the log10 likelihood of conformation K
     * @param alleleCountsForK the allele counts of conformation K (alt-allele indexed)
     */
    protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
        addToLikelihoodsCache(log10LofK);

        if ( log10LofK > log10MLE ) {
            log10MLE = log10LofK;
            for ( int i = 0; i < alleleCountsForK.length; i++ )
                alleleCountsOfMLE[i] = alleleCountsForK[i];
        }
    }

    /**
     * Update the MAP if the posterior of conformation K is the best seen so far.
     *
     * @param log10LofK        the log10 posterior of conformation K
     * @param alleleCountsForK the allele counts of conformation K (alt-allele indexed)
     */
    protected void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
        if ( log10LofK > log10MAP ) {
            log10MAP = log10LofK;
            for ( int i = 0; i < alleleCountsForK.length; i++ )
                alleleCountsOfMAP[i] = alleleCountsForK[i];
        }
    }

    private void addToLikelihoodsCache(final double log10LofK) {
        // add to the cache
        log10LikelihoodsMatrixValues[currentLikelihoodsCacheIndex++] = log10LofK;

        // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
        if ( currentLikelihoodsCacheIndex == LIKELIHOODS_CACHE_SIZE) {
            final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsMatrixValues, 0, currentLikelihoodsCacheIndex);
            Arrays.fill(log10LikelihoodsMatrixValues, Double.POSITIVE_INFINITY);
            log10LikelihoodsMatrixValues[0] = temporarySum;
            currentLikelihoodsCacheIndex = 1;
        }
    }

    /**
     * Record the AF=0 likelihood; also updates the MLE since AF=0 is itself a conformation.
     */
    protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
        this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
        if ( log10LikelihoodOfAFzero > log10MLE ) {
            log10MLE = log10LikelihoodOfAFzero;
            Arrays.fill(alleleCountsOfMLE, 0);
        }
    }

    /**
     * Record the AF=0 posterior; also updates the MAP since AF=0 is itself a conformation.
     */
    protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
        this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
        if ( log10PosteriorOfAFzero > log10MAP ) {
            log10MAP = log10PosteriorOfAFzero;
            Arrays.fill(alleleCountsOfMAP, 0);
        }
    }

    /**
     * Record the alleles actually used in genotyping.
     *
     * @param allelesUsedInGenotyping a non-null, non-empty list of alleles
     * @throws IllegalArgumentException if the list is null or empty
     */
    protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
        if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
            throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
        this.allelesUsedInGenotyping = allelesUsedInGenotyping;
    }

    /**
     * Record the AC limits used by a constrained calculation model.
     */
    protected void setAClimits(final int[] AClimits) {
        this.AClimits = AClimits;
    }
}

View File

@ -0,0 +1,107 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@Deprecated
public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc {
    protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    }

    /**
     * Build a StateTracker bounded by the per-allele max ACs derivable from vc's genotypes,
     * recording those limits on the result tracker as well.
     */
    protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
        final int[] limits = computeMaxACs(vc);
        resultTracker.setAClimits(limits);
        return new StateTracker(limits);
    }

    /**
     * Computes the maximum ACs we need to consider for each alt allele
     *
     * Walks over the genotypes in VC, and computes for each alt allele the maximum
     * AC we need to consider in that alt allele dimension. Does the calculation
     * based on the PLs in each genotype g, choosing to update the max AC for the
     * alt alleles corresponding to that PL. Only takes the first lowest PL,
     * if there are multiple genotype configurations with the same PL value. It
     * takes values in the order of the alt alleles.
     *
     * @param vc the variant context we will compute max alt alleles for
     * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the
     *         first alt allele.
     */
    @Ensures("result != null")
    protected final int[] computeMaxACs(final VariantContext vc) {
        final int[] maxACs = new int[vc.getNAlleles() - 1];

        for ( final Genotype genotype : vc.getGenotypes() )
            updateMaxACs(genotype, maxACs);

        return maxACs;
    }

    /**
     * Update the maximum achievable allele counts in maxACs according to the PLs in g
     *
     * Selects the most likely genotype configuration (smallest PL) from g, and bumps the
     * max AC for each alt allele in that configuration. For example, if the lowest PL is
     * for 0/1, the max AC for alt allele 1 grows by 1; for 1/1 it grows by 2. Works for
     * any number of alt alleles (determined by length of maxACs).
     *
     * If the min PL occurs at 0/0, nothing is updated. Ties are broken greedily: the
     * first genotype index achieving the minimum PL wins.
     *
     * @param g      the genotype whose PLs drive the update
     * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele)
     */
    @Requires({
            "g != null",
            "maxACs != null",
            "goodMaxACs(maxACs)"})
    private void updateMaxACs(final Genotype g, final int[] maxACs) {
        final int[] PLs = g.getLikelihoods().getAsPLs();

        // locate the first index holding the smallest PL (i.e., the most likely genotype)
        int bestIndex = 0;
        for ( int i = 1; i < PLs.length; i++ ) {
            if ( PLs[i] < PLs[bestIndex] )
                bestIndex = i;
        }

        final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(bestIndex);
        updateMaxACs(maxACs, pair.alleleIndex1);
        updateMaxACs(maxACs, pair.alleleIndex2);
    }

    /**
     * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref)
     *
     * If alleleI == 0 => doesn't update anything
     * else maxACs[alleleI - 1]++
     *
     * @param maxACs  array of max alt allele ACs
     * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles.
     */
    @Requires({
            "alleleI >= 0",
            "(alleleI - 1) < maxACs.length",
            "goodMaxACs(maxACs)"})
    private void updateMaxACs(final int[] maxACs, final int alleleI) {
        // alleleI == 0 is the reference allele, which contributes to no alt AC
        if ( alleleI > 0 )
            maxACs[alleleI - 1]++;
    }

    // sanity predicate used by the contracts above: counts must never sum negative
    private static boolean goodMaxACs(final int[] maxACs) {
        return MathUtils.sum(maxACs) >= 0;
    }
}

View File

@ -0,0 +1,350 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * Exact allele-frequency calculation for diploid samples.
 *
 * Implements the multi-allelic generalization of the exact model: AC conformations are
 * explored breadth-first from AC=0 upward, each conformation's likelihood column is built
 * from its dependent (lower-AC) conformations, and a StateTracker decides when the search
 * can stop. Subclasses choose how the StateTracker is bounded (see makeMaxLikelihood).
 */
public abstract class DiploidExactAFCalc extends ExactAFCalc {
    public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
        if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy);
    }

    /**
     * Create the StateTracker that bounds/terminates the AC search for vc.
     * Subclasses may also record limits on resultTracker (e.g. setAClimits).
     */
    protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker);

    @Override
    protected AFCalcResult computeLog10PNonRef(final VariantContext vc,
                                               final double[] log10AlleleFrequencyPriors) {
        final int numAlternateAlleles = vc.getNAlleles() - 1;

        final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), true);
        // getGLs prepends a dummy GL vector at index 0, hence the -1 here
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples; // diploid: 2 chromosomes per sample

        // queue of AC conformations to process
        final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();

        // mapping of ExactACset indexes to the objects
        final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(numChr+1);

        // add AC=0 to the queue
        final int[] zeroCounts = new int[numAlternateAlleles];
        ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts));
        ACqueue.add(zeroSet);
        indexesToACset.put(zeroSet.getACcounts(), zeroSet);

        // keep processing while we have AC conformations that need to be calculated
        final StateTracker stateTracker = makeMaxLikelihood(vc, getResultTracker());
        while ( !ACqueue.isEmpty() ) {
            getResultTracker().incNEvaluations(); // keep track of the number of evaluations

            // compute log10Likelihoods
            final ExactACset set = ACqueue.remove();

            if ( stateTracker.withinMaxACs(set.getACcounts()) ) {
                final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, getResultTracker());

                // adjust max likelihood seen if needed
                stateTracker.update(log10LofKs, set.getACcounts());

                // clean up memory
                indexesToACset.remove(set.getACcounts());
                //if ( DEBUG )
                //    System.out.printf("  *** removing used set=%s%n", set.ACcounts);
            }
        }

        return resultFromTracker(vc, log10AlleleFrequencyPriors);
    }

    /**
     * Trim vc down to at most the configured maximum number of alternate alleles,
     * keeping the most likely ones (by summed GL evidence) and re-subsetting genotypes.
     */
    @Override
    protected VariantContext reduceScope(final VariantContext vc) {
        final int myMaxAltAllelesToGenotype = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;

        // don't try to genotype too many alternate alleles
        if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) {
            logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

            VariantContextBuilder builder = new VariantContextBuilder(vc);
            List<Allele> alleles = new ArrayList<Allele>(myMaxAltAllelesToGenotype + 1);
            alleles.add(vc.getReference());
            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype));
            builder.alleles(alleles);
            builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false));
            return builder.make();
        } else {
            return vc;
        }
    }

    private static final int PL_INDEX_OF_HOM_REF = 0;

    /**
     * Pick the numAllelesToChoose alternate alleles with the most GL support,
     * returned in the original alternate-allele order of vc.
     */
    private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) {
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
        for ( int i = 0; i < numOriginalAltAlleles; i++ )
            likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

        // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
        final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
        for ( final double[] likelihoods : GLs ) {
            final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
            if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) {
                GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
                if ( alleles.alleleIndex1 != 0 )
                    likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
                // don't double-count it
                if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 )
                    likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF];
            }
        }

        // sort them by probability mass and choose the best ones
        // (sorting the Arrays.asList view sorts likelihoodSums in place)
        Collections.sort(Arrays.asList(likelihoodSums));
        final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( int i = 0; i < numAllelesToChoose; i++ )
            bestAlleles.add(likelihoodSums[i].allele);

        // restore the original alternate-allele ordering among the chosen alleles
        final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
        for ( Allele allele : vc.getAlternateAlleles() ) {
            if ( bestAlleles.contains(allele) )
                orderedBestAlleles.add(allele);
        }

        return orderedBestAlleles;
    }

    // Pairs an AC conformation with the PL index through which its dependency feeds it data
    private static final class DependentSet {
        public final int[] ACcounts;
        public final int PLindex;

        public DependentSet(final int[] ACcounts, final int PLindex) {
            this.ACcounts = ACcounts;
            this.PLindex = PLindex;
        }
    }

    /**
     * Compute the likelihood of conformation set and enqueue its k+1 / k+2 successor
     * conformations (unless the tracker says we can abort).
     *
     * @return the log10 likelihood of this conformation
     */
    private double calculateAlleleCountConformation(final ExactACset set,
                                                    final ArrayList<double[]> genotypeLikelihoods,
                                                    final StateTracker stateTracker,
                                                    final int numChr,
                                                    final LinkedList<ExactACset> ACqueue,
                                                    final HashMap<ExactACcounts, ExactACset> indexesToACset,
                                                    final double[] log10AlleleFrequencyPriors,
                                                    final AFCalcResultTracker resultTracker) {
        //if ( DEBUG )
        //    System.out.printf("  *** computing LofK for set=%s%n", set.ACcounts);

        // compute the log10Likelihoods
        computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, resultTracker);

        final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];

        // can we abort early because the log10Likelihoods are so small?
        if ( stateTracker.abort(log10LofK, set.getACcounts()) ) {
            //if ( DEBUG )
            //    System.out.printf("  *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            return log10LofK;
        }

        // iterate over higher frequencies if possible
        final int ACwiggle = numChr - set.getACsum();
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        final int numAltAlleles = set.getACcounts().getCounts().length;

        // add conformations for the k+1 case
        for ( int allele = 0; allele < numAltAlleles; allele++ ) {
            final int[] ACcountsClone = set.getACcounts().getCounts().clone();
            ACcountsClone[allele]++;
            // to get to this conformation, a sample would need to be AB (remember that ref=0)
            final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1);
            updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different
        if ( ACwiggle > 1 ) {
            final ArrayList<DependentSet> differentAlleles = new ArrayList<DependentSet>(numAltAlleles * numAltAlleles);
            final ArrayList<DependentSet> sameAlleles = new ArrayList<DependentSet>(numAltAlleles);

            for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
                for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                    final int[] ACcountsClone = set.getACcounts().getCounts().clone();
                    ACcountsClone[allele_i]++;
                    ACcountsClone[allele_j]++;

                    // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index)
                    final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1);
                    if ( allele_i == allele_j )
                        sameAlleles.add(new DependentSet(ACcountsClone, PLindex));
                    else
                        differentAlleles.add(new DependentSet(ACcountsClone, PLindex));
                }
            }

            // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering
            for ( DependentSet dependent : differentAlleles )
                updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
            for ( DependentSet dependent : sameAlleles )
                updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods);
        }

        return log10LofK;
    }

    // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
    // also pushes its value to the given callingSetIndex.
    private void updateACset(final StateTracker stateTracker,
                             final int[] newSetCounts,
                             final int numChr,
                             final ExactACset dependentSet,
                             final int PLsetIndex,
                             final Queue<ExactACset> ACqueue,
                             final HashMap<ExactACcounts, ExactACset> indexesToACset,
                             final ArrayList<double[]> genotypeLikelihoods) {
        final ExactACcounts index = new ExactACcounts(newSetCounts);
        if ( !indexesToACset.containsKey(index) ) {
            // column height is numSamples + 1 (numChr/2 samples, plus the zero-samples row)
            ExactACset set = new ExactACset(numChr/2 +1, index);
            indexesToACset.put(index, set);
            ACqueue.add(set);
        }

        // push data from the dependency to the new set
        //if ( DEBUG )
        //    System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts);
        pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods);
    }

    /**
     * Fill in set's likelihood column, record AF=0 quantities when totalK == 0,
     * and update the result tracker's MLE/MAP with the final cell.
     */
    private void computeLofK(final ExactACset set,
                             final ArrayList<double[]> genotypeLikelihoods,
                             final double[] log10AlleleFrequencyPriors,
                             final AFCalcResultTracker resultTracker) {
        set.getLog10Likelihoods()[0] = 0.0; // the zero case
        final int totalK = set.getACsum();

        // special case for k = 0 over all k
        if ( totalK == 0 ) {
            for ( int j = 1; j < set.getLog10Likelihoods().length; j++ )
                set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];

            final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];
            resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
            resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
            return;
        }

        // if we got here, then k > 0 for at least one k.
        // the non-AA possible conformations were already dealt with by pushes from dependent sets;
        // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
        for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) {
            if ( totalK < 2*j-1 ) { // the AA term only exists when at least 2 ref chromosomes remain
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX];
                set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue);
            }

            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
            set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator;
        }

        double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1];

        // update the MLE if necessary
        resultTracker.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts());

        // apply the priors over each alternate allele
        for ( final int ACcount : set.getACcounts().getCounts() ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }

        resultTracker.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts());
    }

    /**
     * Propagate likelihood mass from dependentSet into targetSet's column through
     * the genotype identified by PLsetIndex.
     */
    private void pushData(final ExactACset targetSet,
                          final ExactACset dependentSet,
                          final int PLsetIndex,
                          final ArrayList<double[]> genotypeLikelihoods) {
        final int totalK = targetSet.getACsum();

        for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) {
            if ( totalK <= 2*j ) { // skip impossible conformations
                final double[] gl = genotypeLikelihoods.get(j);
                final double conformationValue =
                        determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex];
                targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue);
            }
        }
    }

    private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
        // the closed form representation generalized for multiple alleles is as follows:
        // AA: (2j - totalK) * (2j - totalK - 1)
        // AB: 2k_b * (2j - totalK)
        // AC: 2k_c * (2j - totalK)
        // BB: k_b * (k_b - 1)
        // BC: 2 * k_b * k_c
        // CC: k_c * (k_c - 1)

        // find the 2 alleles that are represented by this PL index
        GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);

        // *** note that throughout this method we subtract one from the alleleIndex because ACcounts ***
        // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does.   ***

        // the AX het case
        if ( alleles.alleleIndex1 == 0 )
            return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK];

        final int k_i = ACcounts[alleles.alleleIndex1-1];

        // the hom var case (e.g. BB, CC, DD)
        final double coeff;
        if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) {
            coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
        }
        // the het non-ref case (e.g. BC, BD, CD)
        else {
            final int k_j = ACcounts[alleles.alleleIndex2-1];
            coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
        }

        return coeff;
    }

    /**
     * Subset vc's genotypes to allelesToUse.  NOTE(review): the ploidy parameter is
    * unused here — this class is diploid-only, so the work is delegated to the
     * diploid subsetting utility.
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes);
    }
}

View File

@ -0,0 +1,46 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 10/5/12
* Time: 2:54 PM
* To change this template use File | Settings | File Templates.
*/ // a wrapper around the int array so that we can make it hashable
public final class ExactACcounts {
private final int[] counts;
private int hashcode = -1;
public ExactACcounts(final int[] counts) {
this.counts = counts;
}
public int[] getCounts() {
return counts;
}
@Override
public boolean equals(Object obj) {
return (obj instanceof ExactACcounts) && Arrays.equals(getCounts(), ((ExactACcounts) obj).getCounts());
}
@Override
public int hashCode() {
if ( hashcode == -1 )
hashcode = Arrays.hashCode(getCounts());
return hashcode;
}
@Override
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append(getCounts()[0]);
for ( int i = 1; i < getCounts().length; i++ ) {
sb.append("/");
sb.append(getCounts()[i]);
}
return sb.toString();
}
}

View File

@ -0,0 +1,49 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: depristo
* Date: 10/5/12
* Time: 2:53 PM
* To change this template use File | Settings | File Templates.
*/ // This class represents a column in the Exact AC calculation matrix
// This class represents a column in the Exact AC calculation matrix
public final class ExactACset {
    // the counts of the various alternate alleles which this column represents
    private final ExactACcounts ACcounts;

    // the column of the matrix; initialized to -infinity (log10 of probability 0)
    private final double[] log10Likelihoods;

    // cached sum of the AC counts; -1 means "not yet computed"
    int sum = -1;

    public ExactACset(final int size, final ExactACcounts ACcounts) {
        this.ACcounts = ACcounts;
        log10Likelihoods = new double[size];
        Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
    }

    /**
     * sum of all the non-reference alleles
     */
    public int getACsum() {
        if ( sum == -1 )
            sum = (int)MathUtils.sum(getACcounts().getCounts());
        return sum;
    }

    @Override
    public boolean equals(Object obj) {
        return (obj instanceof ExactACset) && getACcounts().equals(((ExactACset)obj).getACcounts());
    }

    // equals() is defined purely in terms of ACcounts, so hashCode must be too;
    // the original overrode equals without hashCode, violating the Object contract
    @Override
    public int hashCode() {
        return getACcounts().hashCode();
    }

    public ExactACcounts getACcounts() {
        return ACcounts;
    }

    public double[] getLog10Likelihoods() {
        return log10Likelihoods;
    }
}

View File

@ -0,0 +1,81 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.ArrayList;
/**
 * Uses the Exact calculation of Heng Li
 */
abstract class ExactAFCalc extends AFCalc {
    protected static final int HOM_REF_INDEX = 0;  // AA likelihoods are always first

    protected ExactAFCalc(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    }

    /**
     * Wrapper class that compares two likelihoods associated with two alleles
     *
     * Sorts in DESCENDING order of likelihood sum (most likely allele first).
     */
    protected static final class LikelihoodSum implements Comparable<LikelihoodSum> {
        public double sum = 0.0;
        public Allele allele;

        public LikelihoodSum(Allele allele) { this.allele = allele; }

        @Override
        public int compareTo(LikelihoodSum other) {
            // Double.compare is NaN-safe and avoids the subtract-to-compare anti-pattern;
            // arguments are swapped to preserve the original descending order.
            return Double.compare(other.sum, sum);
        }
    }

    /**
     * Unpack GenotypesContext into arraylist of double values
     *
     * @param GLs          Input genotype context
     * @param includeDummy if true, prepend an all-zero GL vector (used by algorithms
     *                     that index samples starting from 1)
     * @return ArrayList of doubles corresponding to GL vectors
     */
    protected static ArrayList<double[]> getGLs(final GenotypesContext GLs, final boolean includeDummy) {
        final ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size() + 1);

        if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy

        for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
            if ( sample.hasLikelihoods() ) {
                final double[] gls = sample.getLikelihoods().getAsVector();
                // only keep informative GL vectors (sum below the no-call threshold)
                if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL )
                    genotypeLikelihoods.add(gls);
            }
        }

        return genotypeLikelihoods;
    }
}

View File

@ -0,0 +1,338 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc {
    // Shared allele list used for every reduced bi-allelic genotype: both alleles are no-calls
    private final static List<Allele> BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);

    /**
     * Orders AFCalcResults by their log10 likelihood of AF > 0, ascending
     * (Double.compare returns negative when o1's likelihood is smaller).
     */
    private final static class CompareAFCalcResultsByPNonRef implements Comparator<AFCalcResult> {
        @Override
        public int compare(AFCalcResult o1, AFCalcResult o2) {
            return Double.compare(o1.getLog10LikelihoodOfAFGT0(), o2.getLog10LikelihoodOfAFGT0());
        }
    }

    // Single shared comparator instance (stateless, so safe to reuse)
    private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef();

    // Bi-allelic reference model used to run each per-allele conditional exact computation
    final ReferenceDiploidExactAFCalc refModel;

    protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
        // each conditional sub-calculation is strictly bi-allelic, hence max 1 alt allele
        refModel = new ReferenceDiploidExactAFCalc(nSamples, 1, 1, ploidy);
    }

    @Override
    protected StateTracker makeMaxLikelihood(VariantContext vc, AFCalcResultTracker resultTracker) {
        // delegate to the embedded bi-allelic model's tracker
        return refModel.makeMaxLikelihood(vc, resultTracker);
    }

    /**
     * AFCalcResult that additionally records the independent per-allele results
     * it was combined from (useful for debugging/inspection).
     */
    private static class MyAFCalcResult extends AFCalcResult {
        final List<AFCalcResult> supporting;

        private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List<Allele> allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map<Allele, Double> log10pNonRefByAllele, List<AFCalcResult> supporting) {
            super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele);
            this.supporting = supporting;
        }
    }

    /**
     * Computes P(AF > 0) by treating each alt allele independently:
     * 1) run a bi-allelic conditional exact computation per alt allele,
     * 2) apply multi-allelic theta-based priors to the independent results,
     * 3) combine them with the overall hom-ref likelihood into one result.
     */
    @Override
    public AFCalcResult computeLog10PNonRef(final VariantContext vc,
                                            final double[] log10AlleleFrequencyPriors) {
        final double log10LikelihoodOfRef = computelog10LikelihoodOfRef(vc);
        final List<AFCalcResult> independentResultTrackers = computeAlleleConditionalExact(vc, log10AlleleFrequencyPriors);
        final List<AFCalcResult> withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers);
        return combineIndependentPNonRefs(vc, log10LikelihoodOfRef, withMultiAllelicPriors);
    }

    /**
     * Computes the likelihood that every sample is hom-ref (AF == 0 across all alleles).
     *
     * @param vc the variant context whose genotypes carry the GLs
     * @return the log10 likelihood of all samples being hom-ref
     */
    protected final double computelog10LikelihoodOfRef(final VariantContext vc) {
        // this value just the likelihood of AF == 0 in the special constrained multi-allelic calculation
        final List<double[]> allGLs = getGLs(vc.getGenotypes(), false);
        double log10LikelihoodOfHomRef = 0.0;

        // TODO -- can be easily optimized (currently looks at all GLs via getGLs)
        // sum of each sample's hom-ref GL (index 0) == log10 of the product of hom-ref likelihoods
        for ( int i = 0; i < allGLs.size(); i++ ) {
            final double[] GLs = allGLs.get(i);
            log10LikelihoodOfHomRef += GLs[0];
            //log10LikelihoodOfHomRef += MathUtils.normalizeFromLog10(GLs, true)[0];
        }

        return log10LikelihoodOfHomRef;
    }

    /**
     * Computes the conditional bi-allelic exact results
     *
     * Suppose vc contains 2 alt allele: A* with C and T.  This function first computes:
     *
     * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything]
     *
     * it then computes the conditional probability on AF_c == 0:
     *
     * (2) P(D | AF_t > 0 && AF_c == 0)
     *
     * Thinking about this visually, we have the following likelihood matrix where each cell is
     * the P(D | AF_c == i && AF_t == j):
     *
     *         0      AF_c > 0
     *      -----------------
     *    0 |    |
     *      |----|-----------
     *  a   |    |
     *  f   |    |
     *  _   |    |
     *  t   |    |
     *  >   |    |
     *  0   |    |
     *
     * What we really want to know how
     *
     * (3) P(D | AF_c == 0 & AF_t == 0)
     *
     * compares with
     *
     * (4) P(D | AF_c > 0 || AF_t > 0)
     *
     * This is effectively asking for the value in the upper left vs. the sum of all cells.
     *
     * The quantity (1) is the same of all cells except those with AF_c == 0, while (2) is the
     * band at the top where AF_t > 0 and AF_c == 0
     *
     * So (4) is actually (1) + (2).
     *
     * (3) is the direct inverse of the (1) and (2), as we are simultaneously calculating
     *
     * (1*) P(D | AF_c == 0 && AF_t == *) [i.e., T can be anything]
     * (2*) P(D | AF_t == 0 && AF_c == 0) [TODO -- note this value looks like the thing we are supposed to use]
     *
     * This function implements the conditional likelihoods summation for any number of alt
     * alleles (not just the tri-allelic case), where each subsequent variant context is
     * further constrained such that each already considered allele x has AF_x == 0 in the
     * compute.
     *
     * @param vc the multi-allelic variant context to decompose
     * @param log10AlleleFrequencyPriors the bi-allelic priors passed to each sub-computation
     * @return one AFCalcResult per alt allele, in the vc's alt-allele order
     */
    protected List<AFCalcResult> computeAlleleConditionalExact(final VariantContext vc,
                                                               final double[] log10AlleleFrequencyPriors) {
        final List<AFCalcResult> results = new LinkedList<AFCalcResult>();

        for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) {
            final AFCalcResult resultTracker = refModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors);
            results.add(resultTracker);
        }

        return results;
    }

    /**
     * Builds one bi-allelic VariantContext per alt allele (ref + that alt),
     * combining the multi-allelic GLs into bi-allelic GLs for each.
     *
     * @param vc the multi-allelic source context
     * @return one bi-allelic context per alt allele
     */
    protected List<VariantContext> makeAlleleConditionalContexts(final VariantContext vc) {
        final int nAltAlleles = vc.getNAlleles() - 1;
        final List<VariantContext> vcs = new LinkedList<VariantContext>();
        final List<Allele> afZeroAlleles = new LinkedList<Allele>();

        for ( int altI = 0; altI < nAltAlleles; altI++ ) {
            final Allele altAllele = vc.getAlternateAllele(altI);
            final List<Allele> biallelic = Arrays.asList(vc.getReference(), altAllele);
            vcs.add(biallelicCombinedGLs(vc, biallelic, afZeroAlleles, altI + 1));
            // NOTE(review): afZeroAlleles stays empty because the next line is disabled, so
            // alleles are combined rather than progressively discarded — confirm intended
            //afZeroAlleles.add(altAllele);
        }

        return vcs;
    }

    /**
     * Reduces a multi-allelic context to a bi-allelic one for allele index allele2,
     * combining (or discarding, per afZeroAlleles) the other alt alleles' GLs.
     *
     * @param rootVC       the multi-allelic source context
     * @param biallelic    the [ref, alt] allele pair of the reduced context
     * @param afZeroAlleles alleles whose GL contributions should be dropped entirely
     * @param allele2      1-based index of the alt allele to keep
     * @return a bi-allelic VariantContext with combined genotype GLs
     */
    protected VariantContext biallelicCombinedGLs(final VariantContext rootVC, final List<Allele> biallelic, final List<Allele> afZeroAlleles, final int allele2) {
        if ( rootVC.isBiallelic() ) {
            // already bi-allelic: nothing to combine, reuse the context as-is
            if ( ! afZeroAlleles.isEmpty() ) throw new IllegalArgumentException("Root VariantContext is biallelic but afZeroAlleles wasn't empty: " + afZeroAlleles);
            return rootVC;
        } else {
            final Set<Integer> allelesToDiscard = new HashSet<Integer>(rootVC.getAlleleIndices(afZeroAlleles));
            final int nAlts = rootVC.getNAlleles() - 1;
            final List<Genotype> biallelicGenotypes = new ArrayList<Genotype>(rootVC.getNSamples());
            for ( final Genotype g : rootVC.getGenotypes() )
                biallelicGenotypes.add(combineGLs(g, allele2, allelesToDiscard, nAlts));

            final VariantContextBuilder vcb = new VariantContextBuilder(rootVC);
            vcb.alleles(biallelic);
            vcb.genotypes(biallelicGenotypes);
            return vcb.make();
        }
    }

    /**
     * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case
     *
     * This is handled in the following way:
     *
     * Suppose we have for a A/B/C site the following GLs:
     *
     * AA AB BB AC BC CC
     *
     * and we want to get the bi-allelic GLs for X/B, where X is everything not B
     *
     * XX = AA + AC + CC (since X = A or C)
     * XB = AB + BC
     * BB = BB
     *
     * Supports the additional mode of simply dropping GLs whose allele index occurs in allelesToDiscard.  This is
     * useful in the case where you want to drop alleles (not combine them), such as above:
     *
     * AA AB BB AC BC CC
     *
     * and we want to get the bi-allelic GLs for X/B, where X is everything not B, but dropping C (index 2)
     *
     * XX = AA (since X = A and C is dropped)
     * XB = AB
     * BB = BB
     *
     * This allows us to recover partial GLs the correspond to any allele in allelesToDiscard having strictly
     * AF == 0.
     *
     * @param original the original multi-allelic genotype
     * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0
     * @param allelesToDiscard allele indices whose GL contributions are dropped entirely
     * @param nAlts the total number of alt alleles
     * @return a new biallelic genotype with appropriate PLs
     */
    @Requires({"original.hasLikelihoods()", "! allelesToDiscard.contains(altIndex)"})
    @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"})
    protected Genotype combineGLs(final Genotype original, final int altIndex, final Set<Integer> allelesToDiscard, final int nAlts ) {
        if ( original.isNonInformative() )
            // non-informative genotypes become flat-PL no-calls
            return new GenotypeBuilder(original).PL(new int[]{0,0,0}).alleles(BIALLELIC_NOCALL).make();

        if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts);

        // work in normalized real-probability space so the per-genotype terms can simply be summed
        final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector());
        final double[] biAllelicPr = new double[3];

        for ( int index = 0; index < normalizedPr.length; index++ ) {
            final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index);

            // just continue if we shouldn't include the pair because it's in the discard set
            if ( discardAllelePair(pair, allelesToDiscard) )
                continue;

            if ( pair.alleleIndex1 == altIndex ) {
                if ( pair.alleleIndex2 == altIndex )
                    // hom-alt case
                    biAllelicPr[2] = normalizedPr[index];
                else
                    // het-alt case
                    biAllelicPr[1] += normalizedPr[index];
            } else {
                if ( pair.alleleIndex2 == altIndex )
                    // het-alt case
                    biAllelicPr[1] += normalizedPr[index];
                else
                    // hom-non-alt
                    biAllelicPr[0] += normalizedPr[index];
            }
        }

        // convert the combined probabilities back to log10 GLs
        final double[] GLs = new double[3];
        for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]);

        return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make();
    }

    /**
     * @return true if either allele index of the pair occurs in the discard set
     */
    protected boolean discardAllelePair(final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair, Set<Integer> allelesToDiscard) {
        return allelesToDiscard.contains(pair.alleleIndex1) || allelesToDiscard.contains(pair.alleleIndex2);
    }

    /**
     * Re-weights each independent per-allele result with a theta^N multi-allelic prior:
     * the i-th allele (in likelihood order) gets prior (i+1) * log10(single-allele prior).
     *
     * @param conditionalPNonRefResults the independent per-allele results
     * @return a new list sorted by likelihood with adjusted priors applied
     */
    protected List<AFCalcResult> applyMultiAllelicPriors(final List<AFCalcResult> conditionalPNonRefResults) {
        final ArrayList<AFCalcResult> sorted = new ArrayList<AFCalcResult>(conditionalPNonRefResults);

        // sort the results, so the most likely allele is first
        // NOTE(review): the comparator sorts ASCENDING by log10 likelihood of AF > 0, which
        // appears to put the LEAST likely allele first — confirm which order is intended
        Collections.sort(sorted, compareAFCalcResultsByPNonRef);

        // every element shares the same single-allele prior, so reading it from element 0 is fine
        final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0();

        for ( int i = 0; i < sorted.size(); i++ ) {
            final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0;
            final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0));
            final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 };

            // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
            sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true)));
        }

        return sorted;
    }

    /**
     * Take the independent estimates of pNonRef for each alt allele and combine them into a single result
     *
     * @param vc the original multi-allelic variant context
     * @param log10LikelihoodsOfACEq0 the likelihood that all samples are hom-ref
     * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently
     * @return the combined multi-allelic AFCalcResult
     */
    protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc,
                                                      final double log10LikelihoodsOfACEq0,
                                                      final List<AFCalcResult> sortedResultsWithThetaNPriors) {
        int nEvaluations = 0;
        final int nAltAlleles = sortedResultsWithThetaNPriors.size();
        final int[] alleleCountsOfMLE = new int[nAltAlleles];
        final double[] log10PriorsOfAC = new double[2];
        final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(nAltAlleles);

        // this value is a sum in real space so we need to store values to sum up later
        final double[] log10LikelihoodsOfACGt0 = new double[nAltAlleles];

        for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) {
            // index 1 of the bi-allelic result's alleles is the alt allele (0 is ref)
            final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1);
            final int altI = vc.getAlleles().indexOf(altAllele) - 1;

            // MLE of altI allele is simply the MLE of this allele in altAlleles
            alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele);

            // priors multiply across independent alleles, i.e. sum in log10 space
            log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0();
            log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0();

            // the AF > 0 case requires us to store the normalized likelihood for later summation
            log10LikelihoodsOfACGt0[altI] = sortedResultWithThetaNPriors.getLog10LikelihoodOfAFGT0();

            // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior
            log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0());

            // trivial -- update the number of evaluations
            nEvaluations += sortedResultWithThetaNPriors.nEvaluations;
        }

        // the log10 likelihoods are the sum of the log10 likelihoods across all alt alleles
        final double[] log10LikelihoodsOfAC = new double[]{
                log10LikelihoodsOfACEq0,
                MathUtils.log10sumLog10(log10LikelihoodsOfACGt0)};

        return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(),
                MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true, true),  // necessary to ensure all values < 0
                MathUtils.normalizeFromLog10(log10PriorsOfAC, true),             // priors incorporate multiple alt alleles, must be normalized
                log10pNonRefByAllele, sortedResultsWithThetaNPriors);
    }
}

View File

@ -0,0 +1,145 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
/**
 * Original bi-allelic ~O(N) implementation.  Kept here for posterity and reference
 */
public class OriginalDiploidExactAFCalc extends DiploidExactAFCalc {
    protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
        super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
    }

    protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
        // fresh tracker with no per-allele AC constraints
        return new StateTracker();
    }

    /**
     * Runs the linear exact model over all ACs and packages the outcome into an AFCalcResult
     * in the two-state [AF == 0, AF > 0] representation.
     *
     * @param vc the (bi-allelic) variant context
     * @param log10AlleleFrequencyPriors per-AC log10 priors
     * @return the AF calculation result
     */
    @Override
    protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) {
        final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length];
        final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length];
        final int lastK = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);

        // collapse per-AC vectors: index 0 is AF == 0, the log10 sum of indices >= 1 is AF > 0
        final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1)};
        final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)};

        // NOTE(review): crude per-allele pNonRef proxy — 0.0 when the last evaluated AC > 0,
        // else -1000.0; confirm this placeholder is intended
        final double pNonRef = lastK > 0 ? 0.0 : -1000.0;
        final Map<Allele, Double> log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), pNonRef);
        return new AFCalcResult(new int[]{lastK}, 0, vc.getAlleles(), log10Likelihoods, log10Priors, log10pNonRefByAllele);
    }

    /**
     * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors
     * for the exact model calculation
     */
    private final static class ExactACCache {
        double[] kMinus2, kMinus1, kMinus0;

        private static double[] create(int n) {
            return new double[n];
        }

        public ExactACCache(int n) {
            kMinus2 = create(n);
            kMinus1 = create(n);
            kMinus0 = create(n);
        }

        // Rotates the three buffers so kMinus0 becomes the scratch row for the next k,
        // reusing the oldest (kMinus2) array instead of allocating
        final public void rotate() {
            double[] tmp = kMinus2;
            kMinus2 = kMinus1;
            kMinus1 = kMinus0;
            kMinus0 = tmp;
        }

        final public double[] getkMinus2() {
            return kMinus2;
        }

        final public double[] getkMinus1() {
            return kMinus1;
        }

        final public double[] getkMinus0() {
            return kMinus0;
        }
    }

    /**
     * Linear-space exact AF computation: fills the likelihood/posterior vectors per AC k,
     * iterating k from 0 upward and aborting once the likelihood falls far enough below
     * the running maximum (StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY).
     *
     * @param vc the variant context supplying genotype GLs
     * @param log10AlleleFrequencyPriors per-AC log10 priors (input)
     * @param log10AlleleFrequencyLikelihoods per-AC log10 likelihoods (output)
     * @param log10AlleleFrequencyPosteriors per-AC log10 posteriors (output)
     * @return the last AC value k that was evaluated
     */
    public int linearExact(final VariantContext vc,
                           double[] log10AlleleFrequencyPriors,
                           double[] log10AlleleFrequencyLikelihoods,
                           double[] log10AlleleFrequencyPosteriors) {
        final ArrayList<double[]> genotypeLikelihoods = getGLs(vc.getGenotypes(), false);
        final int numSamples = genotypeLikelihoods.size()-1;
        final int numChr = 2*numSamples;
        final ExactACCache logY = new ExactACCache(numSamples+1);
        logY.getkMinus0()[0] = 0.0; // the zero case

        double maxLog10L = Double.NEGATIVE_INFINITY;
        boolean done = false;
        int lastK = -1;

        for (int k=0; k <= numChr && ! done; k++ ) {
            final double[] kMinus0 = logY.getkMinus0();

            if ( k == 0 ) { // special case for k = 0
                // L(j, 0) is just the product (log10 sum) of the hom-ref GLs over the first j samples
                for ( int j=1; j <= numSamples; j++ ) {
                    kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0];
                }
            } else { // k > 0
                final double[] kMinus1 = logY.getkMinus1();
                final double[] kMinus2 = logY.getkMinus2();

                for ( int j=1; j <= numSamples; j++ ) {
                    final double[] gl = genotypeLikelihoods.get(j);
                    final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];

                    // combinatorial terms for sample j contributing 0 (aa), 1 (ab), or 2 (bb)
                    // alt alleles to the AC == k configuration
                    double aa = Double.NEGATIVE_INFINITY;
                    double ab = Double.NEGATIVE_INFINITY;
                    if (k < 2*j-1)
                        aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0];

                    if (k < 2*j)
                        ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1];

                    double log10Max;
                    if (k > 1) {
                        final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2];
                        log10Max = MathUtils.approximateLog10SumLog10(aa, ab, bb);
                    } else {
                        // we know we aren't considering the BB case, so we can use an optimized log10 function
                        log10Max = MathUtils.approximateLog10SumLog10(aa, ab);
                    }

                    // finally, update the L(j,k) value
                    kMinus0[j] = log10Max - logDenominator;
                }
            }

            // update the posteriors vector
            final double log10LofK = kMinus0[numSamples];
            log10AlleleFrequencyLikelihoods[k] = log10LofK;
            log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k];

            // can we abort early?
            lastK = k;
            maxLog10L = Math.max(maxLog10L, log10LofK);
            if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) {
                //if ( DEBUG ) System.out.printf("  *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L);
                done = true;
            }

            logY.rotate();
        }

        return lastK;
    }
}

View File

@ -0,0 +1,13 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc {
protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
return new StateTracker();
}
}

View File

@ -0,0 +1,96 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
/**
 * Keeps track of the best state seen by the exact model and the max states to visit,
 * allowing us to abort the search before we visit the entire matrix of AC x samples.
 */
final class StateTracker {
    public final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6

    /** Optional per-allele AC ceilings; null means unconstrained. */
    final private int[] maxACsToConsider;

    /** The AC configuration at which the best likelihood so far was observed. */
    private ExactACcounts ACsAtMax = null;

    /** The largest log10 likelihood observed so far. */
    private double maxLog10L = Double.NEGATIVE_INFINITY;

    /** Creates an unconstrained tracker (no AC ceilings). */
    public StateTracker() {
        this(null);
    }

    /** Creates a tracker whose search may be bounded by the given per-allele AC ceilings. */
    public StateTracker(final int[] maxACsToConsider) {
        this.maxACsToConsider = maxACsToConsider;
    }

    /**
     * Update the maximum log10L seen, if log10LofKs is higher, and the corresponding ACs of this state
     *
     * @param log10LofKs the likelihood of our current configuration state
     * @param ACs        the allele-count configuration that produced it
     */
    public void update(final double log10LofKs, final ExactACcounts ACs) {
        if ( log10LofKs <= getMaxLog10L() )
            return; // not an improvement over the current best
        setMaxLog10L(log10LofKs);
        ACsAtMax = ACs;
    }

    /**
     * Is the likelihood of configuration K too low to consider, relative to the
     * maximum likelihood seen already?
     *
     * @param log10LofK the log10 likelihood of the configuration we're considering analyzing
     * @return true if the configuration cannot meaningfully contribute to our likelihood sum
     */
    public boolean tooLowLikelihood(final double log10LofK) {
        final double cutoff = getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY;
        return log10LofK < cutoff;
    }

    /**
     * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider?
     *
     * @param otherACs the set of otherACs that we want to know if we should consider analyzing
     * @return true if otherACs is a state worth considering, or false otherwise
     */
    public boolean withinMaxACs(final ExactACcounts otherACs) {
        if ( maxACsToConsider == null )
            return true; // unconstrained: everything is within bounds

        final int[] candidateCounts = otherACs.getCounts();
        for ( int idx = 0; idx < maxACsToConsider.length; idx++ ) {
            // consider one more than the max AC to collect a bit more likelihood mass
            if ( candidateCounts[idx] > maxACsToConsider[idx] + 1 )
                return false;
        }

        return true;
    }

    /**
     * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
     */
    public boolean isLowerAC(final ExactACcounts otherACs) {
        if ( ACsAtMax == null )
            return true; // no maximum recorded yet

        final int[] bestCounts = this.ACsAtMax.getCounts();
        final int[] candidateCounts = otherACs.getCounts();
        for ( int idx = 0; idx < bestCounts.length; idx++ ) {
            if ( bestCounts[idx] > candidateCounts[idx] )
                return false;
        }

        return true;
    }

    /**
     * Should the search be aborted at this configuration?  True when the likelihood is
     * both too far below the maximum AND the ACs are on the far side of the best state.
     */
    public boolean abort( final double log10LofK, final ExactACcounts ACs ) {
        return tooLowLikelihood(log10LofK) && isLowerAC(ACs);
    }

    /** @return the largest log10 likelihood observed so far */
    public double getMaxLog10L() {
        return maxLog10L;
    }

    /** Overrides the running maximum log10 likelihood. */
    public void setMaxLog10L(final double maxLog10L) {
        this.maxLog10L = maxLog10L;
    }
}

View File

@ -23,8 +23,9 @@
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
@ -32,7 +33,9 @@ import java.util.TreeSet;
public class GLBasedSampleSelector extends SampleSelector {
double[] flatPriors = null;
double referenceLikelihood;
final double referenceLikelihood;
AFCalc AFCalculator;
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
super(sm);
referenceLikelihood = refLik;
@ -49,11 +52,11 @@ public class GLBasedSampleSelector extends SampleSelector {
// do we want to apply a prior? maybe user-spec?
if ( flatPriors == null ) {
flatPriors = new double[1+2*samples.size()];
AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 4, 2);
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
if ( result.getLog10LikelihoodOfAFEq0() < referenceLikelihood ) {
return true;
}

View File

@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker<Integer, Integer> {
boolean failed = false;
byte[] recordRef = vc.getReference().getBases();
for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) {
if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 0 : 1)] ) {
if ( recordRef[i] != ref[i] ) {
failed = true;
break;
}

View File

@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.*;
/**
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
protected int numRandom = 0;
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
private VariantContext vc;
RandomVariantStructure(VariantContext vcP) {
vc = vcP;
}
public void set (VariantContext vcP) {
vc = vcP;
}
}
public enum NumberAlleleRestriction {
ALL,
BIALLELIC,
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
/* variables used by the SELECT RANDOM modules */
private boolean SELECT_RANDOM_NUMBER = false;
private boolean SELECT_RANDOM_FRACTION = false;
private int variantNumber = 0;
private int nVariantsAdded = 0;
private int positionToAdd = 0;
private RandomVariantStructure [] variantArray;
//Random number generator for the genotypes to remove
private Random randomGenotypes = new Random();
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
}
SELECT_RANDOM_NUMBER = numRandom > 0;
if (SELECT_RANDOM_NUMBER) {
logger.info("Selecting " + numRandom + " variants at random from the variant track");
variantArray = new RandomVariantStructure[numRandom];
}
SELECT_RANDOM_FRACTION = fractionRandom > 0;
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
break;
}
}
if ( !failedJexlMatch ) {
if (SELECT_RANDOM_NUMBER) {
randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
if ( ! justRead )
vcfWriter.add(sub);
}
if ( !failedJexlMatch &&
!justRead &&
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
vcfWriter.add(sub);
}
}
}
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
public void onTraversalDone(Integer result) {
logger.info(result + " records processed.");
if (SELECT_RANDOM_NUMBER) {
int positionToPrint = positionToAdd;
for (int i=0; i<numRandom; i++) {
vcfWriter.add(variantArray[positionToPrint].vc);
positionToPrint = nextCircularPosition(positionToPrint);
}
}
}
@ -746,9 +701,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
GenotypesContext newGC = sub.getGenotypes();
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate)
// if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate)
if ( vc.getAlleles().size() != sub.getAlleles().size() )
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes());
// if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags
if ( vc.getNSamples() != sub.getNSamples() ) {
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if ( sawDP )
builder.attribute("DP", depth);
}
private void randomlyAddVariant(int rank, VariantContext vc) {
if (nVariantsAdded < numRandom)
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
else {
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
double t = (1.0/(rank-numRandom+1));
if ( v < t) {
variantArray[positionToAdd].set(vc);
nVariantsAdded++;
positionToAdd = nextCircularPosition(positionToAdd);
}
}
}
private int nextCircularPosition(int cur) {
if ((cur + 1) == variantArray.length)
return 0;
return cur + 1;
}
}

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.TribbleException;
import org.broadinstitute.sting.alignment.bwa.java.AlignmentMatchSequence;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
@ -7,29 +9,28 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
import java.util.*;
/**
* Yet another VCF to Ped converter. The world actually does need one that will
* work efficiently on large VCFs (or at least give a progress bar). This
* produces a binary ped file in individual major mode.
* Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam)
*/
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100))
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ -40,29 +41,33 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
/**
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
* is what Plink describes as a fam file. An example fam file is (note that there is no header):
*
* CEUTrio NA12878 NA12891 NA12892 2 -9
* CEUTrio NA12891 UNKN1 UNKN2 2 -9
* CEUTrio NA12892 UNKN3 UNKN4 1 -9
*
* <p><p>
* CEUTrio NA12878 NA12891 NA12892 2 -9</p><p>
* CEUTrio NA12891 UNKN1 UNKN2 2 -9</p><p>
* CEUTrio NA12892 UNKN3 UNKN4 1 -9</p><p>
* </p>
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
*
* <p>
* An alternate format is a two-column key-value file
*
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
* NA12891 fid=CEUTrio;sex=2;phenotype=-9
* NA12892 fid=CEUTrio;sex=1;phenotype=-9
*
* </p><p><p>
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9</p><p>
* NA12891 fid=CEUTrio;sex=2;phenotype=-9</p><p>
* NA12892 fid=CEUTrio;sex=1;phenotype=-9</p><p>
* </p><p>
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
*
* </p><p>
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
* </p>
*/
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
"(in which case it will be copied to the file you provide as fam output).")
File metaDataFile;
@Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)")
OutputMode mode = OutputMode.INDIVIDUAL_MAJOR;
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
PrintStream outBed;
@ -78,7 +83,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
boolean majorAlleleFirst = false;
private ValidateVariants vv = new ValidateVariants();
enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR }
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
@ -89,6 +94,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
private static final String PLINK_DELETION_MARKER = "-";
// note that HET and NO_CALL are flipped from the documentation: that's because
// plink actually reads these in backwards; and we want to use a shift operator
// to put these in the appropriate location
@ -99,9 +106,10 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private int genotypeCount = 0;
private int byteCount = 0;
private List<String> famOrder = new ArrayList<String>();
private long totalByteCount = 0l;
private long totalGenotypeCount = 0l;
public void initialize() {
initializeValidator();
writeBedHeader();
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
// create temporary output streams and buffers
@ -136,36 +144,43 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new UserException("No metadata provided for sample "+sample);
}
}
try {
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp);
} catch (IOException e) {
throw new ReviewedStingException("Error creating temporary file",e);
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
// only need to instantiate the files and buffers if in individual major.
// Cut down on memory.
try {
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp);
} catch (IOException e) {
throw new ReviewedStingException("Error creating temporary file",e);
}
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
}
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
famOrder.add(sample);
}
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
if ( tracker == null ) {
return 0;
}
VariantContext vc = tracker.getFirstValue(variantCollection.variants,context.getLocation());
if ( vc == null || vc.isFiltered() || ! vc.isBiallelic() ) {
return 0;
}
try {
vv.map(tracker,ref,context);
} catch (UserException e) {
validateVariantSite(vc,ref,context);
} catch (TribbleException e) {
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
"Please run ValidateVariants for more detailed information.");
"Please run ValidateVariants for more detailed information. This error is: "+e.getMessage());
}
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
String refOut;
String altOut;
String vcRef = getReferenceAllele(vc);
String vcAlt = getAlternateAllele(vc);
boolean altMajor;
if ( majorAlleleFirst ) {
// want to use the major allele as ref
@ -174,29 +189,42 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
}
if ( getAF(ats.get("AF")) > 0.5 ) {
refOut = vc.getAlternateAllele(0).getBaseString();
altOut = vc.getReference().getBaseString();
refOut = vcAlt;
altOut = vcRef;
altMajor = true;
} else {
refOut = vc.getReference().getBaseString();
altOut = vc.getAlternateAllele(0).getBaseString();
refOut = vcRef;
altOut = vcAlt;
altMajor = false;
}
} else {
refOut = vc.getReference().getBaseString();
altOut = vc.getAlternateAllele(0).getBaseString();
refOut = vcRef;
altOut = vcAlt;
altMajor = false;
}
// write an entry into the map file
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
refOut,altOut);
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
writeIndividualMajor(vc,altMajor);
} else {
writeSNPMajor(vc,altMajor);
}
return 1;
}
public void writeIndividualMajor(VariantContext vc, boolean altMajor) {
// store genotypes per sample into the buffer
for ( Genotype g : vc.getGenotypes() ) {
++totalGenotypeCount;
String sample = g.getSampleName();
byte[] samBuf = genotypeBuffer.get(sample);
byte enc = getEncoding(g,genotypeCount,altMajor);
samBuf[byteCount] |= enc;
}
genotypeCount++;
if ( genotypeCount % 4 == 0 ) {
byteCount++;
@ -217,8 +245,30 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
}
genotypeCount = 0;
}
}
return 1;
/**
 * SNP-major output: packs one 2-bit genotype per sample for this single variant
 * into a byte array (4 genotypes per byte, in .fam sample order) and appends the
 * packed bytes to the output .bed stream.
 *
 * Side effects: resets and reuses the shared genotypeCount/byteCount counters and
 * adds to the running totalGenotypeCount/totalByteCount statistics.
 *
 * @param vc       the variant whose genotypes are written
 * @param altMajor true when the alternate allele was chosen as plink's "reference" allele
 */
public void writeSNPMajor(VariantContext vc, boolean altMajor) {
// for each sample, write the genotype into the bed file, in the
// order of the fam file
genotypeCount = 0;
byteCount = 0;
// one byte holds four 2-bit genotypes; (3+n)/4 rounds n/4 up via integer division
byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2
for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) {
// encode this genotype as 2 bits positioned within the byte by genotypeCount
byte enc = getEncoding(g,genotypeCount,altMajor);
bytes[byteCount] |= enc;
genotypeCount++;
if ( genotypeCount % 4 == 0 ) {
// the current byte is full; move to the next one
byteCount++;
genotypeCount = 0;
}
}
totalGenotypeCount += famOrder.size();
totalByteCount += bytes.length;
try {
outBed.write(bytes);
} catch (IOException e) {
throw new ReviewedStingException("Error writing to output bed file",e);
}
}
public Integer reduce(Integer m, Integer r) {
@ -230,7 +280,15 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
}
public void onTraversalDone(Integer numSites) {
logger.info(String.format("%d sites processed!",numSites));
logger.info(String.format("%d sites processed for a total of %d genotypes encoded in %d bytes",numSites,totalGenotypeCount,totalByteCount));
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
mergeGenotypeTempFiles(numSites);
}
}
private void mergeGenotypeTempFiles(int numSites) {
// push out the remaining genotypes and close stream
for ( String sample : printMap.keySet() ) {
try {
@ -262,18 +320,19 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
byte[] readGenotypes = new byte[BUFFER_SIZE];
inStream.read(readGenotypes);
outBed.write(readGenotypes);
totalByteCount += BUFFER_SIZE;
}
if ( ttr > 0 ) {
byte[] readGenotypes = new byte[ttr];
inStream.read(readGenotypes);
outBed.write(readGenotypes);
totalByteCount += ttr;
}
inStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Error reading form temp file for input.",e);
}
}
}
private byte getEncoding(Genotype g, int offset, boolean altMajor) {
@ -286,8 +345,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) {
byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
if ( ! checkGQIsGood(g) ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_REF;
} else if ( g.isHomVar() ) {
@ -322,10 +381,11 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
if ( genotype.hasGQ() ) {
return genotype.getGQ() >= minGenotypeQuality;
} else if ( genotype.hasLikelihoods() ) {
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
double log10gq = GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector());
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
}
return false;
return minGenotypeQuality <= 0;
}
private static String getID(VariantContext v) {
@ -346,17 +406,10 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
}
}
/**
 * Configures the embedded ValidateVariants instance (vv) with this walker's
 * variant and dbSNP inputs so incoming records can be validated before conversion.
 */
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
// filtered records are skipped by this walker anyway, so don't fail validation on them
vv.DO_NOT_VALIDATE_FILTERED = true;
// NOTE(review): ValidationType.REF presumably limits validation to reference-base
// checks only -- confirm against ValidateVariants
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() {
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 0x0 : 0x1)});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
@ -410,4 +463,53 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return metaValues;
}
private void validateVariantSite(VariantContext vc, ReferenceContext ref, AlignmentContext context) {
    // Check the record's reported reference allele against the actual reference
    // sequence at this position, then run the alternate-allele sanity checks.
    final Allele reportedRefAllele = vc.getReference();
    final int refLength = reportedRefAllele.length();

    if ( refLength > 100 ) {
        // comparing very long alleles is not worth it; log and skip this record
        logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", refLength, vc.getChr(), vc.getStart()));
        return;
    }

    // take the first refLength observed reference bases as the "true" allele
    final byte[] observedRefBases = new byte[refLength];
    System.arraycopy(ref.getBases(), 0, observedRefBases, 0, refLength);

    vc.validateReferenceBases(reportedRefAllele, Allele.create(observedRefBases));
    vc.validateAlternateAlleles();
}
private String getReferenceAllele(VariantContext vc) {
    // Map the VC's reference allele onto plink's allele conventions.
    final String refAllele;
    if ( vc.isSimpleInsertion() ) {
        // bi-allelic insertion: nothing on the reference side, plink marks it "-"
        refAllele = PLINK_DELETION_MARKER;
    } else if ( vc.isSymbolic() ) {
        // symbolic or very long alleles cannot be spelled out; label the ref "1"
        refAllele = "1";
    } else if ( vc.isSimpleDeletion() ) {
        // bi-allelic deletion: strip the leading padding base from the standard representation
        refAllele = vc.getReference().getBaseString().substring(1);
    } else {
        // SNP or MNP: the base string is already plink-compatible
        refAllele = vc.getReference().getBaseString();
    }
    return refAllele;
}
private String getAlternateAllele(VariantContext vc ) {
    // Map the VC's (single) alternate allele onto plink's allele conventions.
    final String altAllele;
    if ( vc.isSimpleInsertion() ) {
        // bi-allelic insertion: strip the leading padding base from the standard representation
        altAllele = vc.getAlternateAllele(0).getBaseString().substring(1);
    } else if ( vc.isSymbolic() ) {
        // symbolic or very long alleles cannot be spelled out; label the alt "2"
        altAllele = "2";
    } else if ( vc.isSimpleDeletion() ) {
        // bi-allelic deletion: nothing on the alternate side, plink marks it "-"
        altAllele = PLINK_DELETION_MARKER;
    } else {
        // SNP or MNP: the base string is already plink-compatible
        altAllele = vc.getAlternateAllele(0).getBaseString();
    }
    return altAllele;
}
}

View File

@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.PrintStream;
import java.lang.reflect.Array;
import java.util.*;
/**
@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return records;
}
private static void addFieldValue(Object val, List<List<String>> result) {
private static void addFieldValue(final Object val, final List<List<String>> result) {
final int numResultRecords = result.size();
// if we're trying to create a single output record, add it
if ( numResultRecords == 1 ) {
result.get(0).add(val.toString());
result.get(0).add(prettyPrintObject(val));
}
// if this field is a list of the proper size, add the appropriate entry to each record
else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
}
/**
 * Renders a value for table output: Lists and arrays (recursively, at any depth)
 * become comma-joined strings of their elements; everything else is val.toString().
 */
private static String prettyPrintObject(final Object val) {
    // Normalize Lists to arrays so one reflective code path handles both.
    if ( val instanceof List )
        return prettyPrintObject(((List) val).toArray());

    if ( !val.getClass().isArray() )
        return val.toString();

    final int length = Array.getLength(val);
    if ( length == 0 )
        return "";

    final StringBuilder joined = new StringBuilder();
    for ( int i = 0; i < length; i++ ) {
        if ( i > 0 )
            joined.append(",");
        joined.append(prettyPrintObject(Array.get(val, i)));
    }
    return joined.toString();
}
public static List<List<String>> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, null, null, allowMissingData, false);
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays;
@ -198,7 +199,9 @@ public class BaseUtils {
* @param base [AaCcGgTt]
* @return 0, 1, 2, 3, or -1 if the base can't be understood
*/
static public int simpleBaseToBaseIndex(byte base) {
static public int simpleBaseToBaseIndex(final byte base) {
if ( base < 0 || base >= 256 )
throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)");
return baseIndexMap[base];
}

View File

@ -58,6 +58,12 @@ public class MathUtils {
private static final int MAXN = 50000;
private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
/**
* The smallest log10 value we'll emit from normalizeFromLog10 and other functions
* where the real-space value is 0.0.
*/
public final static double LOG10_P_OF_ZERO = -1000000.0;
static {
log10Cache = new double[LOG10_CACHE_SIZE];
log10FactorialCache = new double[LOG10_CACHE_SIZE];
@ -572,16 +578,26 @@ public class MathUtils {
return normalizeFromLog10(array, takeLog10OfOutput, false);
}
/**
* See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space
*
* @param array
* @param takeLog10OfOutput
* @param keepInLogSpace
*
* @return
*/
public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) {
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = arrayMax(array);
// we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
for (int i = 0; i < array.length; i++)
for (int i = 0; i < array.length; i++) {
array[i] -= maxValue;
array[i] = Math.max(array[i], LOG10_P_OF_ZERO);
}
return array;
}
@ -598,7 +614,8 @@ public class MathUtils {
for (int i = 0; i < array.length; i++) {
double x = normalized[i] / sum;
if (takeLog10OfOutput)
x = Math.log10(x);
x = Math.max(Math.log10(x), LOG10_P_OF_ZERO);
normalized[i] = x;
}
@ -1666,4 +1683,36 @@ public class MathUtils {
return result;
}
/**
 * Returns a series of integer values between start and stop, inclusive,
 * exponentially distributed between the two. That is, if there are
 * ten values between 0-10 there will be 10 between 10-100.
 *
 * @param start first value of the range; always present as the first element
 * @param stop  last value of the range; always present as the last element
 * @param eps   step size in log10 space between consecutive candidates; smaller => denser output
 * @return an increasing list of unique ints from start to stop
 */
public static List<Integer> log10LinearRange(final int start, final int stop, final double eps) {
    final LinkedList<Integer> values = new LinkedList<Integer>();
    final double log10range = Math.log10(stop - start);

    // the range is documented as inclusive of start, so it is always the first element
    // (the old code only emitted it for the start == 0 special case)
    values.add(start);

    // walk the exponent in eps steps, adding each distinct interior value
    double i = 0.0;
    while ( i <= log10range ) {
        final int index = (int)Math.round(Math.pow(10, i)) + start;
        if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) )
            values.add(index);
        i += eps;
    }

    // the range is inclusive of stop as well (start == stop yields the single-element list)
    if ( values.peekLast() == null || values.peekLast() != stop )
        values.add(stop);

    return values;
}
}

View File

@ -236,6 +236,33 @@ public class Utils {
}
}
/**
 * Returns a new list containing all of the elements of left followed by elts, in order.
 * Neither left nor elts is modified.
 *
 * @param left the list supplying the leading elements
 * @param elts trailing elements to append; may be empty
 * @return a fresh list of left's elements followed by elts
 */
@SafeVarargs  // elts is only read (wrapped via Arrays.asList), never written into
public static <T> List<T> append(final List<T> left, final T ... elts) {
    final List<T> l = new LinkedList<T>(left);
    l.addAll(Arrays.asList(elts));
    return l;
}
/**
 * Joins the values of doubles into a single string separated by separator,
 * e.g. "1.0,2.5,-3.0". A null or empty array yields the empty string.
 *
 * @param separator placed between consecutive values (never before the first or after the last)
 * @param doubles   the values to join; may be null or empty
 * @return the joined representation, or "" when there is nothing to join
 */
public static String join(String separator, double[] doubles) {
    if ( doubles == null || doubles.length == 0)
        return "";

    final StringBuilder joined = new StringBuilder();
    for (int i = 0; i < doubles.length; ++i) {
        if (i > 0)
            joined.append(separator);
        joined.append(doubles[i]);
    }
    return joined.toString();
}
/**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.collections;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.PrintStream;
/**
* Wrapper around the basic NestedIntegerArray class that logs all updates (ie., all calls to put())
* to the provided output stream. For testing/debugging purposes.
*
* Log entries are of the following form (fields are tab-separated):
* LABEL VALUE KEY1 KEY2 ... KEY_N
*
* @author David Roazen
*/
public class LoggingNestedIntegerArray<T> extends NestedIntegerArray<T> {

    // Both fields are assigned once in the constructor and never change;
    // final makes that immutability explicit.
    private final PrintStream log;
    private final String logEntryLabel;

    /**
     * @param log output stream to which to log update operations (must not be null)
     * @param logEntryLabel String that should be prefixed to each log entry; null is treated as ""
     * @param dimensions dimensions of the underlying NestedIntegerArray
     */
    public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int... dimensions ) {
        super(dimensions);

        if ( log == null ) {
            throw new ReviewedStingException("Log output stream must not be null");
        }
        this.log = log;
        this.logEntryLabel = logEntryLabel != null ? logEntryLabel : "";
    }

    /**
     * Stores value under keys as usual, then writes one tab-separated log line:
     * LABEL \t VALUE \t KEY1 \t KEY2 \t ... \t KEY_N
     */
    @Override
    public void put( final T value, final int... keys ) {
        super.put(value, keys);

        StringBuilder logEntry = new StringBuilder();
        logEntry.append(logEntryLabel);
        logEntry.append("\t");
        logEntry.append(value);
        for ( int key : keys ) {
            logEntry.append("\t");
            logEntry.append(key);
        }

        // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe
        log.println(logEntry.toString());
    }
}

View File

@ -129,6 +129,12 @@ public class UserException extends ReviewedStingException {
}
}
/**
 * Signals that a temporary file created during a multi-threaded run could not be
 * found -- an environment problem (open-file-handle limits, insufficient temp/output
 * space, or a transient file system failure) rather than a GATK bug.
 */
public static class LocalParallelizationProblem extends UserException {
public LocalParallelizationProblem(final File file) {
super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath()));
}
}
public static class NotEnoughMemory extends UserException {
public NotEnoughMemory() {
super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java"));
@ -160,6 +166,10 @@ public class UserException extends ReviewedStingException {
super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message));
}
public CouldNotReadInputFile(String file, String message) {
super(String.format("Couldn't read file %s because %s", file, message));
}
public CouldNotReadInputFile(File file, String message, Exception e) {
super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e)));
}

View File

@ -25,9 +25,12 @@
package org.broadinstitute.sting.utils.recalibration;
import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
import java.io.PrintStream;
/**
* Utility class to facilitate on-the-fly base quality score recalibration.
*
@ -52,19 +55,31 @@ public class RecalibrationTables {
private final NestedIntegerArray[] tables;
public RecalibrationTables(final Covariate[] covariates) {
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1);
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null);
}
public RecalibrationTables(final Covariate[] covariates, final PrintStream log) {
this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) {
this(covariates, numReadGroups, null);
}
public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) {
tables = new NestedIntegerArray[covariates.length];
final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1;
final int eventDimension = EventType.values().length;
tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension);
tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension);
tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, "READ_GROUP_TABLE", numReadGroups, eventDimension);
tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension);
for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++)
tables[i] = new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
tables[i] = log == null ? new NestedIntegerArray<RecalDatum>(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) :
new LoggingNestedIntegerArray<RecalDatum>(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1),
numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension);
}
public NestedIntegerArray<RecalDatum> getReadGroupTable() {

View File

@ -288,6 +288,24 @@ public abstract class Genotype implements Comparable<Genotype> {
return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null;
}
/**
 * Are all likelihoods for this sample non-informative?
 *
 * A sample is non-informative when every PL value equals 0, e.g.
 * 0,0,0 => true; 0,0,0,0,0,0 => true; 0,10,100 => false.
 *
 * @return true if all of this sample's PLs are equal and == 0
 */
public boolean isNonInformative() {
    final int[] pls = getPL();
    for ( int i = 0; i < pls.length; i++ ) {
        if ( pls[i] != 0 )
            return false;
    }
    return true;
}
/**
* Unsafe low-level accessor the PL field itself, may be null.
*

View File

@ -1517,15 +1517,32 @@ public class VariantContext implements Feature { // to enable tribble integratio
return best;
}
/**
 * Lookup the index of allele in this variant context
 *
 * @param allele the allele whose index we want to get
 * @return the index of the allele into getAlleles(), or -1 if it cannot be found
 */
public int getAlleleIndex(final Allele allele) {
// List.indexOf drives the lookup, so Allele.equals() defines what counts as a match
return getAlleles().indexOf(allele);
}
/**
 * Return the allele index (see getAlleleIndex) for each allele in alleles.
 *
 * @param alleles the alleles we want to look up
 * @return one index per input allele, in iteration order; -1 for alleles not in this context
 */
public List<Integer> getAlleleIndices(final Collection<Allele> alleles) {
    final List<Integer> indices = new LinkedList<Integer>();
    for ( final Allele a : alleles ) {
        indices.add(getAlleleIndex(a));
    }
    return indices;
}
/**
 * Returns the PL/GL vector indices involving targetAllele, as computed by
 * GenotypeLikelihoods.getPLIndecesOfAlleles.
 *
 * @param targetAllele an allele that must be present in this context's allele list
 * @return the PL indices for genotypes containing targetAllele
 * @throws IllegalArgumentException if targetAllele is not one of this context's alleles
 */
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
    // resolve the allele's position within getAlleles(); -1 means it isn't part of this context
    final int index = getAlleleIndex(targetAllele);
    if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this);
    return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
}
}

View File

@ -157,11 +157,8 @@ public class VariantContextUtils {
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
}
public static Genotype removePLs(Genotype g) {
if ( g.hasLikelihoods() )
return new GenotypeBuilder(g).noPL().make();
else
return g;
public static Genotype removePLsAndAD(final Genotype g) {
return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g;
}
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
@ -573,7 +570,7 @@ public class VariantContextUtils {
}
// if we have more alternate alleles in the merged VC than in one or more of the
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
for ( final VariantContext vc : VCs ) {
if (vc.alleles.size() == 1)
continue;
@ -581,7 +578,7 @@ public class VariantContextUtils {
if ( ! genotypes.isEmpty() )
logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s",
genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles));
genotypes = stripPLs(genotypes);
genotypes = stripPLsAndAD(genotypes);
// this will remove stale AC,AF attributed from vc
calculateChromosomeCounts(vc, attributes, true);
break;
@ -672,11 +669,11 @@ public class VariantContextUtils {
return true;
}
public static GenotypesContext stripPLs(GenotypesContext genotypes) {
public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) {
GenotypesContext newGs = GenotypesContext.create(genotypes.size());
for ( final Genotype g : genotypes ) {
newGs.add(g.hasLikelihoods() ? removePLs(g) : g);
newGs.add(removePLsAndAD(g));
}
return newGs;
@ -1343,10 +1340,7 @@ public class VariantContextUtils {
public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) {
// TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed
// see whether we need to trim common reference base from all alleles
final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false);
if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 )
return inputVC;

View File

@ -477,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter {
else if ( val instanceof List ) {
result = formatVCFField(((List)val).toArray());
} else if ( val.getClass().isArray() ) {
int length = Array.getLength(val);
final int length = Array.getLength(val);
if ( length == 0 )
return formatVCFField(null);
StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0)));
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
for ( int i = 1; i < length; i++) {
sb.append(",");
sb.append(formatVCFField(Array.get(val, i)));

View File

@ -1,127 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3;
static double[] priors = new double[2*numSamples+1]; // flat priors
@BeforeSuite
public void before() {
// Genotype log10-likelihood vectors: 0.0 marks the certain genotype, -20.0 everything else.
// Bi-allelic fixtures (alleles A,B); vector order is AA, AB, BB.
AA1 = new double[]{0.0, -20.0, -20.0};
AB1 = new double[]{-20.0, 0.0, -20.0};
BB1 = new double[]{-20.0, -20.0, 0.0};
// Tri-allelic fixtures (alleles A,B,C); vector order is AA, AB, BB, AC, BC, CC.
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
}
// Test fixture: bundles a set of genotypes (built from the likelihood vectors above)
// with the number of alternate alleles they represent, for the "getGLs" data provider.
private class GetGLsTest extends TestDataProvider {
GenotypesContext GLs;
int numAltAlleles;
String name;
private GetGLsTest(String name, int numAltAlleles, Genotype... arg) {
super(GetGLsTest.class, name);
GLs = GenotypesContext.create(arg);
this.name = name;
this.numAltAlleles = numAltAlleles;
}
// Include the genotypes in the test name so failures identify the exact input.
public String toString() {
return String.format("%s input=%s", super.toString(), GLs);
}
}
private static Genotype createGenotype(String name, double[] gls) {
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
}
@DataProvider(name = "getGLs")
public Object[][] createGLsData() {
// bi-allelic case
new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1));
new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1));
new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1));
new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1));
new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1));
new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1));
new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1));
new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1));
// tri-allelic case
new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2));
new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2));
new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2));
new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2));
new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2));
new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2));
new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2));
return GetGLsTest.getTests(GetGLsTest.class);
}
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}
@Test
public void testLargeGLs() {
final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0};
GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0];
Assert.assertEquals(calculatedAlleleCount, 6);
}
@Test
public void testMismatchedGLs() {
final double[] AB = new double[]{-2000.0, 0.0, -2000.0, -2000.0, -2000.0, -2000.0};
final double[] AC = new double[]{-100.0, -100.0, -100.0, 0.0, -100.0, -100.0};
GetGLsTest cfg = new GetGLsTest("B1C1", 2, createGenotype("1", AC), createGenotype("2", AB));
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
Assert.assertEquals(result.getAlleleCountsOfMAP()[0], 1);
Assert.assertEquals(result.getAlleleCountsOfMAP()[1], 1);
}
}

View File

@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("cafd404f1b4f53586f7aa7a7084b91da"));
Arrays.asList("fe9c0e9e4b4ee4677145748cdd2285ff"));
executeTest("test MultiSample Pilot1", spec);
}
@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("9a760dffbb299bda4934bcb4f7aad42a"));
Arrays.asList("bc15123620e1134f799005d71d1180fe"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("8391146877aa7801ffdb3aa954bf2965"));
Arrays.asList("1ba7afccc8552f20d72d0b62237558e3"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("85b79ff7910f218dd59595d03ffe6ccc"));
Arrays.asList("57e409dbb12e0d85cd8af73db221b1fc"));
executeTest("test SingleSample Pilot2", spec);
}
@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("cceb34ffbd2dbc45b8821f86ea255284"));
Arrays.asList("772e14d8c908044c04053d204bad69ef"));
executeTest("test Multiple SNP alleles", spec);
}
@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("00f54a0097e710c0f7b001444c237e32"));
Arrays.asList("1fb69aa3857e921191997daa73f1b687"));
executeTest("test reverse trim", spec);
}
@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("b3fae6bf4c620458f4259dbc93125e37"));
Arrays.asList("d210ee1baa75dd4a0c63aef6b1fa7a8a"));
executeTest("test mismatched PLs", spec);
}
@ -94,7 +94,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "712e87db5e278e92bd36e96d377303c6";
private final static String COMPRESSED_OUTPUT_MD5 = "367c0355b4e7b10c2988e5c41f44b3d2";
@Test
public void testCompressedOutput() {
@ -115,7 +115,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "306943dd63111e2e64388cd2e2de6c01";
String md5 = "360d1274c1072a1ae9868e4e106c2650";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("f73dec2e77f14c170f7b6a8eee5793ff"));
Arrays.asList("6ae4a219c7b9c837fcbf12edeeac3c0c"));
executeTest("test min_base_quality_score 26", spec);
}
@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("da7a5a3aa1c9f401896c34199c535954"));
Arrays.asList("c7429e670ba477bf9a6bbee2fb41c5a9"));
executeTest("test SLOD", spec);
}
@ -163,7 +163,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("07f5962f790673a1299f3a0f56579b65"));
Arrays.asList("abd8e33e649cc11b55e200d3940cc7e2"));
executeTest("test NDA", spec);
}
@ -171,23 +171,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("22037eac40a3b1df3086c2d7b27f0d5f"));
Arrays.asList("8a9b424e00cdbe6b5e73d517335b2186"));
executeTest("test using comp track", spec);
}
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "92db524b334f1416e595c711abc2d798");
testOutputParameters("-sites_only", "97ba874eafc9884a4de027a84c036311");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "da318257d25a02abd26a3348421c3c69");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "f9ea04d96eeef29e71d37e60518c2579");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "13c4f01cffbbfac600318be95b3ca02f");
testOutputParameters("--output_mode EMIT_ALL_SITES", "67739a3ccf30975bcaef8a563e4b80cf");
}
private void testOutputParameters(final String args, final String md5) {
@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("7326eb84d8418546a408b68839a0a47e"));
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
executeTest("test confidence 1", spec1);
}
@ -209,7 +209,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("7326eb84d8418546a408b68839a0a47e"));
Arrays.asList("9addd225a985178339a0c49dc5fdc220"));
executeTest("test confidence 2", spec2);
}
@ -220,12 +220,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "7aed8361e692eff559e6bca88752db0d" );
testHeterozosity( 0.01, "f1c4c8e701b2334bf3c4f12fc395fec8" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "989e65bb7337117d31cd615163a8ac84" );
testHeterozosity( 1.0 / 1850, "7fbbf4a21d6bf0026bfdadbb3c086fbe" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -249,7 +249,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("c155587aa0410f43d7ccc57e1ae09a68"));
Arrays.asList("5d19e3077e0cabbb364f68676a09ebe0"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -268,7 +268,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("0748a711c6154f8d85847afb79aead94"));
Arrays.asList("8a1931095f70523ad11cb99b30df7b84"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -287,7 +287,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("6aa034f669ec09ac4f5a28624cbe1830"));
Arrays.asList("64a491b5276fd5d1cd04260ea3e63cf7"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -302,7 +302,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("ba7a011d0c665acc4455d58a6ab28716"));
Arrays.asList("f63a8b8061e6c5999408d34798061895"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -315,7 +315,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("4f7d80f4f53ef0f0959414cb30097482"));
Arrays.asList("c9d684ff2f2a9083480db6e962d612a9"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -325,7 +325,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("95986d0c92436d3b9c1f1be9c768a368"));
Arrays.asList("833fd97c6f32d7af6c9c088a78e51f68"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -335,7 +335,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("cecd3e35a817e299e97e8f7bbf083d2c"));
Arrays.asList("95b73c24c68dc475516571d9f49dfb1e"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -343,13 +343,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("af04b81f0548ca22b8d1f6bf223b336e"));
Arrays.asList("3bdbf48de30bac58f3bcbc5bf3aa63aa"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("c7792e27477ecf99893a76ecbac5c2f9"));
Arrays.asList("beee9457d7cea42006ac45400db5e873"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -371,7 +371,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 20:10,000,000-10,100,000",
1,
Arrays.asList("59ff26d7e5ca2503ebe9f74902251551"));
Arrays.asList("945a2f994eaced8efdf8de24b58f2680"));
executeTest(String.format("test UG with base indel quality scores"), spec);
}
@ -405,7 +405,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("f99f9a917529bfef717fad97f725d5df"));
Arrays.asList("ba4fafec383fb988f20c8cf53dd3e9a0"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -413,7 +413,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("eac2cd649bd5836068350eb4260aaea7"));
Arrays.asList("4c57a88de275105156aaafc6f9041365"));
executeTest("test minIndelFraction 0.25", spec);
}
@ -435,7 +435,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNsInCigar() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1,
Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29"));
Arrays.asList("e8ebfaac0804b782f22ab8ea35152735"));
executeTest("test calling on reads with Ns in CIGAR", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("bbf16e1873e525ee5975021cfb8988cf"));
executeTest("test calling on a ReducedRead BAM", spec);
}
}

View File

@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d"));
executeTest("test hg18 to hg19, unsorted", spec);
}
@Test
public void testLiftoverFilteringOfIndels() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T FilterLiftedVariants -o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header",
1,
Arrays.asList("0909a953291a5e701194668c9b8833ab"));
executeTest("test liftover filtering of indels", spec);
}
}

View File

@ -190,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4")
Arrays.asList("549321a2543608f214ab4893ab478be6")
);
executeTest("testRegenotype--" + testFile, spec);
@ -216,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4")
Arrays.asList("549321a2543608f214ab4893ab478be6")
);
executeTest("testRemoveMLEAndRegenotype--" + testFile, spec);
@ -255,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823")
Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}

View File

@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
}
public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) {
return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode +
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
" -bim %s -fam %s -bed %s";
}
@Test
public void testNA12878Alone() {
String testName = "testNA12878Alone";
@ -52,6 +59,72 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
@Test
public void testNA12878AloneSNPMajor() {
String testName = "testNA12878AloneSNPMajor";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446")
);
executeTest(testName, spec);
}
@Test
public void testNA12878HighGQ() {
String testName = "testNA12878HighGQ";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","0822adea688e99bb336afe5172d4c959")
);
executeTest(testName, spec);
}
@Test
public void testVCFMismatchReference() {
String testName = "testVCFMismatchReference";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.badReference.vcf", "CEUTrio.NA12878.metadata.txt",80),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void test1000GWithIndels() {
String testName = "test1000GWithIndels";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","3aceda4f9bb5b5457797c1fe5a85b03d","451498ceff06c1649890900fa994f1af")
);
}
@Test
public void test1000GWithIndelsSNPMajor() {
String testName = "test1000GWithIndelsSNPMajor";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"),
3,
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af")
);
}
@Test
public void test1000G_Symbolic() {
String testName = "test1000G_Symbolic";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("1000G_selected_SVs.vcf", "1000G_selected_allVariants.md.txt",0),
3,
Arrays.asList("5e7ede48e7c5d5972c59dc5558a06e40","451498ceff06c1649890900fa994f1af","4b53a82a0b2d1a22a6eebca50a4f83a8")
);
}
@Test
public void testCEUTrio() {
String testName = "testCEUTrio";
@ -112,6 +185,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
executeTest(testName, spec);
}
}

View File

@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
@Test(enabled = true)
public void testMultiAllelicOneRecord() {
WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""),
Arrays.asList("13dd36c08be6c800f23988e6000d963e"));
Arrays.asList("0ff49c08690f61a38614606a090f23ea"));
executeTest("testMultiAllelicOneRecord", spec);
}
@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
executeTest("testGenotypeFieldsWithInline", spec);
}
@Test(enabled = true)
public void testListFields() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + privateTestDir + "vcfexample.withMLE.vcf" +
" -T VariantsToTable" +
" -GF PL" +
" -o %s",
1,
Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e"));
executeTest("testGenotypeFields", spec);
}
@Test(enabled = true)
public void testMoltenOutput() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

View File

@ -21,7 +21,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest {
for ( final int nct : Arrays.asList(1, 2) ) {
// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct });
//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct });
tests.add(new Object[]{ "BOTH", "081d077786ac0af24e9f97259a55209c", nt, nct });
tests.add(new Object[]{ "BOTH", "78ce72d8f9d029313f5f2ceb02bb9822", nt, nct });
}
return tests.toArray(new Object[][]{});

View File

@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript {
var bamFile: File = _
def script() {
val ug = new UnifiedGenotyper with RetryMemoryLimit
// First run with 1m
ug.memoryLimit = .001
// On retry run with 1g
ug.retryMemoryFunction = (d => d * 1000)
ug.reference_sequence = referenceFile
ug.input_file = Seq(bamFile)
add(ug)
for (scatterCount <- 1 to 2) {
val ug = new UnifiedGenotyper with RetryMemoryLimit
// First run with 1m
ug.memoryLimit = .001
// On retry run with 1g
ug.retryMemoryFunction = (d => d * 1000)
ug.reference_sequence = referenceFile
ug.input_file = Seq(bamFile)
ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount))
ug.scatterCount = scatterCount
add(ug)
}
}
}

View File

@ -189,7 +189,7 @@ class QCommandLine extends CommandLineProgram with Logging {
private def createQueueHeader() : Seq[String] = {
Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp),
"Copyright (c) 2012 The Broad Institute",
"Fro support and documentation go to http://www.broadinstitute.org/gatk")
"For support and documentation go to http://www.broadinstitute.org/gatk")
}
private def getQueueVersion : String = {

View File

@ -26,19 +26,19 @@ package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
import org.broadinstitute.sting.queue.function.QFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor
/**
* Merges BAM files using net.sf.picard.sam.MergeSamFiles.
*/
class BamGatherFunction extends GatherFunction with PicardBamFunction {
class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit {
this.javaMainClass = "net.sf.picard.sam.MergeSamFiles"
this.assumeSorted = Some(true)
protected def inputBams = gatherParts
protected def outputBam = originalOutput
override def freezeFieldValues {
override def freezeFieldValues() {
val originalGATK = originalFunction.asInstanceOf[CommandLineGATK]
// Whatever the original function can handle, merging *should* do less.

View File

@ -25,13 +25,13 @@
package org.broadinstitute.sting.queue.extensions.gatk
import org.broadinstitute.sting.queue.function.scattergather.GatherFunction
import org.broadinstitute.sting.queue.function.QFunction
import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction}
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor
/**
* Merges a vcf text file.
*/
class VcfGatherFunction extends CombineVariants with GatherFunction {
class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit {
this.assumeIdenticalSamples = true
this.suppressCommandLineHeader = true

View File

@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun
override def freezeFieldValues() {
super.freezeFieldValues()
if (outputIndex == null && output != null)
outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.queue.function
import org.broadinstitute.sting.queue.util._
import org.broadinstitute.sting.commandline.Argument
/**
* A command line that will be run in a pipeline.
@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging {
def commandLine: String
/** Upper memory limit */
@Argument(doc="Memory limit", required=false)
var memoryLimit: Option[Double] = None
/** Resident memory limit */
@Argument(doc="Resident memory limit", required=false)
var residentLimit: Option[Double] = None
/** Resident memory request */
@Argument(doc="Resident memory request", required=false)
var residentRequest: Option[Double] = None
/** the number of SMP cores this job wants */

View File

@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction {
/**
* Memory limit for the java executable, or if None will use the default memoryLimit.
*/
@Argument(doc="Java memory limit", required=false)
var javaMemoryLimit: Option[Double] = None
/**

View File

@ -113,11 +113,13 @@ trait QFunction extends Logging with QJobReport {
var jobErrorFile: File = _
/** Errors (if any) from the last failed run of jobErrorFiles. */
@Argument(doc="Job error lines", required=false)
var jobErrorLines: Seq[String] = Nil
/**
* The number of times this function has previously been run.
*/
@Argument(doc="Job retries", required=false)
var retries = 0
/** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */
@ -541,4 +543,11 @@ object QFunction {
classFields
}
}
/**
* Returns the Seq of fields for a QFunction class.
* @param clazz Class to retrieve fields for.
* @return the fields of the class.
*/
def classFunctionFields(clazz: Class[_]) = classFields(clazz).functionFields
}

View File

@ -24,17 +24,26 @@
package org.broadinstitute.sting.queue.function
import org.broadinstitute.sting.commandline.Argument
object RetryMemoryLimit {
private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ )
private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
}
/** A mixin that on retry increases the memory limit when certain text is found. */
trait RetryMemoryLimit extends CommandLineFunction {
/** How to increase the memory. By default doubles the memory. */
var retryMemoryFunction: (Double => Double) = (2 * _)
var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction
/** Once the threshold is passed, no more memory will be added to memory limit. */
@Argument(doc="threshold to stop doubling the memory", required=false)
var memoryLimitThreshold: Option[Double] = None
/** Various strings to look for to determine we ran out of memory. */
var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT")
@Argument(doc="text to look for in the errors", required = false)
var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText
override def freezeFieldValues() {
super.freezeFieldValues()
@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction {
this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold
}
override def copySettingsTo(function: QFunction) {
super.copySettingsTo(function)
function match {
case retryMemoryLimit: RetryMemoryLimit =>
if (retryMemoryLimit.memoryLimitThreshold.isEmpty)
retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold
if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction)
retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction
if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText)
retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText
case _ => /* ignore */
}
}
override def setupRetry() {
super.setupRetry()
if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) {

View File

@@ -30,6 +30,10 @@ import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction}
/**
* Shadow clones another command line function.
*/
object CloneFunction {
private lazy val cloneFunctionFields = QFunction.classFunctionFields(classOf[CloneFunction])
}
class CloneFunction extends CommandLineFunction {
var originalFunction: ScatterGatherableFunction = _
var cloneIndex: Int = _
@@ -41,10 +45,10 @@ class CloneFunction extends CommandLineFunction {
var originalValues = Map.empty[ArgumentSource, Any]
withScatterPartCount += 1
if (withScatterPartCount == 1) {
overriddenFields.foreach{
case (field, overrideValue) => {
originalFunction.functionFields.foreach {
case (field) => {
originalValues += field -> originalFunction.getFieldValue(field)
originalFunction.setFieldValue(field, overrideValue)
originalFunction.setFieldValue(field, getFieldValue(field))
}
}
}
@@ -52,9 +56,11 @@
f()
} finally {
if (withScatterPartCount == 1) {
originalValues.foreach{
case (name, value) =>
originalFunction.setFieldValue(name, value)
originalFunction.functionFields.foreach {
case (field) => {
setFieldValue(field, originalFunction.getFieldValue(field))
originalFunction.setFieldValue(field, originalValues(field))
}
}
}
withScatterPartCount -= 1
@@ -63,6 +69,8 @@
override def description = withScatterPart(() => originalFunction.description)
override def shortDescription = withScatterPart(() => originalFunction.shortDescription)
override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) }
override protected def functionFieldClass = originalFunction.getClass
def commandLine = withScatterPart(() => originalFunction.commandLine)
@@ -73,13 +81,19 @@ class CloneFunction extends CommandLineFunction {
}
override def getFieldValue(source: ArgumentSource): AnyRef = {
overriddenFields.get(source) match {
case Some(value) => value.asInstanceOf[AnyRef]
case None => {
val value = originalFunction.getFieldValue(source)
overriddenFields += source -> value
value
}
CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match {
case Some(cloneSource) =>
super.getFieldValue(cloneSource)
case None =>
overriddenFields.get(source) match {
case Some(value) =>
value.asInstanceOf[AnyRef]
case None => {
val value = originalFunction.getFieldValue(source)
overriddenFields += source -> value
value
}
}
}
}